diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,56034 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 5.233889434085705, + "eval_steps": 500, + "global_step": 80000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0006542361792607131, + "grad_norm": 2.3467204570770264, + "learning_rate": 1.0000000000000001e-07, + "loss": 0.208, + "step": 10 + }, + { + "epoch": 0.0013084723585214263, + "grad_norm": 2.2877755165100098, + "learning_rate": 2.111111111111111e-07, + "loss": 0.2277, + "step": 20 + }, + { + "epoch": 0.001962708537782139, + "grad_norm": 2.4381093978881836, + "learning_rate": 3.222222222222222e-07, + "loss": 0.2252, + "step": 30 + }, + { + "epoch": 0.0026169447170428526, + "grad_norm": 2.922689914703369, + "learning_rate": 4.3333333333333335e-07, + "loss": 0.2161, + "step": 40 + }, + { + "epoch": 0.0032711808963035655, + "grad_norm": 2.4064548015594482, + "learning_rate": 5.444444444444444e-07, + "loss": 0.2299, + "step": 50 + }, + { + "epoch": 0.003925417075564278, + "grad_norm": 2.436875581741333, + "learning_rate": 6.555555555555556e-07, + "loss": 0.2218, + "step": 60 + }, + { + "epoch": 0.004579653254824992, + "grad_norm": 2.463653802871704, + "learning_rate": 7.666666666666667e-07, + "loss": 0.2234, + "step": 70 + }, + { + "epoch": 0.005233889434085705, + "grad_norm": 1.9082249402999878, + "learning_rate": 8.777777777777779e-07, + "loss": 0.2155, + "step": 80 + }, + { + "epoch": 0.005888125613346418, + "grad_norm": 1.9711594581604004, + "learning_rate": 9.888888888888888e-07, + "loss": 0.2167, + "step": 90 + }, + { + "epoch": 0.006542361792607131, + "grad_norm": 2.022458076477051, + "learning_rate": 1.1e-06, + "loss": 0.2089, + "step": 100 + }, + { + "epoch": 0.007196597971867844, + "grad_norm": 2.3458659648895264, + "learning_rate": 1.2111111111111111e-06, + "loss": 0.2131, + "step": 110 + }, + { + "epoch": 0.007850834151128557, + "grad_norm": 2.2194032669067383, + "learning_rate": 1.3222222222222222e-06, + "loss": 0.2217, + "step": 120 + }, + { + "epoch": 0.00850507033038927, + "grad_norm": 2.2126388549804688, + "learning_rate": 1.4333333333333333e-06, + "loss": 0.2191, + "step": 130 + }, + { + "epoch": 0.009159306509649984, + "grad_norm": 2.212867021560669, + "learning_rate": 1.5444444444444446e-06, + "loss": 0.2201, + "step": 140 + }, + { + "epoch": 0.009813542688910697, + "grad_norm": 2.3964221477508545, + "learning_rate": 1.6555555555555557e-06, + "loss": 0.2154, + "step": 150 + }, + { + "epoch": 0.01046777886817141, + "grad_norm": 2.0090246200561523, + "learning_rate": 1.7666666666666668e-06, + "loss": 0.2154, + "step": 160 + }, + { + "epoch": 0.011122015047432123, + "grad_norm": 2.2598977088928223, + "learning_rate": 1.877777777777778e-06, + "loss": 0.2057, + "step": 170 + }, + { + "epoch": 0.011776251226692836, + "grad_norm": 2.0792648792266846, + "learning_rate": 1.988888888888889e-06, + "loss": 0.1998, + "step": 180 + }, + { + "epoch": 0.012430487405953549, + "grad_norm": 2.0464093685150146, + "learning_rate": 2.1000000000000002e-06, + "loss": 0.1958, + "step": 190 + }, + { + "epoch": 0.013084723585214262, + "grad_norm": 1.8751633167266846, + "learning_rate": 2.2111111111111113e-06, + "loss": 0.1874, + "step": 200 + }, + { + "epoch": 0.013738959764474975, + "grad_norm": 1.9019556045532227, + "learning_rate": 2.322222222222222e-06, + "loss": 0.191, + "step": 210 + }, + { + "epoch": 0.014393195943735688, + "grad_norm": 1.7033251523971558, + "learning_rate": 2.4333333333333335e-06, + "loss": 0.1798, + "step": 220 + }, + { + "epoch": 0.015047432122996402, + "grad_norm": 1.5006437301635742, + "learning_rate": 2.5444444444444446e-06, + "loss": 0.1863, + "step": 230 + }, + { + "epoch": 0.015701668302257114, + "grad_norm": 1.7847758531570435, + "learning_rate": 2.6555555555555556e-06, + "loss": 0.1897, + "step": 240 + }, + { + "epoch": 0.016355904481517827, + "grad_norm": 2.0124351978302, + "learning_rate": 2.7666666666666667e-06, + "loss": 0.1816, + "step": 250 + }, + { + "epoch": 0.01701014066077854, + "grad_norm": 1.3791583776474, + "learning_rate": 2.877777777777778e-06, + "loss": 0.1895, + "step": 260 + }, + { + "epoch": 0.017664376840039256, + "grad_norm": 1.6091564893722534, + "learning_rate": 2.988888888888889e-06, + "loss": 0.174, + "step": 270 + }, + { + "epoch": 0.01831861301929997, + "grad_norm": 1.2440049648284912, + "learning_rate": 3.1e-06, + "loss": 0.1737, + "step": 280 + }, + { + "epoch": 0.018972849198560682, + "grad_norm": 1.4476827383041382, + "learning_rate": 3.2111111111111115e-06, + "loss": 0.1656, + "step": 290 + }, + { + "epoch": 0.019627085377821395, + "grad_norm": 1.3262630701065063, + "learning_rate": 3.3222222222222226e-06, + "loss": 0.1588, + "step": 300 + }, + { + "epoch": 0.020281321557082108, + "grad_norm": 1.3778283596038818, + "learning_rate": 3.4333333333333336e-06, + "loss": 0.1677, + "step": 310 + }, + { + "epoch": 0.02093555773634282, + "grad_norm": 1.322625756263733, + "learning_rate": 3.5444444444444447e-06, + "loss": 0.1776, + "step": 320 + }, + { + "epoch": 0.021589793915603533, + "grad_norm": 1.2029178142547607, + "learning_rate": 3.655555555555556e-06, + "loss": 0.1759, + "step": 330 + }, + { + "epoch": 0.022244030094864246, + "grad_norm": 1.393506646156311, + "learning_rate": 3.766666666666667e-06, + "loss": 0.1781, + "step": 340 + }, + { + "epoch": 0.02289826627412496, + "grad_norm": 1.2977313995361328, + "learning_rate": 3.877777777777778e-06, + "loss": 0.1674, + "step": 350 + }, + { + "epoch": 0.023552502453385672, + "grad_norm": 1.1824816465377808, + "learning_rate": 3.9888888888888895e-06, + "loss": 0.165, + "step": 360 + }, + { + "epoch": 0.024206738632646385, + "grad_norm": 1.364259123802185, + "learning_rate": 4.1000000000000006e-06, + "loss": 0.1652, + "step": 370 + }, + { + "epoch": 0.024860974811907098, + "grad_norm": 1.3480299711227417, + "learning_rate": 4.211111111111112e-06, + "loss": 0.156, + "step": 380 + }, + { + "epoch": 0.02551521099116781, + "grad_norm": 1.1253418922424316, + "learning_rate": 4.322222222222223e-06, + "loss": 0.1666, + "step": 390 + }, + { + "epoch": 0.026169447170428524, + "grad_norm": 0.9541553258895874, + "learning_rate": 4.433333333333334e-06, + "loss": 0.1679, + "step": 400 + }, + { + "epoch": 0.026823683349689237, + "grad_norm": 1.3034117221832275, + "learning_rate": 4.544444444444445e-06, + "loss": 0.167, + "step": 410 + }, + { + "epoch": 0.02747791952894995, + "grad_norm": 1.1535122394561768, + "learning_rate": 4.655555555555556e-06, + "loss": 0.1562, + "step": 420 + }, + { + "epoch": 0.028132155708210663, + "grad_norm": 1.2631272077560425, + "learning_rate": 4.766666666666667e-06, + "loss": 0.1656, + "step": 430 + }, + { + "epoch": 0.028786391887471376, + "grad_norm": 0.9639325737953186, + "learning_rate": 4.877777777777778e-06, + "loss": 0.1696, + "step": 440 + }, + { + "epoch": 0.029440628066732092, + "grad_norm": 1.205918550491333, + "learning_rate": 4.988888888888889e-06, + "loss": 0.161, + "step": 450 + }, + { + "epoch": 0.030094864245992805, + "grad_norm": 1.0042939186096191, + "learning_rate": 5.1e-06, + "loss": 0.1651, + "step": 460 + }, + { + "epoch": 0.030749100425253518, + "grad_norm": 1.0380401611328125, + "learning_rate": 5.211111111111111e-06, + "loss": 0.162, + "step": 470 + }, + { + "epoch": 0.03140333660451423, + "grad_norm": 0.9904937148094177, + "learning_rate": 5.3222222222222225e-06, + "loss": 0.1663, + "step": 480 + }, + { + "epoch": 0.03205757278377494, + "grad_norm": 1.1420738697052002, + "learning_rate": 5.4333333333333335e-06, + "loss": 0.1566, + "step": 490 + }, + { + "epoch": 0.03271180896303565, + "grad_norm": 1.0128347873687744, + "learning_rate": 5.544444444444445e-06, + "loss": 0.1576, + "step": 500 + }, + { + "epoch": 0.033366045142296366, + "grad_norm": 0.9660897254943848, + "learning_rate": 5.655555555555556e-06, + "loss": 0.1549, + "step": 510 + }, + { + "epoch": 0.03402028132155708, + "grad_norm": 1.0083342790603638, + "learning_rate": 5.766666666666667e-06, + "loss": 0.1575, + "step": 520 + }, + { + "epoch": 0.03467451750081779, + "grad_norm": 1.092995047569275, + "learning_rate": 5.877777777777778e-06, + "loss": 0.162, + "step": 530 + }, + { + "epoch": 0.03532875368007851, + "grad_norm": 0.9964866638183594, + "learning_rate": 5.988888888888889e-06, + "loss": 0.1518, + "step": 540 + }, + { + "epoch": 0.035982989859339225, + "grad_norm": 1.2313237190246582, + "learning_rate": 6.1e-06, + "loss": 0.151, + "step": 550 + }, + { + "epoch": 0.03663722603859994, + "grad_norm": 1.1173750162124634, + "learning_rate": 6.211111111111111e-06, + "loss": 0.1651, + "step": 560 + }, + { + "epoch": 0.03729146221786065, + "grad_norm": 1.0656248331069946, + "learning_rate": 6.322222222222222e-06, + "loss": 0.151, + "step": 570 + }, + { + "epoch": 0.037945698397121363, + "grad_norm": 1.1378906965255737, + "learning_rate": 6.433333333333334e-06, + "loss": 0.1666, + "step": 580 + }, + { + "epoch": 0.038599934576382076, + "grad_norm": 1.271768569946289, + "learning_rate": 6.544444444444444e-06, + "loss": 0.1557, + "step": 590 + }, + { + "epoch": 0.03925417075564279, + "grad_norm": 1.0535553693771362, + "learning_rate": 6.655555555555556e-06, + "loss": 0.1423, + "step": 600 + }, + { + "epoch": 0.0399084069349035, + "grad_norm": 1.0603430271148682, + "learning_rate": 6.766666666666667e-06, + "loss": 0.1556, + "step": 610 + }, + { + "epoch": 0.040562643114164215, + "grad_norm": 3.6667890548706055, + "learning_rate": 6.877777777777778e-06, + "loss": 0.1488, + "step": 620 + }, + { + "epoch": 0.04121687929342493, + "grad_norm": 0.9498468041419983, + "learning_rate": 6.9888888888888895e-06, + "loss": 0.155, + "step": 630 + }, + { + "epoch": 0.04187111547268564, + "grad_norm": 1.061012625694275, + "learning_rate": 7.1e-06, + "loss": 0.1559, + "step": 640 + }, + { + "epoch": 0.042525351651946354, + "grad_norm": 1.0795965194702148, + "learning_rate": 7.211111111111112e-06, + "loss": 0.1494, + "step": 650 + }, + { + "epoch": 0.04317958783120707, + "grad_norm": 0.8958123922348022, + "learning_rate": 7.322222222222222e-06, + "loss": 0.1435, + "step": 660 + }, + { + "epoch": 0.04383382401046778, + "grad_norm": 0.9391831755638123, + "learning_rate": 7.433333333333334e-06, + "loss": 0.1466, + "step": 670 + }, + { + "epoch": 0.04448806018972849, + "grad_norm": 1.4169477224349976, + "learning_rate": 7.544444444444444e-06, + "loss": 0.1394, + "step": 680 + }, + { + "epoch": 0.045142296368989206, + "grad_norm": 1.0921765565872192, + "learning_rate": 7.655555555555556e-06, + "loss": 0.1522, + "step": 690 + }, + { + "epoch": 0.04579653254824992, + "grad_norm": 0.8971364498138428, + "learning_rate": 7.766666666666666e-06, + "loss": 0.1567, + "step": 700 + }, + { + "epoch": 0.04645076872751063, + "grad_norm": 0.9771297574043274, + "learning_rate": 7.877777777777778e-06, + "loss": 0.1562, + "step": 710 + }, + { + "epoch": 0.047105004906771344, + "grad_norm": 1.0776768922805786, + "learning_rate": 7.988888888888888e-06, + "loss": 0.1485, + "step": 720 + }, + { + "epoch": 0.04775924108603206, + "grad_norm": 1.0369359254837036, + "learning_rate": 8.1e-06, + "loss": 0.1524, + "step": 730 + }, + { + "epoch": 0.04841347726529277, + "grad_norm": 1.057263731956482, + "learning_rate": 8.21111111111111e-06, + "loss": 0.1542, + "step": 740 + }, + { + "epoch": 0.04906771344455348, + "grad_norm": 1.065109372138977, + "learning_rate": 8.322222222222223e-06, + "loss": 0.1539, + "step": 750 + }, + { + "epoch": 0.049721949623814196, + "grad_norm": 1.0507162809371948, + "learning_rate": 8.433333333333333e-06, + "loss": 0.1383, + "step": 760 + }, + { + "epoch": 0.05037618580307491, + "grad_norm": 0.9928821325302124, + "learning_rate": 8.544444444444445e-06, + "loss": 0.1459, + "step": 770 + }, + { + "epoch": 0.05103042198233562, + "grad_norm": 1.1054047346115112, + "learning_rate": 8.655555555555555e-06, + "loss": 0.1455, + "step": 780 + }, + { + "epoch": 0.051684658161596335, + "grad_norm": 0.8906415104866028, + "learning_rate": 8.766666666666667e-06, + "loss": 0.1351, + "step": 790 + }, + { + "epoch": 0.05233889434085705, + "grad_norm": 0.891897439956665, + "learning_rate": 8.877777777777777e-06, + "loss": 0.1536, + "step": 800 + }, + { + "epoch": 0.05299313052011776, + "grad_norm": 0.8980157971382141, + "learning_rate": 8.988888888888889e-06, + "loss": 0.1366, + "step": 810 + }, + { + "epoch": 0.053647366699378474, + "grad_norm": 0.9560266733169556, + "learning_rate": 9.100000000000001e-06, + "loss": 0.1432, + "step": 820 + }, + { + "epoch": 0.05430160287863919, + "grad_norm": 0.9071044921875, + "learning_rate": 9.211111111111111e-06, + "loss": 0.1437, + "step": 830 + }, + { + "epoch": 0.0549558390578999, + "grad_norm": 1.1102005243301392, + "learning_rate": 9.322222222222223e-06, + "loss": 0.1446, + "step": 840 + }, + { + "epoch": 0.05561007523716061, + "grad_norm": 0.9674533605575562, + "learning_rate": 9.433333333333335e-06, + "loss": 0.1534, + "step": 850 + }, + { + "epoch": 0.056264311416421325, + "grad_norm": 0.8930101990699768, + "learning_rate": 9.544444444444445e-06, + "loss": 0.154, + "step": 860 + }, + { + "epoch": 0.05691854759568204, + "grad_norm": 1.0106406211853027, + "learning_rate": 9.655555555555557e-06, + "loss": 0.1438, + "step": 870 + }, + { + "epoch": 0.05757278377494275, + "grad_norm": 1.0581188201904297, + "learning_rate": 9.766666666666667e-06, + "loss": 0.1465, + "step": 880 + }, + { + "epoch": 0.058227019954203464, + "grad_norm": 1.138856291770935, + "learning_rate": 9.87777777777778e-06, + "loss": 0.1439, + "step": 890 + }, + { + "epoch": 0.058881256133464184, + "grad_norm": 1.0272846221923828, + "learning_rate": 9.98888888888889e-06, + "loss": 0.1627, + "step": 900 + }, + { + "epoch": 0.0595354923127249, + "grad_norm": 1.071536898612976, + "learning_rate": 1.0100000000000002e-05, + "loss": 0.1544, + "step": 910 + }, + { + "epoch": 0.06018972849198561, + "grad_norm": 0.9497654438018799, + "learning_rate": 1.0211111111111112e-05, + "loss": 0.1433, + "step": 920 + }, + { + "epoch": 0.06084396467124632, + "grad_norm": 0.9319833517074585, + "learning_rate": 1.0322222222222224e-05, + "loss": 0.151, + "step": 930 + }, + { + "epoch": 0.061498200850507036, + "grad_norm": 0.8812747001647949, + "learning_rate": 1.0433333333333334e-05, + "loss": 0.1307, + "step": 940 + }, + { + "epoch": 0.06215243702976775, + "grad_norm": 0.9672985672950745, + "learning_rate": 1.0544444444444444e-05, + "loss": 0.1453, + "step": 950 + }, + { + "epoch": 0.06280667320902845, + "grad_norm": 0.9534893035888672, + "learning_rate": 1.0655555555555556e-05, + "loss": 0.1365, + "step": 960 + }, + { + "epoch": 0.06346090938828917, + "grad_norm": 0.9244397282600403, + "learning_rate": 1.0766666666666666e-05, + "loss": 0.1472, + "step": 970 + }, + { + "epoch": 0.06411514556754988, + "grad_norm": 0.9510551691055298, + "learning_rate": 1.0877777777777778e-05, + "loss": 0.1315, + "step": 980 + }, + { + "epoch": 0.0647693817468106, + "grad_norm": 1.0477503538131714, + "learning_rate": 1.0988888888888889e-05, + "loss": 0.137, + "step": 990 + }, + { + "epoch": 0.0654236179260713, + "grad_norm": 0.7996601462364197, + "learning_rate": 1.11e-05, + "loss": 0.1334, + "step": 1000 + }, + { + "epoch": 0.06607785410533203, + "grad_norm": 0.9080948829650879, + "learning_rate": 1.121111111111111e-05, + "loss": 0.1443, + "step": 1010 + }, + { + "epoch": 0.06673209028459273, + "grad_norm": 1.1308685541152954, + "learning_rate": 1.1322222222222223e-05, + "loss": 0.1424, + "step": 1020 + }, + { + "epoch": 0.06738632646385345, + "grad_norm": 0.9781653881072998, + "learning_rate": 1.1433333333333333e-05, + "loss": 0.1332, + "step": 1030 + }, + { + "epoch": 0.06804056264311416, + "grad_norm": 0.9940928220748901, + "learning_rate": 1.1544444444444445e-05, + "loss": 0.1449, + "step": 1040 + }, + { + "epoch": 0.06869479882237488, + "grad_norm": 1.109440803527832, + "learning_rate": 1.1655555555555555e-05, + "loss": 0.1373, + "step": 1050 + }, + { + "epoch": 0.06934903500163558, + "grad_norm": 0.7790347337722778, + "learning_rate": 1.1766666666666667e-05, + "loss": 0.1441, + "step": 1060 + }, + { + "epoch": 0.0700032711808963, + "grad_norm": 0.9548224210739136, + "learning_rate": 1.1877777777777777e-05, + "loss": 0.1512, + "step": 1070 + }, + { + "epoch": 0.07065750736015702, + "grad_norm": 0.9746776223182678, + "learning_rate": 1.1988888888888889e-05, + "loss": 0.1412, + "step": 1080 + }, + { + "epoch": 0.07131174353941773, + "grad_norm": 1.2580838203430176, + "learning_rate": 1.2100000000000001e-05, + "loss": 0.153, + "step": 1090 + }, + { + "epoch": 0.07196597971867845, + "grad_norm": 1.0998780727386475, + "learning_rate": 1.2211111111111111e-05, + "loss": 0.1396, + "step": 1100 + }, + { + "epoch": 0.07262021589793916, + "grad_norm": 0.9261326789855957, + "learning_rate": 1.2322222222222223e-05, + "loss": 0.1431, + "step": 1110 + }, + { + "epoch": 0.07327445207719988, + "grad_norm": 0.9783452749252319, + "learning_rate": 1.2433333333333335e-05, + "loss": 0.147, + "step": 1120 + }, + { + "epoch": 0.07392868825646058, + "grad_norm": 0.9785774946212769, + "learning_rate": 1.2544444444444445e-05, + "loss": 0.1418, + "step": 1130 + }, + { + "epoch": 0.0745829244357213, + "grad_norm": 1.0473101139068604, + "learning_rate": 1.2655555555555557e-05, + "loss": 0.1397, + "step": 1140 + }, + { + "epoch": 0.07523716061498201, + "grad_norm": 0.972935140132904, + "learning_rate": 1.276666666666667e-05, + "loss": 0.1394, + "step": 1150 + }, + { + "epoch": 0.07589139679424273, + "grad_norm": 0.9036497473716736, + "learning_rate": 1.2877777777777778e-05, + "loss": 0.1465, + "step": 1160 + }, + { + "epoch": 0.07654563297350343, + "grad_norm": 1.1967897415161133, + "learning_rate": 1.298888888888889e-05, + "loss": 0.1332, + "step": 1170 + }, + { + "epoch": 0.07719986915276415, + "grad_norm": 0.875632643699646, + "learning_rate": 1.3100000000000002e-05, + "loss": 0.14, + "step": 1180 + }, + { + "epoch": 0.07785410533202486, + "grad_norm": 0.9649753570556641, + "learning_rate": 1.3211111111111114e-05, + "loss": 0.1324, + "step": 1190 + }, + { + "epoch": 0.07850834151128558, + "grad_norm": 0.8261013627052307, + "learning_rate": 1.3322222222222222e-05, + "loss": 0.1333, + "step": 1200 + }, + { + "epoch": 0.07916257769054628, + "grad_norm": 1.0080054998397827, + "learning_rate": 1.3433333333333334e-05, + "loss": 0.1367, + "step": 1210 + }, + { + "epoch": 0.079816813869807, + "grad_norm": 1.0562273263931274, + "learning_rate": 1.3544444444444446e-05, + "loss": 0.1329, + "step": 1220 + }, + { + "epoch": 0.08047105004906771, + "grad_norm": 0.9354407787322998, + "learning_rate": 1.3655555555555558e-05, + "loss": 0.134, + "step": 1230 + }, + { + "epoch": 0.08112528622832843, + "grad_norm": 0.9830518364906311, + "learning_rate": 1.3766666666666666e-05, + "loss": 0.135, + "step": 1240 + }, + { + "epoch": 0.08177952240758914, + "grad_norm": 0.951457679271698, + "learning_rate": 1.3877777777777778e-05, + "loss": 0.1339, + "step": 1250 + }, + { + "epoch": 0.08243375858684986, + "grad_norm": 0.887077271938324, + "learning_rate": 1.398888888888889e-05, + "loss": 0.1436, + "step": 1260 + }, + { + "epoch": 0.08308799476611056, + "grad_norm": 0.9000831842422485, + "learning_rate": 1.4099999999999999e-05, + "loss": 0.1367, + "step": 1270 + }, + { + "epoch": 0.08374223094537128, + "grad_norm": 1.0145295858383179, + "learning_rate": 1.421111111111111e-05, + "loss": 0.1491, + "step": 1280 + }, + { + "epoch": 0.08439646712463199, + "grad_norm": 0.8529645800590515, + "learning_rate": 1.4322222222222223e-05, + "loss": 0.1278, + "step": 1290 + }, + { + "epoch": 0.08505070330389271, + "grad_norm": 1.141903042793274, + "learning_rate": 1.4433333333333335e-05, + "loss": 0.1319, + "step": 1300 + }, + { + "epoch": 0.08570493948315341, + "grad_norm": 1.0339152812957764, + "learning_rate": 1.4544444444444443e-05, + "loss": 0.1362, + "step": 1310 + }, + { + "epoch": 0.08635917566241413, + "grad_norm": 0.9347769021987915, + "learning_rate": 1.4655555555555555e-05, + "loss": 0.1382, + "step": 1320 + }, + { + "epoch": 0.08701341184167484, + "grad_norm": 1.0565035343170166, + "learning_rate": 1.4766666666666667e-05, + "loss": 0.1321, + "step": 1330 + }, + { + "epoch": 0.08766764802093556, + "grad_norm": 0.9495023488998413, + "learning_rate": 1.4877777777777779e-05, + "loss": 0.1486, + "step": 1340 + }, + { + "epoch": 0.08832188420019627, + "grad_norm": 0.9335034489631653, + "learning_rate": 1.498888888888889e-05, + "loss": 0.1413, + "step": 1350 + }, + { + "epoch": 0.08897612037945699, + "grad_norm": 0.8525314331054688, + "learning_rate": 1.51e-05, + "loss": 0.1265, + "step": 1360 + }, + { + "epoch": 0.08963035655871769, + "grad_norm": 1.1363369226455688, + "learning_rate": 1.5211111111111111e-05, + "loss": 0.1397, + "step": 1370 + }, + { + "epoch": 0.09028459273797841, + "grad_norm": 0.892582893371582, + "learning_rate": 1.5322222222222225e-05, + "loss": 0.1314, + "step": 1380 + }, + { + "epoch": 0.09093882891723912, + "grad_norm": 1.058522343635559, + "learning_rate": 1.5433333333333334e-05, + "loss": 0.1356, + "step": 1390 + }, + { + "epoch": 0.09159306509649984, + "grad_norm": 0.9127554297447205, + "learning_rate": 1.5544444444444445e-05, + "loss": 0.1368, + "step": 1400 + }, + { + "epoch": 0.09224730127576054, + "grad_norm": 1.5529781579971313, + "learning_rate": 1.5655555555555557e-05, + "loss": 0.1406, + "step": 1410 + }, + { + "epoch": 0.09290153745502126, + "grad_norm": 0.9582400321960449, + "learning_rate": 1.576666666666667e-05, + "loss": 0.1315, + "step": 1420 + }, + { + "epoch": 0.09355577363428197, + "grad_norm": 0.9931496977806091, + "learning_rate": 1.5877777777777778e-05, + "loss": 0.1327, + "step": 1430 + }, + { + "epoch": 0.09421000981354269, + "grad_norm": 0.9021315574645996, + "learning_rate": 1.598888888888889e-05, + "loss": 0.1335, + "step": 1440 + }, + { + "epoch": 0.09486424599280341, + "grad_norm": 0.9401276707649231, + "learning_rate": 1.6100000000000002e-05, + "loss": 0.1272, + "step": 1450 + }, + { + "epoch": 0.09551848217206411, + "grad_norm": 0.9633158445358276, + "learning_rate": 1.6211111111111114e-05, + "loss": 0.1359, + "step": 1460 + }, + { + "epoch": 0.09617271835132483, + "grad_norm": 1.0396912097930908, + "learning_rate": 1.6322222222222222e-05, + "loss": 0.1298, + "step": 1470 + }, + { + "epoch": 0.09682695453058554, + "grad_norm": 0.9414869546890259, + "learning_rate": 1.6433333333333334e-05, + "loss": 0.1288, + "step": 1480 + }, + { + "epoch": 0.09748119070984626, + "grad_norm": 0.8435006737709045, + "learning_rate": 1.6544444444444446e-05, + "loss": 0.1303, + "step": 1490 + }, + { + "epoch": 0.09813542688910697, + "grad_norm": 0.9831594824790955, + "learning_rate": 1.6655555555555558e-05, + "loss": 0.1439, + "step": 1500 + }, + { + "epoch": 0.09878966306836769, + "grad_norm": 1.0466912984848022, + "learning_rate": 1.6766666666666667e-05, + "loss": 0.1277, + "step": 1510 + }, + { + "epoch": 0.09944389924762839, + "grad_norm": 0.9995728731155396, + "learning_rate": 1.687777777777778e-05, + "loss": 0.1297, + "step": 1520 + }, + { + "epoch": 0.10009813542688911, + "grad_norm": 1.018913745880127, + "learning_rate": 1.698888888888889e-05, + "loss": 0.1341, + "step": 1530 + }, + { + "epoch": 0.10075237160614982, + "grad_norm": 1.1046700477600098, + "learning_rate": 1.7100000000000002e-05, + "loss": 0.1332, + "step": 1540 + }, + { + "epoch": 0.10140660778541054, + "grad_norm": 1.0834091901779175, + "learning_rate": 1.721111111111111e-05, + "loss": 0.1349, + "step": 1550 + }, + { + "epoch": 0.10206084396467124, + "grad_norm": 1.079143762588501, + "learning_rate": 1.7322222222222223e-05, + "loss": 0.1398, + "step": 1560 + }, + { + "epoch": 0.10271508014393196, + "grad_norm": 1.0492314100265503, + "learning_rate": 1.7433333333333335e-05, + "loss": 0.1411, + "step": 1570 + }, + { + "epoch": 0.10336931632319267, + "grad_norm": 1.171302318572998, + "learning_rate": 1.7544444444444443e-05, + "loss": 0.1312, + "step": 1580 + }, + { + "epoch": 0.10402355250245339, + "grad_norm": 1.0378974676132202, + "learning_rate": 1.7655555555555555e-05, + "loss": 0.1254, + "step": 1590 + }, + { + "epoch": 0.1046777886817141, + "grad_norm": 0.9647935032844543, + "learning_rate": 1.7766666666666667e-05, + "loss": 0.1444, + "step": 1600 + }, + { + "epoch": 0.10533202486097482, + "grad_norm": 0.9934017658233643, + "learning_rate": 1.787777777777778e-05, + "loss": 0.1344, + "step": 1610 + }, + { + "epoch": 0.10598626104023552, + "grad_norm": 1.0105055570602417, + "learning_rate": 1.7988888888888888e-05, + "loss": 0.1397, + "step": 1620 + }, + { + "epoch": 0.10664049721949624, + "grad_norm": 1.05049729347229, + "learning_rate": 1.81e-05, + "loss": 0.1348, + "step": 1630 + }, + { + "epoch": 0.10729473339875695, + "grad_norm": 1.0454671382904053, + "learning_rate": 1.821111111111111e-05, + "loss": 0.1239, + "step": 1640 + }, + { + "epoch": 0.10794896957801767, + "grad_norm": 1.0142239332199097, + "learning_rate": 1.8322222222222223e-05, + "loss": 0.1148, + "step": 1650 + }, + { + "epoch": 0.10860320575727837, + "grad_norm": 1.1496508121490479, + "learning_rate": 1.8433333333333332e-05, + "loss": 0.1308, + "step": 1660 + }, + { + "epoch": 0.1092574419365391, + "grad_norm": 0.9551987051963806, + "learning_rate": 1.8544444444444444e-05, + "loss": 0.1339, + "step": 1670 + }, + { + "epoch": 0.1099116781157998, + "grad_norm": 0.8923566937446594, + "learning_rate": 1.8655555555555556e-05, + "loss": 0.1286, + "step": 1680 + }, + { + "epoch": 0.11056591429506052, + "grad_norm": 1.0510790348052979, + "learning_rate": 1.8766666666666668e-05, + "loss": 0.1362, + "step": 1690 + }, + { + "epoch": 0.11122015047432122, + "grad_norm": 0.9121619462966919, + "learning_rate": 1.8877777777777776e-05, + "loss": 0.1273, + "step": 1700 + }, + { + "epoch": 0.11187438665358194, + "grad_norm": 1.1024173498153687, + "learning_rate": 1.8988888888888888e-05, + "loss": 0.1253, + "step": 1710 + }, + { + "epoch": 0.11252862283284265, + "grad_norm": 1.032799482345581, + "learning_rate": 1.91e-05, + "loss": 0.1313, + "step": 1720 + }, + { + "epoch": 0.11318285901210337, + "grad_norm": 1.0584205389022827, + "learning_rate": 1.9211111111111112e-05, + "loss": 0.1342, + "step": 1730 + }, + { + "epoch": 0.11383709519136408, + "grad_norm": 0.8857603669166565, + "learning_rate": 1.932222222222222e-05, + "loss": 0.1308, + "step": 1740 + }, + { + "epoch": 0.1144913313706248, + "grad_norm": 0.9453005194664001, + "learning_rate": 1.9433333333333332e-05, + "loss": 0.1181, + "step": 1750 + }, + { + "epoch": 0.1151455675498855, + "grad_norm": 1.0007144212722778, + "learning_rate": 1.9544444444444444e-05, + "loss": 0.1233, + "step": 1760 + }, + { + "epoch": 0.11579980372914622, + "grad_norm": 1.0010242462158203, + "learning_rate": 1.9655555555555556e-05, + "loss": 0.1218, + "step": 1770 + }, + { + "epoch": 0.11645403990840693, + "grad_norm": 0.9568318128585815, + "learning_rate": 1.9766666666666668e-05, + "loss": 0.127, + "step": 1780 + }, + { + "epoch": 0.11710827608766765, + "grad_norm": 0.9792290925979614, + "learning_rate": 1.9877777777777777e-05, + "loss": 0.135, + "step": 1790 + }, + { + "epoch": 0.11776251226692837, + "grad_norm": 1.0170482397079468, + "learning_rate": 1.998888888888889e-05, + "loss": 0.1401, + "step": 1800 + }, + { + "epoch": 0.11841674844618907, + "grad_norm": 0.949272632598877, + "learning_rate": 2.01e-05, + "loss": 0.1319, + "step": 1810 + }, + { + "epoch": 0.1190709846254498, + "grad_norm": 0.8818572163581848, + "learning_rate": 2.0211111111111113e-05, + "loss": 0.1277, + "step": 1820 + }, + { + "epoch": 0.1197252208047105, + "grad_norm": 0.9168890118598938, + "learning_rate": 2.0322222222222225e-05, + "loss": 0.1168, + "step": 1830 + }, + { + "epoch": 0.12037945698397122, + "grad_norm": 1.180184006690979, + "learning_rate": 2.0433333333333336e-05, + "loss": 0.1281, + "step": 1840 + }, + { + "epoch": 0.12103369316323193, + "grad_norm": 1.0016313791275024, + "learning_rate": 2.054444444444445e-05, + "loss": 0.1351, + "step": 1850 + }, + { + "epoch": 0.12168792934249265, + "grad_norm": 0.9246931672096252, + "learning_rate": 2.0655555555555557e-05, + "loss": 0.1311, + "step": 1860 + }, + { + "epoch": 0.12234216552175335, + "grad_norm": 1.3594558238983154, + "learning_rate": 2.076666666666667e-05, + "loss": 0.1414, + "step": 1870 + }, + { + "epoch": 0.12299640170101407, + "grad_norm": 0.9774754643440247, + "learning_rate": 2.087777777777778e-05, + "loss": 0.1155, + "step": 1880 + }, + { + "epoch": 0.12365063788027478, + "grad_norm": 1.1299134492874146, + "learning_rate": 2.0988888888888893e-05, + "loss": 0.1167, + "step": 1890 + }, + { + "epoch": 0.1243048740595355, + "grad_norm": 0.9047524333000183, + "learning_rate": 2.11e-05, + "loss": 0.1306, + "step": 1900 + }, + { + "epoch": 0.1249591102387962, + "grad_norm": 1.080834150314331, + "learning_rate": 2.1211111111111113e-05, + "loss": 0.1234, + "step": 1910 + }, + { + "epoch": 0.1256133464180569, + "grad_norm": 0.8716378211975098, + "learning_rate": 2.1322222222222225e-05, + "loss": 0.1265, + "step": 1920 + }, + { + "epoch": 0.12626758259731763, + "grad_norm": 0.9985718727111816, + "learning_rate": 2.1433333333333334e-05, + "loss": 0.1284, + "step": 1930 + }, + { + "epoch": 0.12692181877657835, + "grad_norm": 1.0242950916290283, + "learning_rate": 2.1544444444444446e-05, + "loss": 0.1267, + "step": 1940 + }, + { + "epoch": 0.12757605495583907, + "grad_norm": 1.1003845930099487, + "learning_rate": 2.1655555555555558e-05, + "loss": 0.1273, + "step": 1950 + }, + { + "epoch": 0.12823029113509976, + "grad_norm": 0.8267995119094849, + "learning_rate": 2.176666666666667e-05, + "loss": 0.1261, + "step": 1960 + }, + { + "epoch": 0.12888452731436048, + "grad_norm": 1.2606829404830933, + "learning_rate": 2.1877777777777778e-05, + "loss": 0.1166, + "step": 1970 + }, + { + "epoch": 0.1295387634936212, + "grad_norm": 0.8288832306861877, + "learning_rate": 2.198888888888889e-05, + "loss": 0.1128, + "step": 1980 + }, + { + "epoch": 0.13019299967288192, + "grad_norm": 1.1182180643081665, + "learning_rate": 2.2100000000000002e-05, + "loss": 0.1281, + "step": 1990 + }, + { + "epoch": 0.1308472358521426, + "grad_norm": 0.8783948421478271, + "learning_rate": 2.2211111111111114e-05, + "loss": 0.1277, + "step": 2000 + }, + { + "epoch": 0.13150147203140333, + "grad_norm": 1.0937625169754028, + "learning_rate": 2.2322222222222222e-05, + "loss": 0.1294, + "step": 2010 + }, + { + "epoch": 0.13215570821066405, + "grad_norm": 0.9571072459220886, + "learning_rate": 2.2433333333333334e-05, + "loss": 0.1109, + "step": 2020 + }, + { + "epoch": 0.13280994438992477, + "grad_norm": 1.1977931261062622, + "learning_rate": 2.2544444444444446e-05, + "loss": 0.1331, + "step": 2030 + }, + { + "epoch": 0.13346418056918546, + "grad_norm": 0.9299104809761047, + "learning_rate": 2.2655555555555558e-05, + "loss": 0.1351, + "step": 2040 + }, + { + "epoch": 0.13411841674844618, + "grad_norm": 0.8753429651260376, + "learning_rate": 2.2766666666666667e-05, + "loss": 0.1365, + "step": 2050 + }, + { + "epoch": 0.1347726529277069, + "grad_norm": 1.0041842460632324, + "learning_rate": 2.287777777777778e-05, + "loss": 0.1303, + "step": 2060 + }, + { + "epoch": 0.13542688910696762, + "grad_norm": 1.1017142534255981, + "learning_rate": 2.298888888888889e-05, + "loss": 0.124, + "step": 2070 + }, + { + "epoch": 0.13608112528622832, + "grad_norm": 1.038955569267273, + "learning_rate": 2.3100000000000002e-05, + "loss": 0.1275, + "step": 2080 + }, + { + "epoch": 0.13673536146548904, + "grad_norm": 0.8835306763648987, + "learning_rate": 2.321111111111111e-05, + "loss": 0.1292, + "step": 2090 + }, + { + "epoch": 0.13738959764474976, + "grad_norm": 1.067949652671814, + "learning_rate": 2.3322222222222223e-05, + "loss": 0.1308, + "step": 2100 + }, + { + "epoch": 0.13804383382401048, + "grad_norm": 0.9632567167282104, + "learning_rate": 2.3433333333333335e-05, + "loss": 0.1389, + "step": 2110 + }, + { + "epoch": 0.13869807000327117, + "grad_norm": 0.9398028254508972, + "learning_rate": 2.3544444444444447e-05, + "loss": 0.1242, + "step": 2120 + }, + { + "epoch": 0.1393523061825319, + "grad_norm": 0.9722338914871216, + "learning_rate": 2.3655555555555555e-05, + "loss": 0.1179, + "step": 2130 + }, + { + "epoch": 0.1400065423617926, + "grad_norm": 0.8309763669967651, + "learning_rate": 2.3766666666666667e-05, + "loss": 0.1232, + "step": 2140 + }, + { + "epoch": 0.14066077854105333, + "grad_norm": 0.8105255961418152, + "learning_rate": 2.387777777777778e-05, + "loss": 0.1308, + "step": 2150 + }, + { + "epoch": 0.14131501472031405, + "grad_norm": 0.8585493564605713, + "learning_rate": 2.398888888888889e-05, + "loss": 0.1196, + "step": 2160 + }, + { + "epoch": 0.14196925089957474, + "grad_norm": 0.9327337145805359, + "learning_rate": 2.41e-05, + "loss": 0.1212, + "step": 2170 + }, + { + "epoch": 0.14262348707883546, + "grad_norm": 0.867504358291626, + "learning_rate": 2.421111111111111e-05, + "loss": 0.1131, + "step": 2180 + }, + { + "epoch": 0.14327772325809618, + "grad_norm": 0.8426870703697205, + "learning_rate": 2.4322222222222224e-05, + "loss": 0.1239, + "step": 2190 + }, + { + "epoch": 0.1439319594373569, + "grad_norm": 0.9085756540298462, + "learning_rate": 2.4433333333333335e-05, + "loss": 0.1288, + "step": 2200 + }, + { + "epoch": 0.1445861956166176, + "grad_norm": 0.9379426836967468, + "learning_rate": 2.4544444444444444e-05, + "loss": 0.1168, + "step": 2210 + }, + { + "epoch": 0.1452404317958783, + "grad_norm": 0.9728325009346008, + "learning_rate": 2.4655555555555556e-05, + "loss": 0.1237, + "step": 2220 + }, + { + "epoch": 0.14589466797513903, + "grad_norm": 0.9717603921890259, + "learning_rate": 2.4766666666666668e-05, + "loss": 0.1172, + "step": 2230 + }, + { + "epoch": 0.14654890415439975, + "grad_norm": 0.9177740812301636, + "learning_rate": 2.4877777777777776e-05, + "loss": 0.1172, + "step": 2240 + }, + { + "epoch": 0.14720314033366044, + "grad_norm": 1.4036056995391846, + "learning_rate": 2.498888888888889e-05, + "loss": 0.125, + "step": 2250 + }, + { + "epoch": 0.14785737651292116, + "grad_norm": 1.0242791175842285, + "learning_rate": 2.51e-05, + "loss": 0.1333, + "step": 2260 + }, + { + "epoch": 0.14851161269218188, + "grad_norm": 0.8376342058181763, + "learning_rate": 2.5211111111111112e-05, + "loss": 0.1227, + "step": 2270 + }, + { + "epoch": 0.1491658488714426, + "grad_norm": 0.9228038191795349, + "learning_rate": 2.5322222222222224e-05, + "loss": 0.1296, + "step": 2280 + }, + { + "epoch": 0.1498200850507033, + "grad_norm": 0.8739307522773743, + "learning_rate": 2.5433333333333336e-05, + "loss": 0.1086, + "step": 2290 + }, + { + "epoch": 0.15047432122996401, + "grad_norm": 0.9078662991523743, + "learning_rate": 2.5544444444444445e-05, + "loss": 0.1307, + "step": 2300 + }, + { + "epoch": 0.15112855740922473, + "grad_norm": 1.0658289194107056, + "learning_rate": 2.5655555555555557e-05, + "loss": 0.128, + "step": 2310 + }, + { + "epoch": 0.15178279358848545, + "grad_norm": 0.8755213618278503, + "learning_rate": 2.5766666666666665e-05, + "loss": 0.1305, + "step": 2320 + }, + { + "epoch": 0.15243702976774615, + "grad_norm": 1.0628185272216797, + "learning_rate": 2.5877777777777777e-05, + "loss": 0.1244, + "step": 2330 + }, + { + "epoch": 0.15309126594700687, + "grad_norm": 1.0583535432815552, + "learning_rate": 2.598888888888889e-05, + "loss": 0.1257, + "step": 2340 + }, + { + "epoch": 0.15374550212626759, + "grad_norm": 0.9537503719329834, + "learning_rate": 2.61e-05, + "loss": 0.1241, + "step": 2350 + }, + { + "epoch": 0.1543997383055283, + "grad_norm": 1.0985156297683716, + "learning_rate": 2.6211111111111113e-05, + "loss": 0.1359, + "step": 2360 + }, + { + "epoch": 0.155053974484789, + "grad_norm": 0.9252329468727112, + "learning_rate": 2.6322222222222225e-05, + "loss": 0.1317, + "step": 2370 + }, + { + "epoch": 0.15570821066404972, + "grad_norm": 1.1681864261627197, + "learning_rate": 2.6433333333333333e-05, + "loss": 0.1242, + "step": 2380 + }, + { + "epoch": 0.15636244684331044, + "grad_norm": 1.2066115140914917, + "learning_rate": 2.6544444444444445e-05, + "loss": 0.1152, + "step": 2390 + }, + { + "epoch": 0.15701668302257116, + "grad_norm": 1.1337898969650269, + "learning_rate": 2.6655555555555557e-05, + "loss": 0.1334, + "step": 2400 + }, + { + "epoch": 0.15767091920183185, + "grad_norm": 0.9998272657394409, + "learning_rate": 2.676666666666667e-05, + "loss": 0.1189, + "step": 2410 + }, + { + "epoch": 0.15832515538109257, + "grad_norm": 0.8952450752258301, + "learning_rate": 2.687777777777778e-05, + "loss": 0.1227, + "step": 2420 + }, + { + "epoch": 0.1589793915603533, + "grad_norm": 1.0793391466140747, + "learning_rate": 2.6988888888888893e-05, + "loss": 0.1198, + "step": 2430 + }, + { + "epoch": 0.159633627739614, + "grad_norm": 1.016587734222412, + "learning_rate": 2.7100000000000005e-05, + "loss": 0.1208, + "step": 2440 + }, + { + "epoch": 0.1602878639188747, + "grad_norm": 1.0260379314422607, + "learning_rate": 2.7211111111111113e-05, + "loss": 0.1258, + "step": 2450 + }, + { + "epoch": 0.16094210009813542, + "grad_norm": 0.9341796636581421, + "learning_rate": 2.7322222222222222e-05, + "loss": 0.1172, + "step": 2460 + }, + { + "epoch": 0.16159633627739614, + "grad_norm": 1.0528295040130615, + "learning_rate": 2.7433333333333334e-05, + "loss": 0.124, + "step": 2470 + }, + { + "epoch": 0.16225057245665686, + "grad_norm": 0.831092894077301, + "learning_rate": 2.7544444444444446e-05, + "loss": 0.1143, + "step": 2480 + }, + { + "epoch": 0.16290480863591755, + "grad_norm": 1.0132439136505127, + "learning_rate": 2.7655555555555558e-05, + "loss": 0.134, + "step": 2490 + }, + { + "epoch": 0.16355904481517827, + "grad_norm": 1.041785717010498, + "learning_rate": 2.776666666666667e-05, + "loss": 0.1328, + "step": 2500 + }, + { + "epoch": 0.164213280994439, + "grad_norm": 1.1009939908981323, + "learning_rate": 2.787777777777778e-05, + "loss": 0.1262, + "step": 2510 + }, + { + "epoch": 0.1648675171736997, + "grad_norm": 1.0970548391342163, + "learning_rate": 2.7988888888888893e-05, + "loss": 0.132, + "step": 2520 + }, + { + "epoch": 0.16552175335296043, + "grad_norm": 1.0594844818115234, + "learning_rate": 2.8100000000000005e-05, + "loss": 0.1267, + "step": 2530 + }, + { + "epoch": 0.16617598953222112, + "grad_norm": 1.1462485790252686, + "learning_rate": 2.821111111111111e-05, + "loss": 0.1214, + "step": 2540 + }, + { + "epoch": 0.16683022571148184, + "grad_norm": 1.044067621231079, + "learning_rate": 2.8322222222222222e-05, + "loss": 0.1287, + "step": 2550 + }, + { + "epoch": 0.16748446189074256, + "grad_norm": 1.0372158288955688, + "learning_rate": 2.8433333333333334e-05, + "loss": 0.1297, + "step": 2560 + }, + { + "epoch": 0.16813869807000328, + "grad_norm": 1.0640093088150024, + "learning_rate": 2.8544444444444446e-05, + "loss": 0.1266, + "step": 2570 + }, + { + "epoch": 0.16879293424926398, + "grad_norm": 0.9317982792854309, + "learning_rate": 2.8655555555555558e-05, + "loss": 0.1283, + "step": 2580 + }, + { + "epoch": 0.1694471704285247, + "grad_norm": 1.5074639320373535, + "learning_rate": 2.876666666666667e-05, + "loss": 0.1105, + "step": 2590 + }, + { + "epoch": 0.17010140660778542, + "grad_norm": 1.1034104824066162, + "learning_rate": 2.8877777777777782e-05, + "loss": 0.1186, + "step": 2600 + }, + { + "epoch": 0.17075564278704614, + "grad_norm": 0.9475632905960083, + "learning_rate": 2.8988888888888887e-05, + "loss": 0.1191, + "step": 2610 + }, + { + "epoch": 0.17140987896630683, + "grad_norm": 0.9753337502479553, + "learning_rate": 2.91e-05, + "loss": 0.115, + "step": 2620 + }, + { + "epoch": 0.17206411514556755, + "grad_norm": 1.0091173648834229, + "learning_rate": 2.921111111111111e-05, + "loss": 0.1132, + "step": 2630 + }, + { + "epoch": 0.17271835132482827, + "grad_norm": 0.8938674330711365, + "learning_rate": 2.9322222222222223e-05, + "loss": 0.1221, + "step": 2640 + }, + { + "epoch": 0.173372587504089, + "grad_norm": 0.9849511384963989, + "learning_rate": 2.9433333333333335e-05, + "loss": 0.1161, + "step": 2650 + }, + { + "epoch": 0.17402682368334968, + "grad_norm": 0.9725368618965149, + "learning_rate": 2.9544444444444447e-05, + "loss": 0.1117, + "step": 2660 + }, + { + "epoch": 0.1746810598626104, + "grad_norm": 0.8807811737060547, + "learning_rate": 2.965555555555556e-05, + "loss": 0.1251, + "step": 2670 + }, + { + "epoch": 0.17533529604187112, + "grad_norm": 1.1785404682159424, + "learning_rate": 2.976666666666667e-05, + "loss": 0.1219, + "step": 2680 + }, + { + "epoch": 0.17598953222113184, + "grad_norm": 0.9174340963363647, + "learning_rate": 2.9877777777777776e-05, + "loss": 0.1099, + "step": 2690 + }, + { + "epoch": 0.17664376840039253, + "grad_norm": 1.0297743082046509, + "learning_rate": 2.9988888888888888e-05, + "loss": 0.1163, + "step": 2700 + }, + { + "epoch": 0.17729800457965325, + "grad_norm": 0.8429158926010132, + "learning_rate": 3.01e-05, + "loss": 0.1147, + "step": 2710 + }, + { + "epoch": 0.17795224075891397, + "grad_norm": 1.0993781089782715, + "learning_rate": 3.0211111111111112e-05, + "loss": 0.1241, + "step": 2720 + }, + { + "epoch": 0.1786064769381747, + "grad_norm": 1.0947730541229248, + "learning_rate": 3.0322222222222224e-05, + "loss": 0.1137, + "step": 2730 + }, + { + "epoch": 0.17926071311743538, + "grad_norm": 1.0554438829421997, + "learning_rate": 3.0433333333333336e-05, + "loss": 0.1211, + "step": 2740 + }, + { + "epoch": 0.1799149492966961, + "grad_norm": 0.987632155418396, + "learning_rate": 3.054444444444445e-05, + "loss": 0.1207, + "step": 2750 + }, + { + "epoch": 0.18056918547595682, + "grad_norm": 1.0157499313354492, + "learning_rate": 3.065555555555556e-05, + "loss": 0.1182, + "step": 2760 + }, + { + "epoch": 0.18122342165521754, + "grad_norm": 0.8833321332931519, + "learning_rate": 3.0766666666666665e-05, + "loss": 0.1202, + "step": 2770 + }, + { + "epoch": 0.18187765783447823, + "grad_norm": 1.0868935585021973, + "learning_rate": 3.087777777777778e-05, + "loss": 0.1207, + "step": 2780 + }, + { + "epoch": 0.18253189401373895, + "grad_norm": 1.1105583906173706, + "learning_rate": 3.098888888888889e-05, + "loss": 0.1242, + "step": 2790 + }, + { + "epoch": 0.18318613019299967, + "grad_norm": 1.014053225517273, + "learning_rate": 3.1100000000000004e-05, + "loss": 0.1176, + "step": 2800 + }, + { + "epoch": 0.1838403663722604, + "grad_norm": 1.2380222082138062, + "learning_rate": 3.121111111111111e-05, + "loss": 0.1231, + "step": 2810 + }, + { + "epoch": 0.1844946025515211, + "grad_norm": 0.9214850664138794, + "learning_rate": 3.132222222222223e-05, + "loss": 0.1258, + "step": 2820 + }, + { + "epoch": 0.1851488387307818, + "grad_norm": 0.9159259796142578, + "learning_rate": 3.1433333333333336e-05, + "loss": 0.1199, + "step": 2830 + }, + { + "epoch": 0.18580307491004253, + "grad_norm": 1.0562981367111206, + "learning_rate": 3.154444444444445e-05, + "loss": 0.12, + "step": 2840 + }, + { + "epoch": 0.18645731108930325, + "grad_norm": 0.9648650288581848, + "learning_rate": 3.165555555555555e-05, + "loss": 0.1168, + "step": 2850 + }, + { + "epoch": 0.18711154726856394, + "grad_norm": 1.028402328491211, + "learning_rate": 3.176666666666667e-05, + "loss": 0.1281, + "step": 2860 + }, + { + "epoch": 0.18776578344782466, + "grad_norm": 0.9670931696891785, + "learning_rate": 3.187777777777778e-05, + "loss": 0.1236, + "step": 2870 + }, + { + "epoch": 0.18842001962708538, + "grad_norm": 0.9317355751991272, + "learning_rate": 3.198888888888889e-05, + "loss": 0.113, + "step": 2880 + }, + { + "epoch": 0.1890742558063461, + "grad_norm": 0.978927493095398, + "learning_rate": 3.21e-05, + "loss": 0.1223, + "step": 2890 + }, + { + "epoch": 0.18972849198560682, + "grad_norm": 0.9678372740745544, + "learning_rate": 3.2211111111111116e-05, + "loss": 0.1246, + "step": 2900 + }, + { + "epoch": 0.1903827281648675, + "grad_norm": 0.856540322303772, + "learning_rate": 3.2322222222222225e-05, + "loss": 0.1192, + "step": 2910 + }, + { + "epoch": 0.19103696434412823, + "grad_norm": 1.0986219644546509, + "learning_rate": 3.243333333333333e-05, + "loss": 0.1189, + "step": 2920 + }, + { + "epoch": 0.19169120052338895, + "grad_norm": 0.9947769641876221, + "learning_rate": 3.254444444444444e-05, + "loss": 0.1222, + "step": 2930 + }, + { + "epoch": 0.19234543670264967, + "grad_norm": 0.9733787178993225, + "learning_rate": 3.265555555555556e-05, + "loss": 0.1214, + "step": 2940 + }, + { + "epoch": 0.19299967288191036, + "grad_norm": 1.0199573040008545, + "learning_rate": 3.2766666666666666e-05, + "loss": 0.1304, + "step": 2950 + }, + { + "epoch": 0.19365390906117108, + "grad_norm": 0.8536534905433655, + "learning_rate": 3.287777777777778e-05, + "loss": 0.1093, + "step": 2960 + }, + { + "epoch": 0.1943081452404318, + "grad_norm": 0.8914840817451477, + "learning_rate": 3.298888888888889e-05, + "loss": 0.1064, + "step": 2970 + }, + { + "epoch": 0.19496238141969252, + "grad_norm": 0.9319343566894531, + "learning_rate": 3.3100000000000005e-05, + "loss": 0.118, + "step": 2980 + }, + { + "epoch": 0.1956166175989532, + "grad_norm": 1.0728380680084229, + "learning_rate": 3.3211111111111114e-05, + "loss": 0.1202, + "step": 2990 + }, + { + "epoch": 0.19627085377821393, + "grad_norm": 0.9401952624320984, + "learning_rate": 3.332222222222222e-05, + "loss": 0.1173, + "step": 3000 + }, + { + "epoch": 0.19692508995747465, + "grad_norm": 0.9090434312820435, + "learning_rate": 3.343333333333333e-05, + "loss": 0.1285, + "step": 3010 + }, + { + "epoch": 0.19757932613673537, + "grad_norm": 1.0409756898880005, + "learning_rate": 3.3544444444444446e-05, + "loss": 0.1172, + "step": 3020 + }, + { + "epoch": 0.19823356231599606, + "grad_norm": 1.026477336883545, + "learning_rate": 3.3655555555555554e-05, + "loss": 0.1277, + "step": 3030 + }, + { + "epoch": 0.19888779849525678, + "grad_norm": 1.1095457077026367, + "learning_rate": 3.376666666666667e-05, + "loss": 0.1131, + "step": 3040 + }, + { + "epoch": 0.1995420346745175, + "grad_norm": 1.1334766149520874, + "learning_rate": 3.387777777777778e-05, + "loss": 0.1327, + "step": 3050 + }, + { + "epoch": 0.20019627085377822, + "grad_norm": 0.8890497088432312, + "learning_rate": 3.3988888888888894e-05, + "loss": 0.1209, + "step": 3060 + }, + { + "epoch": 0.20085050703303892, + "grad_norm": 0.8564671874046326, + "learning_rate": 3.41e-05, + "loss": 0.1148, + "step": 3070 + }, + { + "epoch": 0.20150474321229964, + "grad_norm": 0.9905887842178345, + "learning_rate": 3.421111111111111e-05, + "loss": 0.1045, + "step": 3080 + }, + { + "epoch": 0.20215897939156036, + "grad_norm": 1.105401873588562, + "learning_rate": 3.432222222222222e-05, + "loss": 0.1101, + "step": 3090 + }, + { + "epoch": 0.20281321557082108, + "grad_norm": 1.2804734706878662, + "learning_rate": 3.4433333333333335e-05, + "loss": 0.1264, + "step": 3100 + }, + { + "epoch": 0.20346745175008177, + "grad_norm": 0.8999707698822021, + "learning_rate": 3.454444444444444e-05, + "loss": 0.1089, + "step": 3110 + }, + { + "epoch": 0.2041216879293425, + "grad_norm": 1.0667651891708374, + "learning_rate": 3.465555555555556e-05, + "loss": 0.1171, + "step": 3120 + }, + { + "epoch": 0.2047759241086032, + "grad_norm": 0.9714770913124084, + "learning_rate": 3.476666666666667e-05, + "loss": 0.1118, + "step": 3130 + }, + { + "epoch": 0.20543016028786393, + "grad_norm": 1.0278006792068481, + "learning_rate": 3.487777777777778e-05, + "loss": 0.1239, + "step": 3140 + }, + { + "epoch": 0.20608439646712462, + "grad_norm": 1.0025670528411865, + "learning_rate": 3.498888888888889e-05, + "loss": 0.1171, + "step": 3150 + }, + { + "epoch": 0.20673863264638534, + "grad_norm": 1.191396713256836, + "learning_rate": 3.51e-05, + "loss": 0.1208, + "step": 3160 + }, + { + "epoch": 0.20739286882564606, + "grad_norm": 0.9984032511711121, + "learning_rate": 3.5211111111111115e-05, + "loss": 0.1164, + "step": 3170 + }, + { + "epoch": 0.20804710500490678, + "grad_norm": 0.863865852355957, + "learning_rate": 3.532222222222222e-05, + "loss": 0.1043, + "step": 3180 + }, + { + "epoch": 0.20870134118416747, + "grad_norm": 1.0260671377182007, + "learning_rate": 3.543333333333333e-05, + "loss": 0.1231, + "step": 3190 + }, + { + "epoch": 0.2093555773634282, + "grad_norm": 1.0903806686401367, + "learning_rate": 3.554444444444445e-05, + "loss": 0.1176, + "step": 3200 + }, + { + "epoch": 0.2100098135426889, + "grad_norm": 1.0024579763412476, + "learning_rate": 3.5655555555555556e-05, + "loss": 0.1153, + "step": 3210 + }, + { + "epoch": 0.21066404972194963, + "grad_norm": 1.0972620248794556, + "learning_rate": 3.576666666666667e-05, + "loss": 0.1191, + "step": 3220 + }, + { + "epoch": 0.21131828590121032, + "grad_norm": 0.980950653553009, + "learning_rate": 3.587777777777778e-05, + "loss": 0.1071, + "step": 3230 + }, + { + "epoch": 0.21197252208047104, + "grad_norm": 1.0454057455062866, + "learning_rate": 3.598888888888889e-05, + "loss": 0.1213, + "step": 3240 + }, + { + "epoch": 0.21262675825973176, + "grad_norm": 1.0699424743652344, + "learning_rate": 3.61e-05, + "loss": 0.1195, + "step": 3250 + }, + { + "epoch": 0.21328099443899248, + "grad_norm": 0.956729531288147, + "learning_rate": 3.621111111111111e-05, + "loss": 0.1157, + "step": 3260 + }, + { + "epoch": 0.2139352306182532, + "grad_norm": 1.1061006784439087, + "learning_rate": 3.632222222222223e-05, + "loss": 0.1204, + "step": 3270 + }, + { + "epoch": 0.2145894667975139, + "grad_norm": 1.060990571975708, + "learning_rate": 3.6433333333333336e-05, + "loss": 0.117, + "step": 3280 + }, + { + "epoch": 0.21524370297677461, + "grad_norm": 1.000768780708313, + "learning_rate": 3.654444444444445e-05, + "loss": 0.1206, + "step": 3290 + }, + { + "epoch": 0.21589793915603533, + "grad_norm": 0.9216058850288391, + "learning_rate": 3.665555555555556e-05, + "loss": 0.1127, + "step": 3300 + }, + { + "epoch": 0.21655217533529605, + "grad_norm": 0.9647179841995239, + "learning_rate": 3.676666666666667e-05, + "loss": 0.1204, + "step": 3310 + }, + { + "epoch": 0.21720641151455675, + "grad_norm": 1.077235221862793, + "learning_rate": 3.687777777777778e-05, + "loss": 0.1343, + "step": 3320 + }, + { + "epoch": 0.21786064769381747, + "grad_norm": 1.0138746500015259, + "learning_rate": 3.698888888888889e-05, + "loss": 0.1187, + "step": 3330 + }, + { + "epoch": 0.2185148838730782, + "grad_norm": 0.9533920884132385, + "learning_rate": 3.71e-05, + "loss": 0.1069, + "step": 3340 + }, + { + "epoch": 0.2191691200523389, + "grad_norm": 0.8543539643287659, + "learning_rate": 3.7211111111111116e-05, + "loss": 0.1246, + "step": 3350 + }, + { + "epoch": 0.2198233562315996, + "grad_norm": 0.88579922914505, + "learning_rate": 3.7322222222222224e-05, + "loss": 0.1059, + "step": 3360 + }, + { + "epoch": 0.22047759241086032, + "grad_norm": 0.9485725164413452, + "learning_rate": 3.743333333333334e-05, + "loss": 0.1074, + "step": 3370 + }, + { + "epoch": 0.22113182859012104, + "grad_norm": 1.2649180889129639, + "learning_rate": 3.754444444444445e-05, + "loss": 0.1173, + "step": 3380 + }, + { + "epoch": 0.22178606476938176, + "grad_norm": 0.9635019302368164, + "learning_rate": 3.765555555555556e-05, + "loss": 0.1057, + "step": 3390 + }, + { + "epoch": 0.22244030094864245, + "grad_norm": 1.0199334621429443, + "learning_rate": 3.7766666666666665e-05, + "loss": 0.1113, + "step": 3400 + }, + { + "epoch": 0.22309453712790317, + "grad_norm": 1.4301012754440308, + "learning_rate": 3.787777777777778e-05, + "loss": 0.1258, + "step": 3410 + }, + { + "epoch": 0.2237487733071639, + "grad_norm": 1.0970239639282227, + "learning_rate": 3.798888888888889e-05, + "loss": 0.1142, + "step": 3420 + }, + { + "epoch": 0.2244030094864246, + "grad_norm": 1.0526765584945679, + "learning_rate": 3.8100000000000005e-05, + "loss": 0.1119, + "step": 3430 + }, + { + "epoch": 0.2250572456656853, + "grad_norm": 0.9532793164253235, + "learning_rate": 3.821111111111111e-05, + "loss": 0.1206, + "step": 3440 + }, + { + "epoch": 0.22571148184494602, + "grad_norm": 1.0817819833755493, + "learning_rate": 3.832222222222223e-05, + "loss": 0.1198, + "step": 3450 + }, + { + "epoch": 0.22636571802420674, + "grad_norm": 1.0234626531600952, + "learning_rate": 3.843333333333334e-05, + "loss": 0.117, + "step": 3460 + }, + { + "epoch": 0.22701995420346746, + "grad_norm": 0.9513489007949829, + "learning_rate": 3.8544444444444445e-05, + "loss": 0.1212, + "step": 3470 + }, + { + "epoch": 0.22767419038272815, + "grad_norm": 0.869707465171814, + "learning_rate": 3.8655555555555554e-05, + "loss": 0.1036, + "step": 3480 + }, + { + "epoch": 0.22832842656198887, + "grad_norm": 0.8939826488494873, + "learning_rate": 3.876666666666667e-05, + "loss": 0.1139, + "step": 3490 + }, + { + "epoch": 0.2289826627412496, + "grad_norm": 0.8776119351387024, + "learning_rate": 3.887777777777778e-05, + "loss": 0.1133, + "step": 3500 + }, + { + "epoch": 0.2296368989205103, + "grad_norm": 1.042763590812683, + "learning_rate": 3.898888888888889e-05, + "loss": 0.106, + "step": 3510 + }, + { + "epoch": 0.230291135099771, + "grad_norm": 1.07195246219635, + "learning_rate": 3.91e-05, + "loss": 0.1103, + "step": 3520 + }, + { + "epoch": 0.23094537127903172, + "grad_norm": 0.9736886024475098, + "learning_rate": 3.921111111111112e-05, + "loss": 0.1118, + "step": 3530 + }, + { + "epoch": 0.23159960745829244, + "grad_norm": 1.0525206327438354, + "learning_rate": 3.932222222222222e-05, + "loss": 0.1201, + "step": 3540 + }, + { + "epoch": 0.23225384363755316, + "grad_norm": 0.9918597340583801, + "learning_rate": 3.9433333333333334e-05, + "loss": 0.1199, + "step": 3550 + }, + { + "epoch": 0.23290807981681386, + "grad_norm": 1.1127550601959229, + "learning_rate": 3.954444444444444e-05, + "loss": 0.1211, + "step": 3560 + }, + { + "epoch": 0.23356231599607458, + "grad_norm": 1.1743565797805786, + "learning_rate": 3.965555555555556e-05, + "loss": 0.1142, + "step": 3570 + }, + { + "epoch": 0.2342165521753353, + "grad_norm": 1.09585702419281, + "learning_rate": 3.9766666666666667e-05, + "loss": 0.1281, + "step": 3580 + }, + { + "epoch": 0.23487078835459602, + "grad_norm": 0.9824903011322021, + "learning_rate": 3.987777777777778e-05, + "loss": 0.1139, + "step": 3590 + }, + { + "epoch": 0.23552502453385674, + "grad_norm": 1.1848162412643433, + "learning_rate": 3.998888888888889e-05, + "loss": 0.1074, + "step": 3600 + }, + { + "epoch": 0.23617926071311743, + "grad_norm": 1.1790823936462402, + "learning_rate": 4.0100000000000006e-05, + "loss": 0.1141, + "step": 3610 + }, + { + "epoch": 0.23683349689237815, + "grad_norm": 0.9774477481842041, + "learning_rate": 4.021111111111111e-05, + "loss": 0.1095, + "step": 3620 + }, + { + "epoch": 0.23748773307163887, + "grad_norm": 1.0515410900115967, + "learning_rate": 4.032222222222222e-05, + "loss": 0.1156, + "step": 3630 + }, + { + "epoch": 0.2381419692508996, + "grad_norm": 0.9490527510643005, + "learning_rate": 4.043333333333333e-05, + "loss": 0.1243, + "step": 3640 + }, + { + "epoch": 0.23879620543016028, + "grad_norm": 0.9454949498176575, + "learning_rate": 4.054444444444445e-05, + "loss": 0.1138, + "step": 3650 + }, + { + "epoch": 0.239450441609421, + "grad_norm": 0.9614177346229553, + "learning_rate": 4.0655555555555555e-05, + "loss": 0.1097, + "step": 3660 + }, + { + "epoch": 0.24010467778868172, + "grad_norm": 0.9757283329963684, + "learning_rate": 4.076666666666667e-05, + "loss": 0.1151, + "step": 3670 + }, + { + "epoch": 0.24075891396794244, + "grad_norm": 0.9931315779685974, + "learning_rate": 4.087777777777778e-05, + "loss": 0.1203, + "step": 3680 + }, + { + "epoch": 0.24141315014720313, + "grad_norm": 0.9971386194229126, + "learning_rate": 4.0988888888888894e-05, + "loss": 0.1139, + "step": 3690 + }, + { + "epoch": 0.24206738632646385, + "grad_norm": 1.0257045030593872, + "learning_rate": 4.11e-05, + "loss": 0.1319, + "step": 3700 + }, + { + "epoch": 0.24272162250572457, + "grad_norm": 1.130571722984314, + "learning_rate": 4.121111111111111e-05, + "loss": 0.1159, + "step": 3710 + }, + { + "epoch": 0.2433758586849853, + "grad_norm": 0.893582284450531, + "learning_rate": 4.132222222222222e-05, + "loss": 0.1225, + "step": 3720 + }, + { + "epoch": 0.24403009486424598, + "grad_norm": 0.9939897656440735, + "learning_rate": 4.1433333333333335e-05, + "loss": 0.1016, + "step": 3730 + }, + { + "epoch": 0.2446843310435067, + "grad_norm": 0.9984415173530579, + "learning_rate": 4.1544444444444444e-05, + "loss": 0.1094, + "step": 3740 + }, + { + "epoch": 0.24533856722276742, + "grad_norm": 1.0292317867279053, + "learning_rate": 4.165555555555556e-05, + "loss": 0.1147, + "step": 3750 + }, + { + "epoch": 0.24599280340202814, + "grad_norm": 0.9206924438476562, + "learning_rate": 4.176666666666667e-05, + "loss": 0.1269, + "step": 3760 + }, + { + "epoch": 0.24664703958128883, + "grad_norm": 1.1015186309814453, + "learning_rate": 4.187777777777778e-05, + "loss": 0.1162, + "step": 3770 + }, + { + "epoch": 0.24730127576054955, + "grad_norm": 0.950849175453186, + "learning_rate": 4.198888888888889e-05, + "loss": 0.1153, + "step": 3780 + }, + { + "epoch": 0.24795551193981027, + "grad_norm": 1.0211883783340454, + "learning_rate": 4.21e-05, + "loss": 0.1259, + "step": 3790 + }, + { + "epoch": 0.248609748119071, + "grad_norm": 1.113376259803772, + "learning_rate": 4.2211111111111115e-05, + "loss": 0.1198, + "step": 3800 + }, + { + "epoch": 0.2492639842983317, + "grad_norm": 1.0689711570739746, + "learning_rate": 4.2322222222222224e-05, + "loss": 0.1199, + "step": 3810 + }, + { + "epoch": 0.2499182204775924, + "grad_norm": 1.1541417837142944, + "learning_rate": 4.243333333333334e-05, + "loss": 0.1148, + "step": 3820 + }, + { + "epoch": 0.2505724566568531, + "grad_norm": 0.9707340598106384, + "learning_rate": 4.254444444444445e-05, + "loss": 0.1298, + "step": 3830 + }, + { + "epoch": 0.2512266928361138, + "grad_norm": 1.2857780456542969, + "learning_rate": 4.2655555555555556e-05, + "loss": 0.1298, + "step": 3840 + }, + { + "epoch": 0.25188092901537457, + "grad_norm": 0.8688610792160034, + "learning_rate": 4.2766666666666665e-05, + "loss": 0.1169, + "step": 3850 + }, + { + "epoch": 0.25253516519463526, + "grad_norm": 1.0786139965057373, + "learning_rate": 4.287777777777778e-05, + "loss": 0.1279, + "step": 3860 + }, + { + "epoch": 0.25318940137389595, + "grad_norm": 1.1422791481018066, + "learning_rate": 4.298888888888889e-05, + "loss": 0.1114, + "step": 3870 + }, + { + "epoch": 0.2538436375531567, + "grad_norm": 0.9638531804084778, + "learning_rate": 4.3100000000000004e-05, + "loss": 0.1144, + "step": 3880 + }, + { + "epoch": 0.2544978737324174, + "grad_norm": 1.1392393112182617, + "learning_rate": 4.321111111111111e-05, + "loss": 0.1087, + "step": 3890 + }, + { + "epoch": 0.25515210991167814, + "grad_norm": 0.9770271182060242, + "learning_rate": 4.332222222222223e-05, + "loss": 0.1305, + "step": 3900 + }, + { + "epoch": 0.25580634609093883, + "grad_norm": 0.9613223075866699, + "learning_rate": 4.3433333333333336e-05, + "loss": 0.1093, + "step": 3910 + }, + { + "epoch": 0.2564605822701995, + "grad_norm": 0.9954821467399597, + "learning_rate": 4.354444444444445e-05, + "loss": 0.1201, + "step": 3920 + }, + { + "epoch": 0.25711481844946027, + "grad_norm": 1.0677788257598877, + "learning_rate": 4.3655555555555554e-05, + "loss": 0.1102, + "step": 3930 + }, + { + "epoch": 0.25776905462872096, + "grad_norm": 1.1282657384872437, + "learning_rate": 4.376666666666667e-05, + "loss": 0.118, + "step": 3940 + }, + { + "epoch": 0.25842329080798165, + "grad_norm": 1.0023534297943115, + "learning_rate": 4.387777777777778e-05, + "loss": 0.1144, + "step": 3950 + }, + { + "epoch": 0.2590775269872424, + "grad_norm": 0.9646608233451843, + "learning_rate": 4.398888888888889e-05, + "loss": 0.1106, + "step": 3960 + }, + { + "epoch": 0.2597317631665031, + "grad_norm": 1.139906406402588, + "learning_rate": 4.41e-05, + "loss": 0.1134, + "step": 3970 + }, + { + "epoch": 0.26038599934576384, + "grad_norm": 0.960753858089447, + "learning_rate": 4.4211111111111117e-05, + "loss": 0.1142, + "step": 3980 + }, + { + "epoch": 0.26104023552502453, + "grad_norm": 1.0693498849868774, + "learning_rate": 4.4322222222222225e-05, + "loss": 0.1091, + "step": 3990 + }, + { + "epoch": 0.2616944717042852, + "grad_norm": 1.1489149332046509, + "learning_rate": 4.443333333333334e-05, + "loss": 0.107, + "step": 4000 + }, + { + "epoch": 0.262348707883546, + "grad_norm": 0.9810730218887329, + "learning_rate": 4.454444444444444e-05, + "loss": 0.1197, + "step": 4010 + }, + { + "epoch": 0.26300294406280667, + "grad_norm": 0.9745346307754517, + "learning_rate": 4.465555555555556e-05, + "loss": 0.1158, + "step": 4020 + }, + { + "epoch": 0.2636571802420674, + "grad_norm": 1.056600570678711, + "learning_rate": 4.4766666666666666e-05, + "loss": 0.1102, + "step": 4030 + }, + { + "epoch": 0.2643114164213281, + "grad_norm": 1.090558409690857, + "learning_rate": 4.487777777777778e-05, + "loss": 0.1151, + "step": 4040 + }, + { + "epoch": 0.2649656526005888, + "grad_norm": 0.9222038984298706, + "learning_rate": 4.498888888888889e-05, + "loss": 0.1103, + "step": 4050 + }, + { + "epoch": 0.26561988877984954, + "grad_norm": 1.0304709672927856, + "learning_rate": 4.5100000000000005e-05, + "loss": 0.1169, + "step": 4060 + }, + { + "epoch": 0.26627412495911024, + "grad_norm": 1.0132149457931519, + "learning_rate": 4.5211111111111114e-05, + "loss": 0.1074, + "step": 4070 + }, + { + "epoch": 0.26692836113837093, + "grad_norm": 1.0685970783233643, + "learning_rate": 4.532222222222223e-05, + "loss": 0.1087, + "step": 4080 + }, + { + "epoch": 0.2675825973176317, + "grad_norm": 1.0187596082687378, + "learning_rate": 4.543333333333333e-05, + "loss": 0.1225, + "step": 4090 + }, + { + "epoch": 0.26823683349689237, + "grad_norm": 1.0164580345153809, + "learning_rate": 4.5544444444444446e-05, + "loss": 0.119, + "step": 4100 + }, + { + "epoch": 0.2688910696761531, + "grad_norm": 0.9964419007301331, + "learning_rate": 4.5655555555555555e-05, + "loss": 0.1077, + "step": 4110 + }, + { + "epoch": 0.2695453058554138, + "grad_norm": 1.1001616716384888, + "learning_rate": 4.576666666666667e-05, + "loss": 0.1203, + "step": 4120 + }, + { + "epoch": 0.2701995420346745, + "grad_norm": 0.984094500541687, + "learning_rate": 4.587777777777778e-05, + "loss": 0.1219, + "step": 4130 + }, + { + "epoch": 0.27085377821393525, + "grad_norm": 0.8646033406257629, + "learning_rate": 4.5988888888888894e-05, + "loss": 0.1144, + "step": 4140 + }, + { + "epoch": 0.27150801439319594, + "grad_norm": 0.8888686299324036, + "learning_rate": 4.61e-05, + "loss": 0.1216, + "step": 4150 + }, + { + "epoch": 0.27216225057245663, + "grad_norm": 0.9775686264038086, + "learning_rate": 4.621111111111111e-05, + "loss": 0.1119, + "step": 4160 + }, + { + "epoch": 0.2728164867517174, + "grad_norm": 1.1479732990264893, + "learning_rate": 4.632222222222222e-05, + "loss": 0.1159, + "step": 4170 + }, + { + "epoch": 0.27347072293097807, + "grad_norm": 0.9950860738754272, + "learning_rate": 4.6433333333333335e-05, + "loss": 0.1151, + "step": 4180 + }, + { + "epoch": 0.2741249591102388, + "grad_norm": 1.044150948524475, + "learning_rate": 4.6544444444444443e-05, + "loss": 0.1029, + "step": 4190 + }, + { + "epoch": 0.2747791952894995, + "grad_norm": 0.9072064757347107, + "learning_rate": 4.665555555555556e-05, + "loss": 0.1217, + "step": 4200 + }, + { + "epoch": 0.2754334314687602, + "grad_norm": 1.0930536985397339, + "learning_rate": 4.676666666666667e-05, + "loss": 0.1161, + "step": 4210 + }, + { + "epoch": 0.27608766764802095, + "grad_norm": 1.104744791984558, + "learning_rate": 4.687777777777778e-05, + "loss": 0.1127, + "step": 4220 + }, + { + "epoch": 0.27674190382728164, + "grad_norm": 0.8702093958854675, + "learning_rate": 4.698888888888889e-05, + "loss": 0.1117, + "step": 4230 + }, + { + "epoch": 0.27739614000654234, + "grad_norm": 1.0562191009521484, + "learning_rate": 4.71e-05, + "loss": 0.1172, + "step": 4240 + }, + { + "epoch": 0.2780503761858031, + "grad_norm": 1.0193010568618774, + "learning_rate": 4.721111111111111e-05, + "loss": 0.1117, + "step": 4250 + }, + { + "epoch": 0.2787046123650638, + "grad_norm": 1.2876737117767334, + "learning_rate": 4.7322222222222224e-05, + "loss": 0.1174, + "step": 4260 + }, + { + "epoch": 0.2793588485443245, + "grad_norm": 1.0730646848678589, + "learning_rate": 4.743333333333333e-05, + "loss": 0.1136, + "step": 4270 + }, + { + "epoch": 0.2800130847235852, + "grad_norm": 1.0272183418273926, + "learning_rate": 4.754444444444445e-05, + "loss": 0.1042, + "step": 4280 + }, + { + "epoch": 0.2806673209028459, + "grad_norm": 0.9593018293380737, + "learning_rate": 4.7655555555555556e-05, + "loss": 0.1068, + "step": 4290 + }, + { + "epoch": 0.28132155708210665, + "grad_norm": 1.1273571252822876, + "learning_rate": 4.776666666666667e-05, + "loss": 0.1183, + "step": 4300 + }, + { + "epoch": 0.28197579326136735, + "grad_norm": 0.9116014838218689, + "learning_rate": 4.787777777777778e-05, + "loss": 0.1114, + "step": 4310 + }, + { + "epoch": 0.2826300294406281, + "grad_norm": 0.9448710680007935, + "learning_rate": 4.798888888888889e-05, + "loss": 0.1155, + "step": 4320 + }, + { + "epoch": 0.2832842656198888, + "grad_norm": 0.9872847199440002, + "learning_rate": 4.8100000000000004e-05, + "loss": 0.1072, + "step": 4330 + }, + { + "epoch": 0.2839385017991495, + "grad_norm": 0.9878488779067993, + "learning_rate": 4.821111111111111e-05, + "loss": 0.1187, + "step": 4340 + }, + { + "epoch": 0.2845927379784102, + "grad_norm": 1.1943411827087402, + "learning_rate": 4.832222222222223e-05, + "loss": 0.1299, + "step": 4350 + }, + { + "epoch": 0.2852469741576709, + "grad_norm": 1.0091590881347656, + "learning_rate": 4.8433333333333336e-05, + "loss": 0.1205, + "step": 4360 + }, + { + "epoch": 0.2859012103369316, + "grad_norm": 1.0240885019302368, + "learning_rate": 4.8544444444444445e-05, + "loss": 0.11, + "step": 4370 + }, + { + "epoch": 0.28655544651619236, + "grad_norm": 0.8469128012657166, + "learning_rate": 4.865555555555556e-05, + "loss": 0.1223, + "step": 4380 + }, + { + "epoch": 0.28720968269545305, + "grad_norm": 0.932327389717102, + "learning_rate": 4.876666666666667e-05, + "loss": 0.1061, + "step": 4390 + }, + { + "epoch": 0.2878639188747138, + "grad_norm": 0.9431477785110474, + "learning_rate": 4.887777777777778e-05, + "loss": 0.1064, + "step": 4400 + }, + { + "epoch": 0.2885181550539745, + "grad_norm": 0.9958844780921936, + "learning_rate": 4.898888888888889e-05, + "loss": 0.112, + "step": 4410 + }, + { + "epoch": 0.2891723912332352, + "grad_norm": 1.146600365638733, + "learning_rate": 4.91e-05, + "loss": 0.1189, + "step": 4420 + }, + { + "epoch": 0.28982662741249593, + "grad_norm": 1.0450646877288818, + "learning_rate": 4.9211111111111116e-05, + "loss": 0.1268, + "step": 4430 + }, + { + "epoch": 0.2904808635917566, + "grad_norm": 1.2781046628952026, + "learning_rate": 4.9322222222222225e-05, + "loss": 0.123, + "step": 4440 + }, + { + "epoch": 0.2911350997710173, + "grad_norm": 0.9840906858444214, + "learning_rate": 4.943333333333334e-05, + "loss": 0.1139, + "step": 4450 + }, + { + "epoch": 0.29178933595027806, + "grad_norm": 0.9794589281082153, + "learning_rate": 4.954444444444445e-05, + "loss": 0.1176, + "step": 4460 + }, + { + "epoch": 0.29244357212953875, + "grad_norm": 1.2850165367126465, + "learning_rate": 4.965555555555556e-05, + "loss": 0.1245, + "step": 4470 + }, + { + "epoch": 0.2930978083087995, + "grad_norm": 0.9761756062507629, + "learning_rate": 4.9766666666666666e-05, + "loss": 0.1052, + "step": 4480 + }, + { + "epoch": 0.2937520444880602, + "grad_norm": 0.9061872959136963, + "learning_rate": 4.987777777777778e-05, + "loss": 0.1102, + "step": 4490 + }, + { + "epoch": 0.2944062806673209, + "grad_norm": 1.1952383518218994, + "learning_rate": 4.998888888888889e-05, + "loss": 0.1046, + "step": 4500 + }, + { + "epoch": 0.29506051684658163, + "grad_norm": 1.018266201019287, + "learning_rate": 5.0100000000000005e-05, + "loss": 0.1126, + "step": 4510 + }, + { + "epoch": 0.2957147530258423, + "grad_norm": 1.0872063636779785, + "learning_rate": 5.0211111111111107e-05, + "loss": 0.1123, + "step": 4520 + }, + { + "epoch": 0.296368989205103, + "grad_norm": 1.1259245872497559, + "learning_rate": 5.032222222222223e-05, + "loss": 0.1138, + "step": 4530 + }, + { + "epoch": 0.29702322538436376, + "grad_norm": 0.939315676689148, + "learning_rate": 5.043333333333333e-05, + "loss": 0.112, + "step": 4540 + }, + { + "epoch": 0.29767746156362446, + "grad_norm": 0.9471316933631897, + "learning_rate": 5.054444444444445e-05, + "loss": 0.116, + "step": 4550 + }, + { + "epoch": 0.2983316977428852, + "grad_norm": 1.0100241899490356, + "learning_rate": 5.0655555555555554e-05, + "loss": 0.105, + "step": 4560 + }, + { + "epoch": 0.2989859339221459, + "grad_norm": 1.010973572731018, + "learning_rate": 5.0766666666666676e-05, + "loss": 0.0968, + "step": 4570 + }, + { + "epoch": 0.2996401701014066, + "grad_norm": 1.0246537923812866, + "learning_rate": 5.087777777777778e-05, + "loss": 0.1301, + "step": 4580 + }, + { + "epoch": 0.30029440628066734, + "grad_norm": 1.0651960372924805, + "learning_rate": 5.098888888888889e-05, + "loss": 0.1184, + "step": 4590 + }, + { + "epoch": 0.30094864245992803, + "grad_norm": 1.0061384439468384, + "learning_rate": 5.11e-05, + "loss": 0.1238, + "step": 4600 + }, + { + "epoch": 0.3016028786391887, + "grad_norm": 1.0411393642425537, + "learning_rate": 5.121111111111111e-05, + "loss": 0.1066, + "step": 4610 + }, + { + "epoch": 0.30225711481844947, + "grad_norm": 1.1767691373825073, + "learning_rate": 5.1322222222222226e-05, + "loss": 0.1176, + "step": 4620 + }, + { + "epoch": 0.30291135099771016, + "grad_norm": 1.1108002662658691, + "learning_rate": 5.1433333333333334e-05, + "loss": 0.1231, + "step": 4630 + }, + { + "epoch": 0.3035655871769709, + "grad_norm": 0.9178494811058044, + "learning_rate": 5.154444444444445e-05, + "loss": 0.1219, + "step": 4640 + }, + { + "epoch": 0.3042198233562316, + "grad_norm": 1.046704649925232, + "learning_rate": 5.165555555555556e-05, + "loss": 0.108, + "step": 4650 + }, + { + "epoch": 0.3048740595354923, + "grad_norm": 1.1278116703033447, + "learning_rate": 5.1766666666666674e-05, + "loss": 0.1205, + "step": 4660 + }, + { + "epoch": 0.30552829571475304, + "grad_norm": 1.049989104270935, + "learning_rate": 5.187777777777778e-05, + "loss": 0.1195, + "step": 4670 + }, + { + "epoch": 0.30618253189401373, + "grad_norm": 1.1788783073425293, + "learning_rate": 5.1988888888888884e-05, + "loss": 0.112, + "step": 4680 + }, + { + "epoch": 0.3068367680732745, + "grad_norm": 1.094664454460144, + "learning_rate": 5.2100000000000006e-05, + "loss": 0.1104, + "step": 4690 + }, + { + "epoch": 0.30749100425253517, + "grad_norm": 1.0641672611236572, + "learning_rate": 5.221111111111111e-05, + "loss": 0.1209, + "step": 4700 + }, + { + "epoch": 0.30814524043179586, + "grad_norm": 0.9625852108001709, + "learning_rate": 5.232222222222223e-05, + "loss": 0.1171, + "step": 4710 + }, + { + "epoch": 0.3087994766110566, + "grad_norm": 0.9965457916259766, + "learning_rate": 5.243333333333333e-05, + "loss": 0.1152, + "step": 4720 + }, + { + "epoch": 0.3094537127903173, + "grad_norm": 1.1874282360076904, + "learning_rate": 5.2544444444444454e-05, + "loss": 0.1069, + "step": 4730 + }, + { + "epoch": 0.310107948969578, + "grad_norm": 1.071972370147705, + "learning_rate": 5.2655555555555555e-05, + "loss": 0.1207, + "step": 4740 + }, + { + "epoch": 0.31076218514883874, + "grad_norm": 1.1899669170379639, + "learning_rate": 5.2766666666666664e-05, + "loss": 0.1003, + "step": 4750 + }, + { + "epoch": 0.31141642132809944, + "grad_norm": 1.1113231182098389, + "learning_rate": 5.287777777777778e-05, + "loss": 0.1101, + "step": 4760 + }, + { + "epoch": 0.3120706575073602, + "grad_norm": 0.9851582646369934, + "learning_rate": 5.298888888888889e-05, + "loss": 0.1246, + "step": 4770 + }, + { + "epoch": 0.3127248936866209, + "grad_norm": 1.1461477279663086, + "learning_rate": 5.31e-05, + "loss": 0.1136, + "step": 4780 + }, + { + "epoch": 0.31337912986588157, + "grad_norm": 1.0127012729644775, + "learning_rate": 5.321111111111111e-05, + "loss": 0.1082, + "step": 4790 + }, + { + "epoch": 0.3140333660451423, + "grad_norm": 1.0114226341247559, + "learning_rate": 5.332222222222223e-05, + "loss": 0.123, + "step": 4800 + }, + { + "epoch": 0.314687602224403, + "grad_norm": 1.0806419849395752, + "learning_rate": 5.3433333333333336e-05, + "loss": 0.1239, + "step": 4810 + }, + { + "epoch": 0.3153418384036637, + "grad_norm": 0.9436666369438171, + "learning_rate": 5.3544444444444444e-05, + "loss": 0.1127, + "step": 4820 + }, + { + "epoch": 0.31599607458292445, + "grad_norm": 0.9551024436950684, + "learning_rate": 5.365555555555556e-05, + "loss": 0.1146, + "step": 4830 + }, + { + "epoch": 0.31665031076218514, + "grad_norm": 0.9468205571174622, + "learning_rate": 5.376666666666667e-05, + "loss": 0.1099, + "step": 4840 + }, + { + "epoch": 0.3173045469414459, + "grad_norm": 1.0313448905944824, + "learning_rate": 5.387777777777778e-05, + "loss": 0.1108, + "step": 4850 + }, + { + "epoch": 0.3179587831207066, + "grad_norm": 1.025217890739441, + "learning_rate": 5.398888888888889e-05, + "loss": 0.1245, + "step": 4860 + }, + { + "epoch": 0.31861301929996727, + "grad_norm": 0.9557814002037048, + "learning_rate": 5.410000000000001e-05, + "loss": 0.1099, + "step": 4870 + }, + { + "epoch": 0.319267255479228, + "grad_norm": 1.0116934776306152, + "learning_rate": 5.4211111111111116e-05, + "loss": 0.1068, + "step": 4880 + }, + { + "epoch": 0.3199214916584887, + "grad_norm": 1.0298535823822021, + "learning_rate": 5.432222222222223e-05, + "loss": 0.1261, + "step": 4890 + }, + { + "epoch": 0.3205757278377494, + "grad_norm": 1.6107592582702637, + "learning_rate": 5.443333333333333e-05, + "loss": 0.1166, + "step": 4900 + }, + { + "epoch": 0.32122996401701015, + "grad_norm": 0.9493816494941711, + "learning_rate": 5.454444444444444e-05, + "loss": 0.1025, + "step": 4910 + }, + { + "epoch": 0.32188420019627084, + "grad_norm": 1.0891478061676025, + "learning_rate": 5.465555555555556e-05, + "loss": 0.1259, + "step": 4920 + }, + { + "epoch": 0.3225384363755316, + "grad_norm": 1.0789752006530762, + "learning_rate": 5.4766666666666665e-05, + "loss": 0.1092, + "step": 4930 + }, + { + "epoch": 0.3231926725547923, + "grad_norm": 0.8456171154975891, + "learning_rate": 5.487777777777778e-05, + "loss": 0.1094, + "step": 4940 + }, + { + "epoch": 0.323846908734053, + "grad_norm": 1.1265510320663452, + "learning_rate": 5.498888888888889e-05, + "loss": 0.1105, + "step": 4950 + }, + { + "epoch": 0.3245011449133137, + "grad_norm": 1.0033330917358398, + "learning_rate": 5.5100000000000004e-05, + "loss": 0.1138, + "step": 4960 + }, + { + "epoch": 0.3251553810925744, + "grad_norm": 0.9532373547554016, + "learning_rate": 5.521111111111111e-05, + "loss": 0.1134, + "step": 4970 + }, + { + "epoch": 0.3258096172718351, + "grad_norm": 1.070513367652893, + "learning_rate": 5.532222222222222e-05, + "loss": 0.1054, + "step": 4980 + }, + { + "epoch": 0.32646385345109585, + "grad_norm": 1.1604113578796387, + "learning_rate": 5.543333333333334e-05, + "loss": 0.1089, + "step": 4990 + }, + { + "epoch": 0.32711808963035655, + "grad_norm": 0.9671838283538818, + "learning_rate": 5.5544444444444445e-05, + "loss": 0.1114, + "step": 5000 + }, + { + "epoch": 0.3277723258096173, + "grad_norm": 1.0246025323867798, + "learning_rate": 5.565555555555556e-05, + "loss": 0.1155, + "step": 5010 + }, + { + "epoch": 0.328426561988878, + "grad_norm": 1.1723190546035767, + "learning_rate": 5.576666666666667e-05, + "loss": 0.1086, + "step": 5020 + }, + { + "epoch": 0.3290807981681387, + "grad_norm": 1.158227801322937, + "learning_rate": 5.5877777777777785e-05, + "loss": 0.1142, + "step": 5030 + }, + { + "epoch": 0.3297350343473994, + "grad_norm": 1.070424199104309, + "learning_rate": 5.598888888888889e-05, + "loss": 0.1075, + "step": 5040 + }, + { + "epoch": 0.3303892705266601, + "grad_norm": 1.1170357465744019, + "learning_rate": 5.610000000000001e-05, + "loss": 0.1242, + "step": 5050 + }, + { + "epoch": 0.33104350670592086, + "grad_norm": 1.009589672088623, + "learning_rate": 5.621111111111112e-05, + "loss": 0.1076, + "step": 5060 + }, + { + "epoch": 0.33169774288518156, + "grad_norm": 1.002823829650879, + "learning_rate": 5.632222222222222e-05, + "loss": 0.1146, + "step": 5070 + }, + { + "epoch": 0.33235197906444225, + "grad_norm": 0.9945969581604004, + "learning_rate": 5.643333333333334e-05, + "loss": 0.1155, + "step": 5080 + }, + { + "epoch": 0.333006215243703, + "grad_norm": 0.9207086563110352, + "learning_rate": 5.654444444444444e-05, + "loss": 0.116, + "step": 5090 + }, + { + "epoch": 0.3336604514229637, + "grad_norm": 0.8761827945709229, + "learning_rate": 5.6655555555555565e-05, + "loss": 0.1119, + "step": 5100 + }, + { + "epoch": 0.3343146876022244, + "grad_norm": 1.128303050994873, + "learning_rate": 5.6766666666666666e-05, + "loss": 0.1134, + "step": 5110 + }, + { + "epoch": 0.33496892378148513, + "grad_norm": 0.9229829907417297, + "learning_rate": 5.687777777777779e-05, + "loss": 0.1085, + "step": 5120 + }, + { + "epoch": 0.3356231599607458, + "grad_norm": 1.0717655420303345, + "learning_rate": 5.698888888888889e-05, + "loss": 0.1154, + "step": 5130 + }, + { + "epoch": 0.33627739614000657, + "grad_norm": 1.0167633295059204, + "learning_rate": 5.71e-05, + "loss": 0.1185, + "step": 5140 + }, + { + "epoch": 0.33693163231926726, + "grad_norm": 1.1209583282470703, + "learning_rate": 5.7211111111111114e-05, + "loss": 0.1182, + "step": 5150 + }, + { + "epoch": 0.33758586849852795, + "grad_norm": 0.9364834427833557, + "learning_rate": 5.732222222222222e-05, + "loss": 0.1024, + "step": 5160 + }, + { + "epoch": 0.3382401046777887, + "grad_norm": 1.077996850013733, + "learning_rate": 5.743333333333334e-05, + "loss": 0.1076, + "step": 5170 + }, + { + "epoch": 0.3388943408570494, + "grad_norm": 0.946433424949646, + "learning_rate": 5.7544444444444447e-05, + "loss": 0.1176, + "step": 5180 + }, + { + "epoch": 0.3395485770363101, + "grad_norm": 0.9579144716262817, + "learning_rate": 5.765555555555556e-05, + "loss": 0.1077, + "step": 5190 + }, + { + "epoch": 0.34020281321557083, + "grad_norm": 0.9725630283355713, + "learning_rate": 5.776666666666667e-05, + "loss": 0.1123, + "step": 5200 + }, + { + "epoch": 0.3408570493948315, + "grad_norm": 1.1612457036972046, + "learning_rate": 5.787777777777777e-05, + "loss": 0.1079, + "step": 5210 + }, + { + "epoch": 0.34151128557409227, + "grad_norm": 1.1368802785873413, + "learning_rate": 5.7988888888888894e-05, + "loss": 0.1145, + "step": 5220 + }, + { + "epoch": 0.34216552175335296, + "grad_norm": 1.130491852760315, + "learning_rate": 5.8099999999999996e-05, + "loss": 0.1127, + "step": 5230 + }, + { + "epoch": 0.34281975793261366, + "grad_norm": 1.1472586393356323, + "learning_rate": 5.821111111111112e-05, + "loss": 0.1067, + "step": 5240 + }, + { + "epoch": 0.3434739941118744, + "grad_norm": 1.2415672540664673, + "learning_rate": 5.832222222222222e-05, + "loss": 0.1035, + "step": 5250 + }, + { + "epoch": 0.3441282302911351, + "grad_norm": 0.952216625213623, + "learning_rate": 5.843333333333334e-05, + "loss": 0.1211, + "step": 5260 + }, + { + "epoch": 0.3447824664703958, + "grad_norm": 0.9842739701271057, + "learning_rate": 5.8544444444444444e-05, + "loss": 0.1081, + "step": 5270 + }, + { + "epoch": 0.34543670264965654, + "grad_norm": 0.921035885810852, + "learning_rate": 5.8655555555555566e-05, + "loss": 0.1063, + "step": 5280 + }, + { + "epoch": 0.3460909388289172, + "grad_norm": 1.028622031211853, + "learning_rate": 5.876666666666667e-05, + "loss": 0.1114, + "step": 5290 + }, + { + "epoch": 0.346745175008178, + "grad_norm": 1.103540062904358, + "learning_rate": 5.8877777777777776e-05, + "loss": 0.1144, + "step": 5300 + }, + { + "epoch": 0.34739941118743867, + "grad_norm": 0.9521011710166931, + "learning_rate": 5.898888888888889e-05, + "loss": 0.1104, + "step": 5310 + }, + { + "epoch": 0.34805364736669936, + "grad_norm": 1.1249808073043823, + "learning_rate": 5.91e-05, + "loss": 0.1235, + "step": 5320 + }, + { + "epoch": 0.3487078835459601, + "grad_norm": 1.0736548900604248, + "learning_rate": 5.9211111111111115e-05, + "loss": 0.1114, + "step": 5330 + }, + { + "epoch": 0.3493621197252208, + "grad_norm": 0.9119420647621155, + "learning_rate": 5.9322222222222224e-05, + "loss": 0.1234, + "step": 5340 + }, + { + "epoch": 0.3500163559044815, + "grad_norm": 0.9707286953926086, + "learning_rate": 5.943333333333334e-05, + "loss": 0.1139, + "step": 5350 + }, + { + "epoch": 0.35067059208374224, + "grad_norm": 0.8778756260871887, + "learning_rate": 5.954444444444445e-05, + "loss": 0.1119, + "step": 5360 + }, + { + "epoch": 0.35132482826300293, + "grad_norm": 0.9317549467086792, + "learning_rate": 5.9655555555555556e-05, + "loss": 0.1162, + "step": 5370 + }, + { + "epoch": 0.3519790644422637, + "grad_norm": 0.9935291409492493, + "learning_rate": 5.976666666666667e-05, + "loss": 0.115, + "step": 5380 + }, + { + "epoch": 0.35263330062152437, + "grad_norm": 0.8970289826393127, + "learning_rate": 5.987777777777778e-05, + "loss": 0.1094, + "step": 5390 + }, + { + "epoch": 0.35328753680078506, + "grad_norm": 1.034538745880127, + "learning_rate": 5.9988888888888895e-05, + "loss": 0.1097, + "step": 5400 + }, + { + "epoch": 0.3539417729800458, + "grad_norm": 0.9625052809715271, + "learning_rate": 6.0100000000000004e-05, + "loss": 0.1254, + "step": 5410 + }, + { + "epoch": 0.3545960091593065, + "grad_norm": 1.0226140022277832, + "learning_rate": 6.021111111111112e-05, + "loss": 0.1089, + "step": 5420 + }, + { + "epoch": 0.35525024533856725, + "grad_norm": 0.9254044890403748, + "learning_rate": 6.032222222222222e-05, + "loss": 0.1233, + "step": 5430 + }, + { + "epoch": 0.35590448151782794, + "grad_norm": 0.9888849854469299, + "learning_rate": 6.043333333333333e-05, + "loss": 0.1031, + "step": 5440 + }, + { + "epoch": 0.35655871769708863, + "grad_norm": 0.9793877601623535, + "learning_rate": 6.0544444444444445e-05, + "loss": 0.1127, + "step": 5450 + }, + { + "epoch": 0.3572129538763494, + "grad_norm": 0.9079664945602417, + "learning_rate": 6.0655555555555553e-05, + "loss": 0.1088, + "step": 5460 + }, + { + "epoch": 0.3578671900556101, + "grad_norm": 1.0613657236099243, + "learning_rate": 6.076666666666667e-05, + "loss": 0.1047, + "step": 5470 + }, + { + "epoch": 0.35852142623487077, + "grad_norm": 1.1257914304733276, + "learning_rate": 6.087777777777778e-05, + "loss": 0.1197, + "step": 5480 + }, + { + "epoch": 0.3591756624141315, + "grad_norm": 0.9039554595947266, + "learning_rate": 6.098888888888889e-05, + "loss": 0.1103, + "step": 5490 + }, + { + "epoch": 0.3598298985933922, + "grad_norm": 1.026854395866394, + "learning_rate": 6.110000000000001e-05, + "loss": 0.1116, + "step": 5500 + }, + { + "epoch": 0.36048413477265295, + "grad_norm": 1.1896244287490845, + "learning_rate": 6.121111111111112e-05, + "loss": 0.1136, + "step": 5510 + }, + { + "epoch": 0.36113837095191365, + "grad_norm": 0.9056310057640076, + "learning_rate": 6.132222222222223e-05, + "loss": 0.1134, + "step": 5520 + }, + { + "epoch": 0.36179260713117434, + "grad_norm": 1.078660249710083, + "learning_rate": 6.143333333333333e-05, + "loss": 0.1063, + "step": 5530 + }, + { + "epoch": 0.3624468433104351, + "grad_norm": 1.1922471523284912, + "learning_rate": 6.154444444444446e-05, + "loss": 0.1116, + "step": 5540 + }, + { + "epoch": 0.3631010794896958, + "grad_norm": 1.298832893371582, + "learning_rate": 6.165555555555556e-05, + "loss": 0.1252, + "step": 5550 + }, + { + "epoch": 0.36375531566895647, + "grad_norm": 0.9783083200454712, + "learning_rate": 6.176666666666667e-05, + "loss": 0.1143, + "step": 5560 + }, + { + "epoch": 0.3644095518482172, + "grad_norm": 1.0835789442062378, + "learning_rate": 6.187777777777777e-05, + "loss": 0.1189, + "step": 5570 + }, + { + "epoch": 0.3650637880274779, + "grad_norm": 1.1260126829147339, + "learning_rate": 6.198888888888889e-05, + "loss": 0.1092, + "step": 5580 + }, + { + "epoch": 0.36571802420673866, + "grad_norm": 1.1071107387542725, + "learning_rate": 6.21e-05, + "loss": 0.1108, + "step": 5590 + }, + { + "epoch": 0.36637226038599935, + "grad_norm": 1.383540153503418, + "learning_rate": 6.221111111111111e-05, + "loss": 0.1208, + "step": 5600 + }, + { + "epoch": 0.36702649656526004, + "grad_norm": 1.0578371286392212, + "learning_rate": 6.232222222222222e-05, + "loss": 0.1105, + "step": 5610 + }, + { + "epoch": 0.3676807327445208, + "grad_norm": 0.9242531061172485, + "learning_rate": 6.243333333333334e-05, + "loss": 0.1026, + "step": 5620 + }, + { + "epoch": 0.3683349689237815, + "grad_norm": 1.0212069749832153, + "learning_rate": 6.254444444444445e-05, + "loss": 0.1121, + "step": 5630 + }, + { + "epoch": 0.3689892051030422, + "grad_norm": 1.075881004333496, + "learning_rate": 6.265555555555555e-05, + "loss": 0.107, + "step": 5640 + }, + { + "epoch": 0.3696434412823029, + "grad_norm": 0.993894100189209, + "learning_rate": 6.276666666666667e-05, + "loss": 0.1202, + "step": 5650 + }, + { + "epoch": 0.3702976774615636, + "grad_norm": 1.1158571243286133, + "learning_rate": 6.287777777777779e-05, + "loss": 0.1151, + "step": 5660 + }, + { + "epoch": 0.37095191364082436, + "grad_norm": 0.9299028515815735, + "learning_rate": 6.29888888888889e-05, + "loss": 0.1203, + "step": 5670 + }, + { + "epoch": 0.37160614982008505, + "grad_norm": 0.9454100131988525, + "learning_rate": 6.31e-05, + "loss": 0.1105, + "step": 5680 + }, + { + "epoch": 0.37226038599934574, + "grad_norm": 0.9291905164718628, + "learning_rate": 6.32111111111111e-05, + "loss": 0.1141, + "step": 5690 + }, + { + "epoch": 0.3729146221786065, + "grad_norm": 0.9003360867500305, + "learning_rate": 6.332222222222223e-05, + "loss": 0.1115, + "step": 5700 + }, + { + "epoch": 0.3735688583578672, + "grad_norm": 1.0188775062561035, + "learning_rate": 6.343333333333333e-05, + "loss": 0.1199, + "step": 5710 + }, + { + "epoch": 0.3742230945371279, + "grad_norm": 0.9012176990509033, + "learning_rate": 6.354444444444445e-05, + "loss": 0.1091, + "step": 5720 + }, + { + "epoch": 0.3748773307163886, + "grad_norm": 0.977641224861145, + "learning_rate": 6.365555555555555e-05, + "loss": 0.1138, + "step": 5730 + }, + { + "epoch": 0.3755315668956493, + "grad_norm": 0.9474180340766907, + "learning_rate": 6.376666666666668e-05, + "loss": 0.1085, + "step": 5740 + }, + { + "epoch": 0.37618580307491006, + "grad_norm": 0.8891827464103699, + "learning_rate": 6.387777777777778e-05, + "loss": 0.1183, + "step": 5750 + }, + { + "epoch": 0.37684003925417076, + "grad_norm": 1.0365437269210815, + "learning_rate": 6.398888888888888e-05, + "loss": 0.1166, + "step": 5760 + }, + { + "epoch": 0.37749427543343145, + "grad_norm": 0.8813347220420837, + "learning_rate": 6.41e-05, + "loss": 0.1007, + "step": 5770 + }, + { + "epoch": 0.3781485116126922, + "grad_norm": 0.888383150100708, + "learning_rate": 6.421111111111111e-05, + "loss": 0.1133, + "step": 5780 + }, + { + "epoch": 0.3788027477919529, + "grad_norm": 1.1097888946533203, + "learning_rate": 6.432222222222223e-05, + "loss": 0.1318, + "step": 5790 + }, + { + "epoch": 0.37945698397121363, + "grad_norm": 0.928402304649353, + "learning_rate": 6.443333333333333e-05, + "loss": 0.1141, + "step": 5800 + }, + { + "epoch": 0.3801112201504743, + "grad_norm": 1.0633147954940796, + "learning_rate": 6.454444444444445e-05, + "loss": 0.1198, + "step": 5810 + }, + { + "epoch": 0.380765456329735, + "grad_norm": 1.11026132106781, + "learning_rate": 6.465555555555556e-05, + "loss": 0.1234, + "step": 5820 + }, + { + "epoch": 0.38141969250899577, + "grad_norm": 0.9502913951873779, + "learning_rate": 6.476666666666666e-05, + "loss": 0.1043, + "step": 5830 + }, + { + "epoch": 0.38207392868825646, + "grad_norm": 1.1175204515457153, + "learning_rate": 6.487777777777778e-05, + "loss": 0.1091, + "step": 5840 + }, + { + "epoch": 0.38272816486751715, + "grad_norm": 0.9505729079246521, + "learning_rate": 6.498888888888888e-05, + "loss": 0.1067, + "step": 5850 + }, + { + "epoch": 0.3833824010467779, + "grad_norm": 1.11201810836792, + "learning_rate": 6.510000000000001e-05, + "loss": 0.1123, + "step": 5860 + }, + { + "epoch": 0.3840366372260386, + "grad_norm": 1.0022423267364502, + "learning_rate": 6.521111111111111e-05, + "loss": 0.1076, + "step": 5870 + }, + { + "epoch": 0.38469087340529934, + "grad_norm": 1.1039308309555054, + "learning_rate": 6.532222222222223e-05, + "loss": 0.1086, + "step": 5880 + }, + { + "epoch": 0.38534510958456003, + "grad_norm": 0.970061182975769, + "learning_rate": 6.543333333333333e-05, + "loss": 0.1192, + "step": 5890 + }, + { + "epoch": 0.3859993457638207, + "grad_norm": 1.0175315141677856, + "learning_rate": 6.554444444444446e-05, + "loss": 0.1301, + "step": 5900 + }, + { + "epoch": 0.38665358194308147, + "grad_norm": 1.1553181409835815, + "learning_rate": 6.565555555555556e-05, + "loss": 0.1175, + "step": 5910 + }, + { + "epoch": 0.38730781812234216, + "grad_norm": 0.9421599507331848, + "learning_rate": 6.576666666666666e-05, + "loss": 0.1147, + "step": 5920 + }, + { + "epoch": 0.38796205430160285, + "grad_norm": 1.103137731552124, + "learning_rate": 6.587777777777778e-05, + "loss": 0.1268, + "step": 5930 + }, + { + "epoch": 0.3886162904808636, + "grad_norm": 1.0151023864746094, + "learning_rate": 6.598888888888889e-05, + "loss": 0.1103, + "step": 5940 + }, + { + "epoch": 0.3892705266601243, + "grad_norm": 1.017514944076538, + "learning_rate": 6.610000000000001e-05, + "loss": 0.1168, + "step": 5950 + }, + { + "epoch": 0.38992476283938504, + "grad_norm": 0.9031379222869873, + "learning_rate": 6.621111111111111e-05, + "loss": 0.1191, + "step": 5960 + }, + { + "epoch": 0.39057899901864573, + "grad_norm": 1.1167620420455933, + "learning_rate": 6.632222222222222e-05, + "loss": 0.1032, + "step": 5970 + }, + { + "epoch": 0.3912332351979064, + "grad_norm": 0.9715782403945923, + "learning_rate": 6.643333333333334e-05, + "loss": 0.1231, + "step": 5980 + }, + { + "epoch": 0.3918874713771672, + "grad_norm": 1.0662617683410645, + "learning_rate": 6.654444444444444e-05, + "loss": 0.1122, + "step": 5990 + }, + { + "epoch": 0.39254170755642787, + "grad_norm": 0.9010646343231201, + "learning_rate": 6.665555555555556e-05, + "loss": 0.1109, + "step": 6000 + }, + { + "epoch": 0.39319594373568856, + "grad_norm": 0.9664989709854126, + "learning_rate": 6.676666666666667e-05, + "loss": 0.1195, + "step": 6010 + }, + { + "epoch": 0.3938501799149493, + "grad_norm": 1.0604381561279297, + "learning_rate": 6.687777777777779e-05, + "loss": 0.1013, + "step": 6020 + }, + { + "epoch": 0.39450441609421, + "grad_norm": 1.0185221433639526, + "learning_rate": 6.698888888888889e-05, + "loss": 0.1087, + "step": 6030 + }, + { + "epoch": 0.39515865227347075, + "grad_norm": 1.0667742490768433, + "learning_rate": 6.71e-05, + "loss": 0.1108, + "step": 6040 + }, + { + "epoch": 0.39581288845273144, + "grad_norm": 0.9968790411949158, + "learning_rate": 6.721111111111112e-05, + "loss": 0.1254, + "step": 6050 + }, + { + "epoch": 0.39646712463199213, + "grad_norm": 0.9908486604690552, + "learning_rate": 6.732222222222224e-05, + "loss": 0.1098, + "step": 6060 + }, + { + "epoch": 0.3971213608112529, + "grad_norm": 1.073398470878601, + "learning_rate": 6.743333333333334e-05, + "loss": 0.1034, + "step": 6070 + }, + { + "epoch": 0.39777559699051357, + "grad_norm": 0.9481722712516785, + "learning_rate": 6.754444444444444e-05, + "loss": 0.1141, + "step": 6080 + }, + { + "epoch": 0.39842983316977426, + "grad_norm": 1.1808781623840332, + "learning_rate": 6.765555555555555e-05, + "loss": 0.113, + "step": 6090 + }, + { + "epoch": 0.399084069349035, + "grad_norm": 1.1544404029846191, + "learning_rate": 6.776666666666667e-05, + "loss": 0.1118, + "step": 6100 + }, + { + "epoch": 0.3997383055282957, + "grad_norm": 1.1222143173217773, + "learning_rate": 6.787777777777778e-05, + "loss": 0.1001, + "step": 6110 + }, + { + "epoch": 0.40039254170755645, + "grad_norm": 0.9140275716781616, + "learning_rate": 6.798888888888889e-05, + "loss": 0.1146, + "step": 6120 + }, + { + "epoch": 0.40104677788681714, + "grad_norm": 0.9539370536804199, + "learning_rate": 6.81e-05, + "loss": 0.1135, + "step": 6130 + }, + { + "epoch": 0.40170101406607783, + "grad_norm": 1.0647083520889282, + "learning_rate": 6.821111111111112e-05, + "loss": 0.1102, + "step": 6140 + }, + { + "epoch": 0.4023552502453386, + "grad_norm": 0.9687780141830444, + "learning_rate": 6.832222222222222e-05, + "loss": 0.1164, + "step": 6150 + }, + { + "epoch": 0.4030094864245993, + "grad_norm": 1.0673967599868774, + "learning_rate": 6.843333333333333e-05, + "loss": 0.1166, + "step": 6160 + }, + { + "epoch": 0.40366372260386, + "grad_norm": 1.0471779108047485, + "learning_rate": 6.854444444444445e-05, + "loss": 0.11, + "step": 6170 + }, + { + "epoch": 0.4043179587831207, + "grad_norm": 1.0822899341583252, + "learning_rate": 6.865555555555556e-05, + "loss": 0.1246, + "step": 6180 + }, + { + "epoch": 0.4049721949623814, + "grad_norm": 0.7718848586082458, + "learning_rate": 6.876666666666667e-05, + "loss": 0.1116, + "step": 6190 + }, + { + "epoch": 0.40562643114164215, + "grad_norm": 1.1657593250274658, + "learning_rate": 6.887777777777778e-05, + "loss": 0.1087, + "step": 6200 + }, + { + "epoch": 0.40628066732090284, + "grad_norm": 0.8355668783187866, + "learning_rate": 6.89888888888889e-05, + "loss": 0.1003, + "step": 6210 + }, + { + "epoch": 0.40693490350016354, + "grad_norm": 0.9746671319007874, + "learning_rate": 6.91e-05, + "loss": 0.1282, + "step": 6220 + }, + { + "epoch": 0.4075891396794243, + "grad_norm": 1.1945754289627075, + "learning_rate": 6.921111111111111e-05, + "loss": 0.1219, + "step": 6230 + }, + { + "epoch": 0.408243375858685, + "grad_norm": 1.1995118856430054, + "learning_rate": 6.932222222222222e-05, + "loss": 0.1212, + "step": 6240 + }, + { + "epoch": 0.4088976120379457, + "grad_norm": 1.1618099212646484, + "learning_rate": 6.943333333333335e-05, + "loss": 0.117, + "step": 6250 + }, + { + "epoch": 0.4095518482172064, + "grad_norm": 0.8268874883651733, + "learning_rate": 6.954444444444445e-05, + "loss": 0.1141, + "step": 6260 + }, + { + "epoch": 0.4102060843964671, + "grad_norm": 0.8862093091011047, + "learning_rate": 6.965555555555556e-05, + "loss": 0.122, + "step": 6270 + }, + { + "epoch": 0.41086032057572786, + "grad_norm": 1.2067736387252808, + "learning_rate": 6.976666666666666e-05, + "loss": 0.1089, + "step": 6280 + }, + { + "epoch": 0.41151455675498855, + "grad_norm": 1.0685234069824219, + "learning_rate": 6.987777777777779e-05, + "loss": 0.1147, + "step": 6290 + }, + { + "epoch": 0.41216879293424924, + "grad_norm": 0.9467248916625977, + "learning_rate": 6.99888888888889e-05, + "loss": 0.1282, + "step": 6300 + }, + { + "epoch": 0.41282302911351, + "grad_norm": 1.2121899127960205, + "learning_rate": 7.01e-05, + "loss": 0.1052, + "step": 6310 + }, + { + "epoch": 0.4134772652927707, + "grad_norm": 1.1744288206100464, + "learning_rate": 7.021111111111111e-05, + "loss": 0.1119, + "step": 6320 + }, + { + "epoch": 0.4141315014720314, + "grad_norm": 1.219565749168396, + "learning_rate": 7.032222222222223e-05, + "loss": 0.1203, + "step": 6330 + }, + { + "epoch": 0.4147857376512921, + "grad_norm": 0.9562080502510071, + "learning_rate": 7.043333333333334e-05, + "loss": 0.1157, + "step": 6340 + }, + { + "epoch": 0.4154399738305528, + "grad_norm": 1.188444972038269, + "learning_rate": 7.054444444444444e-05, + "loss": 0.1153, + "step": 6350 + }, + { + "epoch": 0.41609421000981356, + "grad_norm": 1.1564666032791138, + "learning_rate": 7.065555555555556e-05, + "loss": 0.1135, + "step": 6360 + }, + { + "epoch": 0.41674844618907425, + "grad_norm": 0.9498192071914673, + "learning_rate": 7.076666666666667e-05, + "loss": 0.119, + "step": 6370 + }, + { + "epoch": 0.41740268236833494, + "grad_norm": 1.0807279348373413, + "learning_rate": 7.087777777777778e-05, + "loss": 0.1176, + "step": 6380 + }, + { + "epoch": 0.4180569185475957, + "grad_norm": 1.169202446937561, + "learning_rate": 7.098888888888889e-05, + "loss": 0.1206, + "step": 6390 + }, + { + "epoch": 0.4187111547268564, + "grad_norm": 1.1617976427078247, + "learning_rate": 7.11e-05, + "loss": 0.1017, + "step": 6400 + }, + { + "epoch": 0.41936539090611713, + "grad_norm": 0.8993281126022339, + "learning_rate": 7.121111111111112e-05, + "loss": 0.1053, + "step": 6410 + }, + { + "epoch": 0.4200196270853778, + "grad_norm": 1.145491600036621, + "learning_rate": 7.132222222222222e-05, + "loss": 0.1247, + "step": 6420 + }, + { + "epoch": 0.4206738632646385, + "grad_norm": 1.1302369832992554, + "learning_rate": 7.143333333333334e-05, + "loss": 0.1165, + "step": 6430 + }, + { + "epoch": 0.42132809944389926, + "grad_norm": 1.070312261581421, + "learning_rate": 7.154444444444444e-05, + "loss": 0.112, + "step": 6440 + }, + { + "epoch": 0.42198233562315995, + "grad_norm": 1.261004090309143, + "learning_rate": 7.165555555555556e-05, + "loss": 0.1202, + "step": 6450 + }, + { + "epoch": 0.42263657180242065, + "grad_norm": 1.149681806564331, + "learning_rate": 7.176666666666667e-05, + "loss": 0.1116, + "step": 6460 + }, + { + "epoch": 0.4232908079816814, + "grad_norm": 1.0995161533355713, + "learning_rate": 7.187777777777777e-05, + "loss": 0.1068, + "step": 6470 + }, + { + "epoch": 0.4239450441609421, + "grad_norm": 1.2060518264770508, + "learning_rate": 7.198888888888889e-05, + "loss": 0.1178, + "step": 6480 + }, + { + "epoch": 0.42459928034020283, + "grad_norm": 0.885783851146698, + "learning_rate": 7.21e-05, + "loss": 0.1114, + "step": 6490 + }, + { + "epoch": 0.4252535165194635, + "grad_norm": 1.0212535858154297, + "learning_rate": 7.221111111111112e-05, + "loss": 0.102, + "step": 6500 + }, + { + "epoch": 0.4259077526987242, + "grad_norm": 0.9369866847991943, + "learning_rate": 7.232222222222222e-05, + "loss": 0.1018, + "step": 6510 + }, + { + "epoch": 0.42656198887798497, + "grad_norm": 1.102473497390747, + "learning_rate": 7.243333333333334e-05, + "loss": 0.1163, + "step": 6520 + }, + { + "epoch": 0.42721622505724566, + "grad_norm": 0.9343268871307373, + "learning_rate": 7.254444444444445e-05, + "loss": 0.1118, + "step": 6530 + }, + { + "epoch": 0.4278704612365064, + "grad_norm": 0.9922140836715698, + "learning_rate": 7.265555555555555e-05, + "loss": 0.1312, + "step": 6540 + }, + { + "epoch": 0.4285246974157671, + "grad_norm": 0.92384934425354, + "learning_rate": 7.276666666666667e-05, + "loss": 0.1152, + "step": 6550 + }, + { + "epoch": 0.4291789335950278, + "grad_norm": 0.8126742243766785, + "learning_rate": 7.287777777777778e-05, + "loss": 0.1132, + "step": 6560 + }, + { + "epoch": 0.42983316977428854, + "grad_norm": 0.8990784287452698, + "learning_rate": 7.29888888888889e-05, + "loss": 0.1172, + "step": 6570 + }, + { + "epoch": 0.43048740595354923, + "grad_norm": 0.7989428043365479, + "learning_rate": 7.31e-05, + "loss": 0.1146, + "step": 6580 + }, + { + "epoch": 0.4311416421328099, + "grad_norm": 1.0440112352371216, + "learning_rate": 7.321111111111112e-05, + "loss": 0.1257, + "step": 6590 + }, + { + "epoch": 0.43179587831207067, + "grad_norm": 0.9922072887420654, + "learning_rate": 7.332222222222223e-05, + "loss": 0.116, + "step": 6600 + }, + { + "epoch": 0.43245011449133136, + "grad_norm": 0.951362133026123, + "learning_rate": 7.343333333333333e-05, + "loss": 0.1205, + "step": 6610 + }, + { + "epoch": 0.4331043506705921, + "grad_norm": 1.096359133720398, + "learning_rate": 7.354444444444445e-05, + "loss": 0.1092, + "step": 6620 + }, + { + "epoch": 0.4337585868498528, + "grad_norm": 0.9648379683494568, + "learning_rate": 7.365555555555555e-05, + "loss": 0.1121, + "step": 6630 + }, + { + "epoch": 0.4344128230291135, + "grad_norm": 0.9251676797866821, + "learning_rate": 7.376666666666667e-05, + "loss": 0.1139, + "step": 6640 + }, + { + "epoch": 0.43506705920837424, + "grad_norm": 1.0393186807632446, + "learning_rate": 7.387777777777778e-05, + "loss": 0.1136, + "step": 6650 + }, + { + "epoch": 0.43572129538763493, + "grad_norm": 0.870494544506073, + "learning_rate": 7.39888888888889e-05, + "loss": 0.1052, + "step": 6660 + }, + { + "epoch": 0.4363755315668956, + "grad_norm": 0.9582804441452026, + "learning_rate": 7.41e-05, + "loss": 0.1218, + "step": 6670 + }, + { + "epoch": 0.4370297677461564, + "grad_norm": 1.166326642036438, + "learning_rate": 7.421111111111111e-05, + "loss": 0.1126, + "step": 6680 + }, + { + "epoch": 0.43768400392541706, + "grad_norm": 0.9711355566978455, + "learning_rate": 7.432222222222223e-05, + "loss": 0.1186, + "step": 6690 + }, + { + "epoch": 0.4383382401046778, + "grad_norm": 1.110071063041687, + "learning_rate": 7.443333333333333e-05, + "loss": 0.1117, + "step": 6700 + }, + { + "epoch": 0.4389924762839385, + "grad_norm": 0.836465060710907, + "learning_rate": 7.454444444444445e-05, + "loss": 0.1108, + "step": 6710 + }, + { + "epoch": 0.4396467124631992, + "grad_norm": 1.120985507965088, + "learning_rate": 7.465555555555556e-05, + "loss": 0.1172, + "step": 6720 + }, + { + "epoch": 0.44030094864245994, + "grad_norm": 1.0334186553955078, + "learning_rate": 7.476666666666668e-05, + "loss": 0.1142, + "step": 6730 + }, + { + "epoch": 0.44095518482172064, + "grad_norm": 1.0058670043945312, + "learning_rate": 7.487777777777778e-05, + "loss": 0.1191, + "step": 6740 + }, + { + "epoch": 0.44160942100098133, + "grad_norm": 1.1377954483032227, + "learning_rate": 7.49888888888889e-05, + "loss": 0.1083, + "step": 6750 + }, + { + "epoch": 0.4422636571802421, + "grad_norm": 1.0749398469924927, + "learning_rate": 7.510000000000001e-05, + "loss": 0.1275, + "step": 6760 + }, + { + "epoch": 0.44291789335950277, + "grad_norm": 1.0065230131149292, + "learning_rate": 7.521111111111111e-05, + "loss": 0.1192, + "step": 6770 + }, + { + "epoch": 0.4435721295387635, + "grad_norm": 1.1602001190185547, + "learning_rate": 7.532222222222223e-05, + "loss": 0.1137, + "step": 6780 + }, + { + "epoch": 0.4442263657180242, + "grad_norm": 1.0304006338119507, + "learning_rate": 7.543333333333333e-05, + "loss": 0.1206, + "step": 6790 + }, + { + "epoch": 0.4448806018972849, + "grad_norm": 1.0605281591415405, + "learning_rate": 7.554444444444446e-05, + "loss": 0.1171, + "step": 6800 + }, + { + "epoch": 0.44553483807654565, + "grad_norm": 0.8925848007202148, + "learning_rate": 7.565555555555556e-05, + "loss": 0.1091, + "step": 6810 + }, + { + "epoch": 0.44618907425580634, + "grad_norm": 0.9806022047996521, + "learning_rate": 7.576666666666667e-05, + "loss": 0.1168, + "step": 6820 + }, + { + "epoch": 0.44684331043506703, + "grad_norm": 1.2420053482055664, + "learning_rate": 7.587777777777778e-05, + "loss": 0.117, + "step": 6830 + }, + { + "epoch": 0.4474975466143278, + "grad_norm": 1.1187241077423096, + "learning_rate": 7.598888888888889e-05, + "loss": 0.1286, + "step": 6840 + }, + { + "epoch": 0.44815178279358847, + "grad_norm": 1.2027949094772339, + "learning_rate": 7.61e-05, + "loss": 0.1178, + "step": 6850 + }, + { + "epoch": 0.4488060189728492, + "grad_norm": 1.1143661737442017, + "learning_rate": 7.621111111111111e-05, + "loss": 0.115, + "step": 6860 + }, + { + "epoch": 0.4494602551521099, + "grad_norm": 0.9064378142356873, + "learning_rate": 7.632222222222222e-05, + "loss": 0.1141, + "step": 6870 + }, + { + "epoch": 0.4501144913313706, + "grad_norm": 1.2292486429214478, + "learning_rate": 7.643333333333334e-05, + "loss": 0.1234, + "step": 6880 + }, + { + "epoch": 0.45076872751063135, + "grad_norm": 1.2885360717773438, + "learning_rate": 7.654444444444445e-05, + "loss": 0.1205, + "step": 6890 + }, + { + "epoch": 0.45142296368989204, + "grad_norm": 0.9081943035125732, + "learning_rate": 7.665555555555556e-05, + "loss": 0.1226, + "step": 6900 + }, + { + "epoch": 0.4520771998691528, + "grad_norm": 1.1508632898330688, + "learning_rate": 7.676666666666667e-05, + "loss": 0.1219, + "step": 6910 + }, + { + "epoch": 0.4527314360484135, + "grad_norm": 0.971246063709259, + "learning_rate": 7.687777777777779e-05, + "loss": 0.1315, + "step": 6920 + }, + { + "epoch": 0.4533856722276742, + "grad_norm": 1.2282114028930664, + "learning_rate": 7.698888888888889e-05, + "loss": 0.1183, + "step": 6930 + }, + { + "epoch": 0.4540399084069349, + "grad_norm": 1.111584186553955, + "learning_rate": 7.71e-05, + "loss": 0.1193, + "step": 6940 + }, + { + "epoch": 0.4546941445861956, + "grad_norm": 1.0776195526123047, + "learning_rate": 7.72111111111111e-05, + "loss": 0.1157, + "step": 6950 + }, + { + "epoch": 0.4553483807654563, + "grad_norm": 1.1044821739196777, + "learning_rate": 7.732222222222223e-05, + "loss": 0.1191, + "step": 6960 + }, + { + "epoch": 0.45600261694471705, + "grad_norm": 0.9981603622436523, + "learning_rate": 7.743333333333334e-05, + "loss": 0.1231, + "step": 6970 + }, + { + "epoch": 0.45665685312397775, + "grad_norm": 1.0244512557983398, + "learning_rate": 7.754444444444445e-05, + "loss": 0.1175, + "step": 6980 + }, + { + "epoch": 0.4573110893032385, + "grad_norm": 0.9602757692337036, + "learning_rate": 7.765555555555555e-05, + "loss": 0.1095, + "step": 6990 + }, + { + "epoch": 0.4579653254824992, + "grad_norm": 1.00235116481781, + "learning_rate": 7.776666666666667e-05, + "loss": 0.1066, + "step": 7000 + }, + { + "epoch": 0.4586195616617599, + "grad_norm": 0.9261903762817383, + "learning_rate": 7.787777777777778e-05, + "loss": 0.1073, + "step": 7010 + }, + { + "epoch": 0.4592737978410206, + "grad_norm": 0.9496639370918274, + "learning_rate": 7.798888888888889e-05, + "loss": 0.1291, + "step": 7020 + }, + { + "epoch": 0.4599280340202813, + "grad_norm": 0.983691394329071, + "learning_rate": 7.81e-05, + "loss": 0.1113, + "step": 7030 + }, + { + "epoch": 0.460582270199542, + "grad_norm": 1.006894826889038, + "learning_rate": 7.821111111111112e-05, + "loss": 0.1102, + "step": 7040 + }, + { + "epoch": 0.46123650637880276, + "grad_norm": 0.907843291759491, + "learning_rate": 7.832222222222223e-05, + "loss": 0.1161, + "step": 7050 + }, + { + "epoch": 0.46189074255806345, + "grad_norm": 1.1818556785583496, + "learning_rate": 7.843333333333333e-05, + "loss": 0.1193, + "step": 7060 + }, + { + "epoch": 0.4625449787373242, + "grad_norm": 1.0127677917480469, + "learning_rate": 7.854444444444445e-05, + "loss": 0.1149, + "step": 7070 + }, + { + "epoch": 0.4631992149165849, + "grad_norm": 0.9791194796562195, + "learning_rate": 7.865555555555556e-05, + "loss": 0.1251, + "step": 7080 + }, + { + "epoch": 0.4638534510958456, + "grad_norm": 1.0158543586730957, + "learning_rate": 7.876666666666667e-05, + "loss": 0.1137, + "step": 7090 + }, + { + "epoch": 0.46450768727510633, + "grad_norm": 1.2994464635849, + "learning_rate": 7.887777777777778e-05, + "loss": 0.1277, + "step": 7100 + }, + { + "epoch": 0.465161923454367, + "grad_norm": 1.0386182069778442, + "learning_rate": 7.89888888888889e-05, + "loss": 0.1398, + "step": 7110 + }, + { + "epoch": 0.4658161596336277, + "grad_norm": 0.8721251487731934, + "learning_rate": 7.910000000000001e-05, + "loss": 0.1043, + "step": 7120 + }, + { + "epoch": 0.46647039581288846, + "grad_norm": 1.1872791051864624, + "learning_rate": 7.921111111111111e-05, + "loss": 0.1087, + "step": 7130 + }, + { + "epoch": 0.46712463199214915, + "grad_norm": 1.0702382326126099, + "learning_rate": 7.932222222222223e-05, + "loss": 0.1136, + "step": 7140 + }, + { + "epoch": 0.4677788681714099, + "grad_norm": 1.5441776514053345, + "learning_rate": 7.943333333333333e-05, + "loss": 0.1156, + "step": 7150 + }, + { + "epoch": 0.4684331043506706, + "grad_norm": 1.2157565355300903, + "learning_rate": 7.954444444444445e-05, + "loss": 0.1146, + "step": 7160 + }, + { + "epoch": 0.4690873405299313, + "grad_norm": 0.9208630323410034, + "learning_rate": 7.965555555555556e-05, + "loss": 0.1057, + "step": 7170 + }, + { + "epoch": 0.46974157670919203, + "grad_norm": 1.2033475637435913, + "learning_rate": 7.976666666666666e-05, + "loss": 0.1173, + "step": 7180 + }, + { + "epoch": 0.4703958128884527, + "grad_norm": 1.084647297859192, + "learning_rate": 7.987777777777778e-05, + "loss": 0.1103, + "step": 7190 + }, + { + "epoch": 0.47105004906771347, + "grad_norm": 1.0333460569381714, + "learning_rate": 7.99888888888889e-05, + "loss": 0.1151, + "step": 7200 + }, + { + "epoch": 0.47170428524697416, + "grad_norm": 0.7924304604530334, + "learning_rate": 8.010000000000001e-05, + "loss": 0.1198, + "step": 7210 + }, + { + "epoch": 0.47235852142623486, + "grad_norm": 0.9112861156463623, + "learning_rate": 8.021111111111111e-05, + "loss": 0.1116, + "step": 7220 + }, + { + "epoch": 0.4730127576054956, + "grad_norm": 0.8361260890960693, + "learning_rate": 8.032222222222223e-05, + "loss": 0.1169, + "step": 7230 + }, + { + "epoch": 0.4736669937847563, + "grad_norm": 1.0955017805099487, + "learning_rate": 8.043333333333334e-05, + "loss": 0.1171, + "step": 7240 + }, + { + "epoch": 0.474321229964017, + "grad_norm": 0.775754988193512, + "learning_rate": 8.054444444444444e-05, + "loss": 0.1109, + "step": 7250 + }, + { + "epoch": 0.47497546614327774, + "grad_norm": 0.9245556592941284, + "learning_rate": 8.065555555555556e-05, + "loss": 0.1185, + "step": 7260 + }, + { + "epoch": 0.47562970232253843, + "grad_norm": 0.9954939484596252, + "learning_rate": 8.076666666666667e-05, + "loss": 0.1199, + "step": 7270 + }, + { + "epoch": 0.4762839385017992, + "grad_norm": 0.9574825167655945, + "learning_rate": 8.087777777777779e-05, + "loss": 0.1154, + "step": 7280 + }, + { + "epoch": 0.47693817468105987, + "grad_norm": 1.0296094417572021, + "learning_rate": 8.098888888888889e-05, + "loss": 0.1226, + "step": 7290 + }, + { + "epoch": 0.47759241086032056, + "grad_norm": 1.061394453048706, + "learning_rate": 8.11e-05, + "loss": 0.1142, + "step": 7300 + }, + { + "epoch": 0.4782466470395813, + "grad_norm": 1.1966540813446045, + "learning_rate": 8.121111111111112e-05, + "loss": 0.1252, + "step": 7310 + }, + { + "epoch": 0.478900883218842, + "grad_norm": 1.1094386577606201, + "learning_rate": 8.132222222222222e-05, + "loss": 0.12, + "step": 7320 + }, + { + "epoch": 0.4795551193981027, + "grad_norm": 1.1346668004989624, + "learning_rate": 8.143333333333334e-05, + "loss": 0.121, + "step": 7330 + }, + { + "epoch": 0.48020935557736344, + "grad_norm": 1.08058500289917, + "learning_rate": 8.154444444444444e-05, + "loss": 0.1175, + "step": 7340 + }, + { + "epoch": 0.48086359175662413, + "grad_norm": 1.0835899114608765, + "learning_rate": 8.165555555555557e-05, + "loss": 0.1139, + "step": 7350 + }, + { + "epoch": 0.4815178279358849, + "grad_norm": 1.1137175559997559, + "learning_rate": 8.176666666666667e-05, + "loss": 0.1182, + "step": 7360 + }, + { + "epoch": 0.48217206411514557, + "grad_norm": 0.8468176126480103, + "learning_rate": 8.187777777777779e-05, + "loss": 0.1112, + "step": 7370 + }, + { + "epoch": 0.48282630029440626, + "grad_norm": 0.984216034412384, + "learning_rate": 8.198888888888889e-05, + "loss": 0.1158, + "step": 7380 + }, + { + "epoch": 0.483480536473667, + "grad_norm": 1.0112940073013306, + "learning_rate": 8.21e-05, + "loss": 0.1184, + "step": 7390 + }, + { + "epoch": 0.4841347726529277, + "grad_norm": 1.1188663244247437, + "learning_rate": 8.221111111111112e-05, + "loss": 0.1205, + "step": 7400 + }, + { + "epoch": 0.4847890088321884, + "grad_norm": 1.0517669916152954, + "learning_rate": 8.232222222222222e-05, + "loss": 0.1175, + "step": 7410 + }, + { + "epoch": 0.48544324501144914, + "grad_norm": 0.9220474362373352, + "learning_rate": 8.243333333333334e-05, + "loss": 0.1072, + "step": 7420 + }, + { + "epoch": 0.48609748119070983, + "grad_norm": 0.9419307708740234, + "learning_rate": 8.254444444444445e-05, + "loss": 0.1173, + "step": 7430 + }, + { + "epoch": 0.4867517173699706, + "grad_norm": 0.7772601842880249, + "learning_rate": 8.265555555555557e-05, + "loss": 0.113, + "step": 7440 + }, + { + "epoch": 0.4874059535492313, + "grad_norm": 0.9354446530342102, + "learning_rate": 8.276666666666667e-05, + "loss": 0.1158, + "step": 7450 + }, + { + "epoch": 0.48806018972849197, + "grad_norm": 1.0952316522598267, + "learning_rate": 8.287777777777777e-05, + "loss": 0.1092, + "step": 7460 + }, + { + "epoch": 0.4887144259077527, + "grad_norm": 0.9121578335762024, + "learning_rate": 8.29888888888889e-05, + "loss": 0.1232, + "step": 7470 + }, + { + "epoch": 0.4893686620870134, + "grad_norm": 1.195021152496338, + "learning_rate": 8.31e-05, + "loss": 0.117, + "step": 7480 + }, + { + "epoch": 0.4900228982662741, + "grad_norm": 1.1168640851974487, + "learning_rate": 8.321111111111112e-05, + "loss": 0.1202, + "step": 7490 + }, + { + "epoch": 0.49067713444553485, + "grad_norm": 1.1179988384246826, + "learning_rate": 8.332222222222222e-05, + "loss": 0.1173, + "step": 7500 + }, + { + "epoch": 0.49133137062479554, + "grad_norm": 1.026973843574524, + "learning_rate": 8.343333333333335e-05, + "loss": 0.1139, + "step": 7510 + }, + { + "epoch": 0.4919856068040563, + "grad_norm": 1.1670565605163574, + "learning_rate": 8.354444444444445e-05, + "loss": 0.12, + "step": 7520 + }, + { + "epoch": 0.492639842983317, + "grad_norm": 0.9871100187301636, + "learning_rate": 8.365555555555556e-05, + "loss": 0.1218, + "step": 7530 + }, + { + "epoch": 0.49329407916257767, + "grad_norm": 1.137722134590149, + "learning_rate": 8.376666666666667e-05, + "loss": 0.1178, + "step": 7540 + }, + { + "epoch": 0.4939483153418384, + "grad_norm": 1.0345990657806396, + "learning_rate": 8.387777777777778e-05, + "loss": 0.1226, + "step": 7550 + }, + { + "epoch": 0.4946025515210991, + "grad_norm": 1.140154480934143, + "learning_rate": 8.39888888888889e-05, + "loss": 0.1297, + "step": 7560 + }, + { + "epoch": 0.49525678770035986, + "grad_norm": 0.97530198097229, + "learning_rate": 8.41e-05, + "loss": 0.1194, + "step": 7570 + }, + { + "epoch": 0.49591102387962055, + "grad_norm": 1.1377121210098267, + "learning_rate": 8.421111111111111e-05, + "loss": 0.1295, + "step": 7580 + }, + { + "epoch": 0.49656526005888124, + "grad_norm": 1.4145655632019043, + "learning_rate": 8.432222222222223e-05, + "loss": 0.1145, + "step": 7590 + }, + { + "epoch": 0.497219496238142, + "grad_norm": 1.2647418975830078, + "learning_rate": 8.443333333333334e-05, + "loss": 0.1254, + "step": 7600 + }, + { + "epoch": 0.4978737324174027, + "grad_norm": 1.0249871015548706, + "learning_rate": 8.454444444444445e-05, + "loss": 0.1249, + "step": 7610 + }, + { + "epoch": 0.4985279685966634, + "grad_norm": 0.8847797513008118, + "learning_rate": 8.465555555555556e-05, + "loss": 0.1039, + "step": 7620 + }, + { + "epoch": 0.4991822047759241, + "grad_norm": 1.2074919939041138, + "learning_rate": 8.476666666666668e-05, + "loss": 0.1109, + "step": 7630 + }, + { + "epoch": 0.4998364409551848, + "grad_norm": 0.9957761764526367, + "learning_rate": 8.487777777777778e-05, + "loss": 0.1224, + "step": 7640 + }, + { + "epoch": 0.5004906771344455, + "grad_norm": 0.8962732553482056, + "learning_rate": 8.498888888888889e-05, + "loss": 0.1247, + "step": 7650 + }, + { + "epoch": 0.5011449133137063, + "grad_norm": 0.8505379557609558, + "learning_rate": 8.510000000000001e-05, + "loss": 0.1152, + "step": 7660 + }, + { + "epoch": 0.501799149492967, + "grad_norm": 1.0067418813705444, + "learning_rate": 8.521111111111112e-05, + "loss": 0.1152, + "step": 7670 + }, + { + "epoch": 0.5024533856722276, + "grad_norm": 0.9401522278785706, + "learning_rate": 8.532222222222223e-05, + "loss": 0.1306, + "step": 7680 + }, + { + "epoch": 0.5031076218514884, + "grad_norm": 0.9951933026313782, + "learning_rate": 8.543333333333333e-05, + "loss": 0.1126, + "step": 7690 + }, + { + "epoch": 0.5037618580307491, + "grad_norm": 1.0896522998809814, + "learning_rate": 8.554444444444444e-05, + "loss": 0.1158, + "step": 7700 + }, + { + "epoch": 0.5044160942100098, + "grad_norm": 0.9648086428642273, + "learning_rate": 8.565555555555556e-05, + "loss": 0.1115, + "step": 7710 + }, + { + "epoch": 0.5050703303892705, + "grad_norm": 1.1799818277359009, + "learning_rate": 8.576666666666667e-05, + "loss": 0.1303, + "step": 7720 + }, + { + "epoch": 0.5057245665685313, + "grad_norm": 1.237389326095581, + "learning_rate": 8.587777777777778e-05, + "loss": 0.1238, + "step": 7730 + }, + { + "epoch": 0.5063788027477919, + "grad_norm": 0.7409398555755615, + "learning_rate": 8.598888888888889e-05, + "loss": 0.1184, + "step": 7740 + }, + { + "epoch": 0.5070330389270526, + "grad_norm": 0.9186582565307617, + "learning_rate": 8.61e-05, + "loss": 0.1135, + "step": 7750 + }, + { + "epoch": 0.5076872751063134, + "grad_norm": 1.130561351776123, + "learning_rate": 8.621111111111112e-05, + "loss": 0.1213, + "step": 7760 + }, + { + "epoch": 0.5083415112855741, + "grad_norm": 1.1196184158325195, + "learning_rate": 8.632222222222222e-05, + "loss": 0.1171, + "step": 7770 + }, + { + "epoch": 0.5089957474648348, + "grad_norm": 0.9427690505981445, + "learning_rate": 8.643333333333334e-05, + "loss": 0.1145, + "step": 7780 + }, + { + "epoch": 0.5096499836440955, + "grad_norm": 0.9436381459236145, + "learning_rate": 8.654444444444445e-05, + "loss": 0.113, + "step": 7790 + }, + { + "epoch": 0.5103042198233563, + "grad_norm": 1.0007132291793823, + "learning_rate": 8.665555555555556e-05, + "loss": 0.1174, + "step": 7800 + }, + { + "epoch": 0.5109584560026169, + "grad_norm": 1.1906782388687134, + "learning_rate": 8.676666666666667e-05, + "loss": 0.1363, + "step": 7810 + }, + { + "epoch": 0.5116126921818777, + "grad_norm": 0.9627518653869629, + "learning_rate": 8.687777777777779e-05, + "loss": 0.1123, + "step": 7820 + }, + { + "epoch": 0.5122669283611384, + "grad_norm": 1.0318536758422852, + "learning_rate": 8.69888888888889e-05, + "loss": 0.1138, + "step": 7830 + }, + { + "epoch": 0.512921164540399, + "grad_norm": 0.9794100522994995, + "learning_rate": 8.71e-05, + "loss": 0.1184, + "step": 7840 + }, + { + "epoch": 0.5135754007196598, + "grad_norm": 1.2829060554504395, + "learning_rate": 8.72111111111111e-05, + "loss": 0.1206, + "step": 7850 + }, + { + "epoch": 0.5142296368989205, + "grad_norm": 0.9050964117050171, + "learning_rate": 8.732222222222223e-05, + "loss": 0.1117, + "step": 7860 + }, + { + "epoch": 0.5148838730781812, + "grad_norm": 1.2302874326705933, + "learning_rate": 8.743333333333334e-05, + "loss": 0.1118, + "step": 7870 + }, + { + "epoch": 0.5155381092574419, + "grad_norm": 1.3270689249038696, + "learning_rate": 8.754444444444445e-05, + "loss": 0.1188, + "step": 7880 + }, + { + "epoch": 0.5161923454367027, + "grad_norm": 1.0173133611679077, + "learning_rate": 8.765555555555555e-05, + "loss": 0.1187, + "step": 7890 + }, + { + "epoch": 0.5168465816159633, + "grad_norm": 1.0799261331558228, + "learning_rate": 8.776666666666668e-05, + "loss": 0.1172, + "step": 7900 + }, + { + "epoch": 0.517500817795224, + "grad_norm": 1.013592004776001, + "learning_rate": 8.787777777777778e-05, + "loss": 0.1156, + "step": 7910 + }, + { + "epoch": 0.5181550539744848, + "grad_norm": 1.0565035343170166, + "learning_rate": 8.79888888888889e-05, + "loss": 0.1271, + "step": 7920 + }, + { + "epoch": 0.5188092901537456, + "grad_norm": 0.8640610575675964, + "learning_rate": 8.81e-05, + "loss": 0.1136, + "step": 7930 + }, + { + "epoch": 0.5194635263330062, + "grad_norm": 0.942816972732544, + "learning_rate": 8.821111111111112e-05, + "loss": 0.114, + "step": 7940 + }, + { + "epoch": 0.5201177625122669, + "grad_norm": 1.1157722473144531, + "learning_rate": 8.832222222222223e-05, + "loss": 0.1148, + "step": 7950 + }, + { + "epoch": 0.5207719986915277, + "grad_norm": 1.1118143796920776, + "learning_rate": 8.843333333333333e-05, + "loss": 0.1162, + "step": 7960 + }, + { + "epoch": 0.5214262348707883, + "grad_norm": 1.1708115339279175, + "learning_rate": 8.854444444444445e-05, + "loss": 0.1273, + "step": 7970 + }, + { + "epoch": 0.5220804710500491, + "grad_norm": 1.163822054862976, + "learning_rate": 8.865555555555556e-05, + "loss": 0.1164, + "step": 7980 + }, + { + "epoch": 0.5227347072293098, + "grad_norm": 0.847159206867218, + "learning_rate": 8.876666666666668e-05, + "loss": 0.1161, + "step": 7990 + }, + { + "epoch": 0.5233889434085705, + "grad_norm": 0.9535149335861206, + "learning_rate": 8.887777777777778e-05, + "loss": 0.1193, + "step": 8000 + }, + { + "epoch": 0.5240431795878312, + "grad_norm": 0.9705567359924316, + "learning_rate": 8.898888888888888e-05, + "loss": 0.1217, + "step": 8010 + }, + { + "epoch": 0.524697415767092, + "grad_norm": 1.0848075151443481, + "learning_rate": 8.910000000000001e-05, + "loss": 0.1274, + "step": 8020 + }, + { + "epoch": 0.5253516519463526, + "grad_norm": 1.0525822639465332, + "learning_rate": 8.921111111111111e-05, + "loss": 0.1471, + "step": 8030 + }, + { + "epoch": 0.5260058881256133, + "grad_norm": 1.1972851753234863, + "learning_rate": 8.932222222222223e-05, + "loss": 0.1224, + "step": 8040 + }, + { + "epoch": 0.5266601243048741, + "grad_norm": 1.103018045425415, + "learning_rate": 8.943333333333333e-05, + "loss": 0.1312, + "step": 8050 + }, + { + "epoch": 0.5273143604841348, + "grad_norm": 1.3622784614562988, + "learning_rate": 8.954444444444446e-05, + "loss": 0.12, + "step": 8060 + }, + { + "epoch": 0.5279685966633955, + "grad_norm": 1.1434608697891235, + "learning_rate": 8.965555555555556e-05, + "loss": 0.1083, + "step": 8070 + }, + { + "epoch": 0.5286228328426562, + "grad_norm": 1.039299726486206, + "learning_rate": 8.976666666666666e-05, + "loss": 0.1276, + "step": 8080 + }, + { + "epoch": 0.529277069021917, + "grad_norm": 1.0581910610198975, + "learning_rate": 8.987777777777778e-05, + "loss": 0.1217, + "step": 8090 + }, + { + "epoch": 0.5299313052011776, + "grad_norm": 1.0329546928405762, + "learning_rate": 8.998888888888889e-05, + "loss": 0.1211, + "step": 8100 + }, + { + "epoch": 0.5305855413804383, + "grad_norm": 1.1167445182800293, + "learning_rate": 9.010000000000001e-05, + "loss": 0.1129, + "step": 8110 + }, + { + "epoch": 0.5312397775596991, + "grad_norm": 0.8185030817985535, + "learning_rate": 9.021111111111111e-05, + "loss": 0.1084, + "step": 8120 + }, + { + "epoch": 0.5318940137389597, + "grad_norm": 0.9566468000411987, + "learning_rate": 9.032222222222223e-05, + "loss": 0.117, + "step": 8130 + }, + { + "epoch": 0.5325482499182205, + "grad_norm": 1.1805304288864136, + "learning_rate": 9.043333333333334e-05, + "loss": 0.1243, + "step": 8140 + }, + { + "epoch": 0.5332024860974812, + "grad_norm": 1.2267868518829346, + "learning_rate": 9.054444444444446e-05, + "loss": 0.1197, + "step": 8150 + }, + { + "epoch": 0.5338567222767419, + "grad_norm": 1.0790307521820068, + "learning_rate": 9.065555555555556e-05, + "loss": 0.114, + "step": 8160 + }, + { + "epoch": 0.5345109584560026, + "grad_norm": 0.9891142249107361, + "learning_rate": 9.076666666666667e-05, + "loss": 0.1138, + "step": 8170 + }, + { + "epoch": 0.5351651946352634, + "grad_norm": 0.8925236463546753, + "learning_rate": 9.087777777777779e-05, + "loss": 0.1327, + "step": 8180 + }, + { + "epoch": 0.535819430814524, + "grad_norm": 1.0021766424179077, + "learning_rate": 9.098888888888889e-05, + "loss": 0.1213, + "step": 8190 + }, + { + "epoch": 0.5364736669937847, + "grad_norm": 0.9427801966667175, + "learning_rate": 9.11e-05, + "loss": 0.1073, + "step": 8200 + }, + { + "epoch": 0.5371279031730455, + "grad_norm": 1.0449068546295166, + "learning_rate": 9.121111111111112e-05, + "loss": 0.1078, + "step": 8210 + }, + { + "epoch": 0.5377821393523062, + "grad_norm": 1.049320936203003, + "learning_rate": 9.132222222222224e-05, + "loss": 0.1147, + "step": 8220 + }, + { + "epoch": 0.5384363755315669, + "grad_norm": 1.2729623317718506, + "learning_rate": 9.143333333333334e-05, + "loss": 0.1382, + "step": 8230 + }, + { + "epoch": 0.5390906117108276, + "grad_norm": 0.9762771725654602, + "learning_rate": 9.154444444444444e-05, + "loss": 0.1162, + "step": 8240 + }, + { + "epoch": 0.5397448478900884, + "grad_norm": 0.8867229223251343, + "learning_rate": 9.165555555555555e-05, + "loss": 0.1203, + "step": 8250 + }, + { + "epoch": 0.540399084069349, + "grad_norm": 0.9900445938110352, + "learning_rate": 9.176666666666667e-05, + "loss": 0.131, + "step": 8260 + }, + { + "epoch": 0.5410533202486097, + "grad_norm": 1.091282844543457, + "learning_rate": 9.187777777777779e-05, + "loss": 0.1243, + "step": 8270 + }, + { + "epoch": 0.5417075564278705, + "grad_norm": 0.931505024433136, + "learning_rate": 9.198888888888889e-05, + "loss": 0.1051, + "step": 8280 + }, + { + "epoch": 0.5423617926071311, + "grad_norm": 0.9861153960227966, + "learning_rate": 9.21e-05, + "loss": 0.1134, + "step": 8290 + }, + { + "epoch": 0.5430160287863919, + "grad_norm": 0.8883926272392273, + "learning_rate": 9.221111111111112e-05, + "loss": 0.1148, + "step": 8300 + }, + { + "epoch": 0.5436702649656526, + "grad_norm": 0.9315553903579712, + "learning_rate": 9.232222222222223e-05, + "loss": 0.1204, + "step": 8310 + }, + { + "epoch": 0.5443245011449133, + "grad_norm": 0.8906738758087158, + "learning_rate": 9.243333333333333e-05, + "loss": 0.1149, + "step": 8320 + }, + { + "epoch": 0.544978737324174, + "grad_norm": 1.268044114112854, + "learning_rate": 9.254444444444445e-05, + "loss": 0.1198, + "step": 8330 + }, + { + "epoch": 0.5456329735034348, + "grad_norm": 0.9080297946929932, + "learning_rate": 9.265555555555557e-05, + "loss": 0.1209, + "step": 8340 + }, + { + "epoch": 0.5462872096826955, + "grad_norm": 0.8843944072723389, + "learning_rate": 9.276666666666667e-05, + "loss": 0.1144, + "step": 8350 + }, + { + "epoch": 0.5469414458619561, + "grad_norm": 1.117506980895996, + "learning_rate": 9.287777777777778e-05, + "loss": 0.1256, + "step": 8360 + }, + { + "epoch": 0.5475956820412169, + "grad_norm": 0.9390103816986084, + "learning_rate": 9.29888888888889e-05, + "loss": 0.1206, + "step": 8370 + }, + { + "epoch": 0.5482499182204776, + "grad_norm": 1.0785554647445679, + "learning_rate": 9.310000000000001e-05, + "loss": 0.1265, + "step": 8380 + }, + { + "epoch": 0.5489041543997383, + "grad_norm": 1.078389048576355, + "learning_rate": 9.321111111111112e-05, + "loss": 0.1207, + "step": 8390 + }, + { + "epoch": 0.549558390578999, + "grad_norm": 0.811019241809845, + "learning_rate": 9.332222222222222e-05, + "loss": 0.1087, + "step": 8400 + }, + { + "epoch": 0.5502126267582598, + "grad_norm": 0.9671462774276733, + "learning_rate": 9.343333333333335e-05, + "loss": 0.123, + "step": 8410 + }, + { + "epoch": 0.5508668629375204, + "grad_norm": 0.9795189499855042, + "learning_rate": 9.354444444444445e-05, + "loss": 0.1119, + "step": 8420 + }, + { + "epoch": 0.5515210991167812, + "grad_norm": 1.1247087717056274, + "learning_rate": 9.365555555555556e-05, + "loss": 0.129, + "step": 8430 + }, + { + "epoch": 0.5521753352960419, + "grad_norm": 1.5826585292816162, + "learning_rate": 9.376666666666666e-05, + "loss": 0.1226, + "step": 8440 + }, + { + "epoch": 0.5528295714753025, + "grad_norm": 1.0107616186141968, + "learning_rate": 9.38777777777778e-05, + "loss": 0.1237, + "step": 8450 + }, + { + "epoch": 0.5534838076545633, + "grad_norm": 0.9465190172195435, + "learning_rate": 9.39888888888889e-05, + "loss": 0.1169, + "step": 8460 + }, + { + "epoch": 0.554138043833824, + "grad_norm": 1.0885943174362183, + "learning_rate": 9.41e-05, + "loss": 0.1235, + "step": 8470 + }, + { + "epoch": 0.5547922800130847, + "grad_norm": 1.2744386196136475, + "learning_rate": 9.421111111111111e-05, + "loss": 0.1282, + "step": 8480 + }, + { + "epoch": 0.5554465161923454, + "grad_norm": 1.011501669883728, + "learning_rate": 9.432222222222223e-05, + "loss": 0.1157, + "step": 8490 + }, + { + "epoch": 0.5561007523716062, + "grad_norm": 0.9409123063087463, + "learning_rate": 9.443333333333334e-05, + "loss": 0.1171, + "step": 8500 + }, + { + "epoch": 0.5567549885508669, + "grad_norm": 0.8566303253173828, + "learning_rate": 9.454444444444444e-05, + "loss": 0.1134, + "step": 8510 + }, + { + "epoch": 0.5574092247301276, + "grad_norm": 1.0621939897537231, + "learning_rate": 9.465555555555556e-05, + "loss": 0.1213, + "step": 8520 + }, + { + "epoch": 0.5580634609093883, + "grad_norm": 0.7857542037963867, + "learning_rate": 9.476666666666668e-05, + "loss": 0.1287, + "step": 8530 + }, + { + "epoch": 0.558717697088649, + "grad_norm": 1.0616250038146973, + "learning_rate": 9.487777777777779e-05, + "loss": 0.1235, + "step": 8540 + }, + { + "epoch": 0.5593719332679097, + "grad_norm": 1.1893281936645508, + "learning_rate": 9.498888888888889e-05, + "loss": 0.1181, + "step": 8550 + }, + { + "epoch": 0.5600261694471704, + "grad_norm": 1.0680195093154907, + "learning_rate": 9.51e-05, + "loss": 0.1144, + "step": 8560 + }, + { + "epoch": 0.5606804056264312, + "grad_norm": 1.200814962387085, + "learning_rate": 9.521111111111112e-05, + "loss": 0.1286, + "step": 8570 + }, + { + "epoch": 0.5613346418056918, + "grad_norm": 1.1205694675445557, + "learning_rate": 9.532222222222222e-05, + "loss": 0.1192, + "step": 8580 + }, + { + "epoch": 0.5619888779849526, + "grad_norm": 0.915143609046936, + "learning_rate": 9.543333333333334e-05, + "loss": 0.1227, + "step": 8590 + }, + { + "epoch": 0.5626431141642133, + "grad_norm": 0.9382200241088867, + "learning_rate": 9.554444444444444e-05, + "loss": 0.1272, + "step": 8600 + }, + { + "epoch": 0.563297350343474, + "grad_norm": 0.8184203505516052, + "learning_rate": 9.565555555555557e-05, + "loss": 0.1188, + "step": 8610 + }, + { + "epoch": 0.5639515865227347, + "grad_norm": 0.9451429843902588, + "learning_rate": 9.576666666666667e-05, + "loss": 0.1126, + "step": 8620 + }, + { + "epoch": 0.5646058227019954, + "grad_norm": 0.9278766512870789, + "learning_rate": 9.587777777777777e-05, + "loss": 0.113, + "step": 8630 + }, + { + "epoch": 0.5652600588812562, + "grad_norm": 0.9145293831825256, + "learning_rate": 9.598888888888889e-05, + "loss": 0.1229, + "step": 8640 + }, + { + "epoch": 0.5659142950605168, + "grad_norm": 0.8851084113121033, + "learning_rate": 9.61e-05, + "loss": 0.1117, + "step": 8650 + }, + { + "epoch": 0.5665685312397776, + "grad_norm": 0.9469485878944397, + "learning_rate": 9.621111111111112e-05, + "loss": 0.1198, + "step": 8660 + }, + { + "epoch": 0.5672227674190383, + "grad_norm": 0.9359365105628967, + "learning_rate": 9.632222222222222e-05, + "loss": 0.1169, + "step": 8670 + }, + { + "epoch": 0.567877003598299, + "grad_norm": 1.2340917587280273, + "learning_rate": 9.643333333333334e-05, + "loss": 0.1279, + "step": 8680 + }, + { + "epoch": 0.5685312397775597, + "grad_norm": 0.9072956442832947, + "learning_rate": 9.654444444444445e-05, + "loss": 0.1193, + "step": 8690 + }, + { + "epoch": 0.5691854759568205, + "grad_norm": 0.9480854868888855, + "learning_rate": 9.665555555555555e-05, + "loss": 0.1143, + "step": 8700 + }, + { + "epoch": 0.5698397121360811, + "grad_norm": 0.9658315181732178, + "learning_rate": 9.676666666666667e-05, + "loss": 0.125, + "step": 8710 + }, + { + "epoch": 0.5704939483153418, + "grad_norm": 1.0352250337600708, + "learning_rate": 9.687777777777778e-05, + "loss": 0.1239, + "step": 8720 + }, + { + "epoch": 0.5711481844946026, + "grad_norm": 1.0940804481506348, + "learning_rate": 9.69888888888889e-05, + "loss": 0.1283, + "step": 8730 + }, + { + "epoch": 0.5718024206738632, + "grad_norm": 0.9814597964286804, + "learning_rate": 9.71e-05, + "loss": 0.1155, + "step": 8740 + }, + { + "epoch": 0.572456656853124, + "grad_norm": 1.007461667060852, + "learning_rate": 9.721111111111112e-05, + "loss": 0.122, + "step": 8750 + }, + { + "epoch": 0.5731108930323847, + "grad_norm": 0.9953462481498718, + "learning_rate": 9.732222222222222e-05, + "loss": 0.1322, + "step": 8760 + }, + { + "epoch": 0.5737651292116454, + "grad_norm": 0.9330369830131531, + "learning_rate": 9.743333333333335e-05, + "loss": 0.1129, + "step": 8770 + }, + { + "epoch": 0.5744193653909061, + "grad_norm": 0.9782475233078003, + "learning_rate": 9.754444444444445e-05, + "loss": 0.142, + "step": 8780 + }, + { + "epoch": 0.5750736015701668, + "grad_norm": 1.0962268114089966, + "learning_rate": 9.765555555555555e-05, + "loss": 0.1143, + "step": 8790 + }, + { + "epoch": 0.5757278377494276, + "grad_norm": 1.130919337272644, + "learning_rate": 9.776666666666667e-05, + "loss": 0.1282, + "step": 8800 + }, + { + "epoch": 0.5763820739286882, + "grad_norm": 0.9477802515029907, + "learning_rate": 9.787777777777778e-05, + "loss": 0.12, + "step": 8810 + }, + { + "epoch": 0.577036310107949, + "grad_norm": 1.0506799221038818, + "learning_rate": 9.79888888888889e-05, + "loss": 0.1187, + "step": 8820 + }, + { + "epoch": 0.5776905462872097, + "grad_norm": 0.8452275991439819, + "learning_rate": 9.81e-05, + "loss": 0.1176, + "step": 8830 + }, + { + "epoch": 0.5783447824664704, + "grad_norm": 1.2657310962677002, + "learning_rate": 9.821111111111111e-05, + "loss": 0.1319, + "step": 8840 + }, + { + "epoch": 0.5789990186457311, + "grad_norm": 0.8468209505081177, + "learning_rate": 9.832222222222223e-05, + "loss": 0.1193, + "step": 8850 + }, + { + "epoch": 0.5796532548249919, + "grad_norm": 1.0563205480575562, + "learning_rate": 9.843333333333333e-05, + "loss": 0.127, + "step": 8860 + }, + { + "epoch": 0.5803074910042525, + "grad_norm": 0.9747239351272583, + "learning_rate": 9.854444444444445e-05, + "loss": 0.1217, + "step": 8870 + }, + { + "epoch": 0.5809617271835132, + "grad_norm": 1.1098183393478394, + "learning_rate": 9.865555555555556e-05, + "loss": 0.1313, + "step": 8880 + }, + { + "epoch": 0.581615963362774, + "grad_norm": 1.0163518190383911, + "learning_rate": 9.876666666666668e-05, + "loss": 0.1259, + "step": 8890 + }, + { + "epoch": 0.5822701995420346, + "grad_norm": 1.0436562299728394, + "learning_rate": 9.887777777777778e-05, + "loss": 0.1302, + "step": 8900 + }, + { + "epoch": 0.5829244357212954, + "grad_norm": 0.9520934820175171, + "learning_rate": 9.89888888888889e-05, + "loss": 0.1343, + "step": 8910 + }, + { + "epoch": 0.5835786719005561, + "grad_norm": 1.1144458055496216, + "learning_rate": 9.910000000000001e-05, + "loss": 0.1188, + "step": 8920 + }, + { + "epoch": 0.5842329080798168, + "grad_norm": 0.8635526299476624, + "learning_rate": 9.921111111111113e-05, + "loss": 0.1263, + "step": 8930 + }, + { + "epoch": 0.5848871442590775, + "grad_norm": 0.9793311357498169, + "learning_rate": 9.932222222222223e-05, + "loss": 0.1183, + "step": 8940 + }, + { + "epoch": 0.5855413804383383, + "grad_norm": 1.0002834796905518, + "learning_rate": 9.943333333333333e-05, + "loss": 0.1212, + "step": 8950 + }, + { + "epoch": 0.586195616617599, + "grad_norm": 1.0948991775512695, + "learning_rate": 9.954444444444446e-05, + "loss": 0.1257, + "step": 8960 + }, + { + "epoch": 0.5868498527968596, + "grad_norm": 1.0232096910476685, + "learning_rate": 9.965555555555556e-05, + "loss": 0.1238, + "step": 8970 + }, + { + "epoch": 0.5875040889761204, + "grad_norm": 0.8366304039955139, + "learning_rate": 9.976666666666667e-05, + "loss": 0.1203, + "step": 8980 + }, + { + "epoch": 0.5881583251553811, + "grad_norm": 1.152336835861206, + "learning_rate": 9.987777777777778e-05, + "loss": 0.1273, + "step": 8990 + }, + { + "epoch": 0.5888125613346418, + "grad_norm": 0.9613927006721497, + "learning_rate": 9.998888888888889e-05, + "loss": 0.1119, + "step": 9000 + }, + { + "epoch": 0.5894667975139025, + "grad_norm": 0.9294191598892212, + "learning_rate": 9.99999993165094e-05, + "loss": 0.1197, + "step": 9010 + }, + { + "epoch": 0.5901210336931633, + "grad_norm": 1.0190068483352661, + "learning_rate": 9.999999695382584e-05, + "loss": 0.119, + "step": 9020 + }, + { + "epoch": 0.5907752698724239, + "grad_norm": 0.9870291948318481, + "learning_rate": 9.999999290351126e-05, + "loss": 0.1316, + "step": 9030 + }, + { + "epoch": 0.5914295060516847, + "grad_norm": 1.160510778427124, + "learning_rate": 9.999998716556578e-05, + "loss": 0.1278, + "step": 9040 + }, + { + "epoch": 0.5920837422309454, + "grad_norm": 0.9017443656921387, + "learning_rate": 9.99999797399896e-05, + "loss": 0.1154, + "step": 9050 + }, + { + "epoch": 0.592737978410206, + "grad_norm": 1.122379183769226, + "learning_rate": 9.999997062678298e-05, + "loss": 0.1227, + "step": 9060 + }, + { + "epoch": 0.5933922145894668, + "grad_norm": 0.9526849985122681, + "learning_rate": 9.999995982594624e-05, + "loss": 0.1196, + "step": 9070 + }, + { + "epoch": 0.5940464507687275, + "grad_norm": 1.065383791923523, + "learning_rate": 9.999994733747969e-05, + "loss": 0.1252, + "step": 9080 + }, + { + "epoch": 0.5947006869479883, + "grad_norm": 1.0271326303482056, + "learning_rate": 9.99999331613838e-05, + "loss": 0.1256, + "step": 9090 + }, + { + "epoch": 0.5953549231272489, + "grad_norm": 0.8906413316726685, + "learning_rate": 9.999991729765906e-05, + "loss": 0.1383, + "step": 9100 + }, + { + "epoch": 0.5960091593065097, + "grad_norm": 1.0225988626480103, + "learning_rate": 9.999989974630596e-05, + "loss": 0.1247, + "step": 9110 + }, + { + "epoch": 0.5966633954857704, + "grad_norm": 0.9085574746131897, + "learning_rate": 9.999988050732512e-05, + "loss": 0.1344, + "step": 9120 + }, + { + "epoch": 0.597317631665031, + "grad_norm": 0.9044840931892395, + "learning_rate": 9.999985958071718e-05, + "loss": 0.1308, + "step": 9130 + }, + { + "epoch": 0.5979718678442918, + "grad_norm": 0.9104439616203308, + "learning_rate": 9.999983696648286e-05, + "loss": 0.1253, + "step": 9140 + }, + { + "epoch": 0.5986261040235525, + "grad_norm": 0.998406708240509, + "learning_rate": 9.99998126646229e-05, + "loss": 0.1134, + "step": 9150 + }, + { + "epoch": 0.5992803402028132, + "grad_norm": 1.0714468955993652, + "learning_rate": 9.999978667513815e-05, + "loss": 0.1238, + "step": 9160 + }, + { + "epoch": 0.5999345763820739, + "grad_norm": 0.9690518975257874, + "learning_rate": 9.999975899802944e-05, + "loss": 0.1192, + "step": 9170 + }, + { + "epoch": 0.6005888125613347, + "grad_norm": 0.9514327645301819, + "learning_rate": 9.999972963329775e-05, + "loss": 0.1211, + "step": 9180 + }, + { + "epoch": 0.6012430487405953, + "grad_norm": 0.9661425948143005, + "learning_rate": 9.999969858094407e-05, + "loss": 0.1241, + "step": 9190 + }, + { + "epoch": 0.6018972849198561, + "grad_norm": 1.0832891464233398, + "learning_rate": 9.999966584096941e-05, + "loss": 0.1271, + "step": 9200 + }, + { + "epoch": 0.6025515210991168, + "grad_norm": 1.053916573524475, + "learning_rate": 9.999963141337492e-05, + "loss": 0.1342, + "step": 9210 + }, + { + "epoch": 0.6032057572783774, + "grad_norm": 0.9167242646217346, + "learning_rate": 9.999959529816173e-05, + "loss": 0.1221, + "step": 9220 + }, + { + "epoch": 0.6038599934576382, + "grad_norm": 0.8978278636932373, + "learning_rate": 9.999955749533107e-05, + "loss": 0.1217, + "step": 9230 + }, + { + "epoch": 0.6045142296368989, + "grad_norm": 0.9800412654876709, + "learning_rate": 9.999951800488422e-05, + "loss": 0.1178, + "step": 9240 + }, + { + "epoch": 0.6051684658161597, + "grad_norm": 0.9557292461395264, + "learning_rate": 9.999947682682251e-05, + "loss": 0.1342, + "step": 9250 + }, + { + "epoch": 0.6058227019954203, + "grad_norm": 0.9261611104011536, + "learning_rate": 9.999943396114732e-05, + "loss": 0.1208, + "step": 9260 + }, + { + "epoch": 0.6064769381746811, + "grad_norm": 0.8848803639411926, + "learning_rate": 9.999938940786011e-05, + "loss": 0.1287, + "step": 9270 + }, + { + "epoch": 0.6071311743539418, + "grad_norm": 0.818735659122467, + "learning_rate": 9.999934316696238e-05, + "loss": 0.1135, + "step": 9280 + }, + { + "epoch": 0.6077854105332025, + "grad_norm": 1.051778793334961, + "learning_rate": 9.99992952384557e-05, + "loss": 0.1175, + "step": 9290 + }, + { + "epoch": 0.6084396467124632, + "grad_norm": 0.9308475255966187, + "learning_rate": 9.999924562234167e-05, + "loss": 0.1114, + "step": 9300 + }, + { + "epoch": 0.609093882891724, + "grad_norm": 1.1484242677688599, + "learning_rate": 9.999919431862197e-05, + "loss": 0.1254, + "step": 9310 + }, + { + "epoch": 0.6097481190709846, + "grad_norm": 0.9219973683357239, + "learning_rate": 9.999914132729832e-05, + "loss": 0.1187, + "step": 9320 + }, + { + "epoch": 0.6104023552502453, + "grad_norm": 0.9428659677505493, + "learning_rate": 9.999908664837255e-05, + "loss": 0.1182, + "step": 9330 + }, + { + "epoch": 0.6110565914295061, + "grad_norm": 0.8808643221855164, + "learning_rate": 9.999903028184646e-05, + "loss": 0.1195, + "step": 9340 + }, + { + "epoch": 0.6117108276087667, + "grad_norm": 1.1182430982589722, + "learning_rate": 9.999897222772198e-05, + "loss": 0.1329, + "step": 9350 + }, + { + "epoch": 0.6123650637880275, + "grad_norm": 1.0213252305984497, + "learning_rate": 9.999891248600107e-05, + "loss": 0.1229, + "step": 9360 + }, + { + "epoch": 0.6130192999672882, + "grad_norm": 1.048762321472168, + "learning_rate": 9.999885105668571e-05, + "loss": 0.1213, + "step": 9370 + }, + { + "epoch": 0.613673536146549, + "grad_norm": 0.9131613969802856, + "learning_rate": 9.999878793977801e-05, + "loss": 0.1153, + "step": 9380 + }, + { + "epoch": 0.6143277723258096, + "grad_norm": 1.397139310836792, + "learning_rate": 9.999872313528009e-05, + "loss": 0.1203, + "step": 9390 + }, + { + "epoch": 0.6149820085050703, + "grad_norm": 1.120139479637146, + "learning_rate": 9.999865664319414e-05, + "loss": 0.1216, + "step": 9400 + }, + { + "epoch": 0.6156362446843311, + "grad_norm": 1.1770591735839844, + "learning_rate": 9.999858846352242e-05, + "loss": 0.1135, + "step": 9410 + }, + { + "epoch": 0.6162904808635917, + "grad_norm": 1.0336472988128662, + "learning_rate": 9.99985185962672e-05, + "loss": 0.1197, + "step": 9420 + }, + { + "epoch": 0.6169447170428525, + "grad_norm": 1.1690295934677124, + "learning_rate": 9.999844704143084e-05, + "loss": 0.1329, + "step": 9430 + }, + { + "epoch": 0.6175989532221132, + "grad_norm": 0.9790360331535339, + "learning_rate": 9.999837379901578e-05, + "loss": 0.114, + "step": 9440 + }, + { + "epoch": 0.6182531894013739, + "grad_norm": 0.918538510799408, + "learning_rate": 9.999829886902446e-05, + "loss": 0.1204, + "step": 9450 + }, + { + "epoch": 0.6189074255806346, + "grad_norm": 0.986890435218811, + "learning_rate": 9.999822225145945e-05, + "loss": 0.1291, + "step": 9460 + }, + { + "epoch": 0.6195616617598954, + "grad_norm": 0.8274875283241272, + "learning_rate": 9.99981439463233e-05, + "loss": 0.126, + "step": 9470 + }, + { + "epoch": 0.620215897939156, + "grad_norm": 0.8591904640197754, + "learning_rate": 9.999806395361867e-05, + "loss": 0.1184, + "step": 9480 + }, + { + "epoch": 0.6208701341184167, + "grad_norm": 0.9025493860244751, + "learning_rate": 9.999798227334827e-05, + "loss": 0.1169, + "step": 9490 + }, + { + "epoch": 0.6215243702976775, + "grad_norm": 0.9297366142272949, + "learning_rate": 9.999789890551483e-05, + "loss": 0.1313, + "step": 9500 + }, + { + "epoch": 0.6221786064769381, + "grad_norm": 0.843330979347229, + "learning_rate": 9.999781385012116e-05, + "loss": 0.1178, + "step": 9510 + }, + { + "epoch": 0.6228328426561989, + "grad_norm": 0.8769571781158447, + "learning_rate": 9.999772710717018e-05, + "loss": 0.1191, + "step": 9520 + }, + { + "epoch": 0.6234870788354596, + "grad_norm": 0.996464192867279, + "learning_rate": 9.999763867666479e-05, + "loss": 0.1184, + "step": 9530 + }, + { + "epoch": 0.6241413150147204, + "grad_norm": 0.9058352112770081, + "learning_rate": 9.999754855860795e-05, + "loss": 0.1246, + "step": 9540 + }, + { + "epoch": 0.624795551193981, + "grad_norm": 0.8626279830932617, + "learning_rate": 9.999745675300271e-05, + "loss": 0.1175, + "step": 9550 + }, + { + "epoch": 0.6254497873732418, + "grad_norm": 0.8973715901374817, + "learning_rate": 9.999736325985221e-05, + "loss": 0.1085, + "step": 9560 + }, + { + "epoch": 0.6261040235525025, + "grad_norm": 0.9920212030410767, + "learning_rate": 9.999726807915956e-05, + "loss": 0.1203, + "step": 9570 + }, + { + "epoch": 0.6267582597317631, + "grad_norm": 0.8262133598327637, + "learning_rate": 9.999717121092802e-05, + "loss": 0.1214, + "step": 9580 + }, + { + "epoch": 0.6274124959110239, + "grad_norm": 0.7645416259765625, + "learning_rate": 9.999707265516079e-05, + "loss": 0.1211, + "step": 9590 + }, + { + "epoch": 0.6280667320902846, + "grad_norm": 1.0003942251205444, + "learning_rate": 9.999697241186126e-05, + "loss": 0.1167, + "step": 9600 + }, + { + "epoch": 0.6287209682695453, + "grad_norm": 1.0682168006896973, + "learning_rate": 9.999687048103278e-05, + "loss": 0.1182, + "step": 9610 + }, + { + "epoch": 0.629375204448806, + "grad_norm": 0.7786453366279602, + "learning_rate": 9.999676686267881e-05, + "loss": 0.1241, + "step": 9620 + }, + { + "epoch": 0.6300294406280668, + "grad_norm": 0.9212088584899902, + "learning_rate": 9.999666155680281e-05, + "loss": 0.1219, + "step": 9630 + }, + { + "epoch": 0.6306836768073274, + "grad_norm": 0.8398849964141846, + "learning_rate": 9.999655456340839e-05, + "loss": 0.1217, + "step": 9640 + }, + { + "epoch": 0.6313379129865881, + "grad_norm": 0.7940874099731445, + "learning_rate": 9.999644588249912e-05, + "loss": 0.1156, + "step": 9650 + }, + { + "epoch": 0.6319921491658489, + "grad_norm": 0.9876879453659058, + "learning_rate": 9.999633551407867e-05, + "loss": 0.1307, + "step": 9660 + }, + { + "epoch": 0.6326463853451095, + "grad_norm": 0.9693999886512756, + "learning_rate": 9.999622345815081e-05, + "loss": 0.1232, + "step": 9670 + }, + { + "epoch": 0.6333006215243703, + "grad_norm": 0.8915924429893494, + "learning_rate": 9.999610971471925e-05, + "loss": 0.1255, + "step": 9680 + }, + { + "epoch": 0.633954857703631, + "grad_norm": 0.9316652417182922, + "learning_rate": 9.999599428378789e-05, + "loss": 0.1258, + "step": 9690 + }, + { + "epoch": 0.6346090938828918, + "grad_norm": 0.8917170763015747, + "learning_rate": 9.99958771653606e-05, + "loss": 0.1246, + "step": 9700 + }, + { + "epoch": 0.6352633300621524, + "grad_norm": 1.0505290031433105, + "learning_rate": 9.999575835944133e-05, + "loss": 0.1109, + "step": 9710 + }, + { + "epoch": 0.6359175662414132, + "grad_norm": 1.0103509426116943, + "learning_rate": 9.999563786603412e-05, + "loss": 0.125, + "step": 9720 + }, + { + "epoch": 0.6365718024206739, + "grad_norm": 0.8918554186820984, + "learning_rate": 9.999551568514298e-05, + "loss": 0.1281, + "step": 9730 + }, + { + "epoch": 0.6372260385999345, + "grad_norm": 0.8537850975990295, + "learning_rate": 9.999539181677208e-05, + "loss": 0.1301, + "step": 9740 + }, + { + "epoch": 0.6378802747791953, + "grad_norm": 1.0393998622894287, + "learning_rate": 9.99952662609256e-05, + "loss": 0.1183, + "step": 9750 + }, + { + "epoch": 0.638534510958456, + "grad_norm": 0.9914324283599854, + "learning_rate": 9.999513901760775e-05, + "loss": 0.1348, + "step": 9760 + }, + { + "epoch": 0.6391887471377167, + "grad_norm": 1.1659291982650757, + "learning_rate": 9.999501008682286e-05, + "loss": 0.124, + "step": 9770 + }, + { + "epoch": 0.6398429833169774, + "grad_norm": 0.9629908800125122, + "learning_rate": 9.999487946857525e-05, + "loss": 0.1177, + "step": 9780 + }, + { + "epoch": 0.6404972194962382, + "grad_norm": 0.8898019194602966, + "learning_rate": 9.999474716286934e-05, + "loss": 0.1233, + "step": 9790 + }, + { + "epoch": 0.6411514556754988, + "grad_norm": 0.9645368456840515, + "learning_rate": 9.99946131697096e-05, + "loss": 0.1172, + "step": 9800 + }, + { + "epoch": 0.6418056918547596, + "grad_norm": 1.1151118278503418, + "learning_rate": 9.999447748910056e-05, + "loss": 0.1154, + "step": 9810 + }, + { + "epoch": 0.6424599280340203, + "grad_norm": 1.1241856813430786, + "learning_rate": 9.999434012104678e-05, + "loss": 0.1182, + "step": 9820 + }, + { + "epoch": 0.643114164213281, + "grad_norm": 0.883420467376709, + "learning_rate": 9.999420106555291e-05, + "loss": 0.126, + "step": 9830 + }, + { + "epoch": 0.6437684003925417, + "grad_norm": 1.023460865020752, + "learning_rate": 9.999406032262362e-05, + "loss": 0.1242, + "step": 9840 + }, + { + "epoch": 0.6444226365718024, + "grad_norm": 1.0635135173797607, + "learning_rate": 9.99939178922637e-05, + "loss": 0.1383, + "step": 9850 + }, + { + "epoch": 0.6450768727510632, + "grad_norm": 0.738158643245697, + "learning_rate": 9.999377377447794e-05, + "loss": 0.1199, + "step": 9860 + }, + { + "epoch": 0.6457311089303238, + "grad_norm": 1.1745080947875977, + "learning_rate": 9.999362796927119e-05, + "loss": 0.1239, + "step": 9870 + }, + { + "epoch": 0.6463853451095846, + "grad_norm": 1.2103548049926758, + "learning_rate": 9.999348047664838e-05, + "loss": 0.1281, + "step": 9880 + }, + { + "epoch": 0.6470395812888453, + "grad_norm": 0.9273943901062012, + "learning_rate": 9.999333129661451e-05, + "loss": 0.1274, + "step": 9890 + }, + { + "epoch": 0.647693817468106, + "grad_norm": 0.9063096642494202, + "learning_rate": 9.999318042917459e-05, + "loss": 0.1161, + "step": 9900 + }, + { + "epoch": 0.6483480536473667, + "grad_norm": 1.0374435186386108, + "learning_rate": 9.999302787433372e-05, + "loss": 0.1253, + "step": 9910 + }, + { + "epoch": 0.6490022898266274, + "grad_norm": 0.8295223116874695, + "learning_rate": 9.999287363209703e-05, + "loss": 0.1196, + "step": 9920 + }, + { + "epoch": 0.6496565260058881, + "grad_norm": 0.9288245439529419, + "learning_rate": 9.999271770246975e-05, + "loss": 0.1272, + "step": 9930 + }, + { + "epoch": 0.6503107621851488, + "grad_norm": 1.091799259185791, + "learning_rate": 9.999256008545714e-05, + "loss": 0.1181, + "step": 9940 + }, + { + "epoch": 0.6509649983644096, + "grad_norm": 0.8960475325584412, + "learning_rate": 9.999240078106452e-05, + "loss": 0.1307, + "step": 9950 + }, + { + "epoch": 0.6516192345436702, + "grad_norm": 0.9426436424255371, + "learning_rate": 9.999223978929727e-05, + "loss": 0.116, + "step": 9960 + }, + { + "epoch": 0.652273470722931, + "grad_norm": 0.9542483687400818, + "learning_rate": 9.999207711016081e-05, + "loss": 0.1197, + "step": 9970 + }, + { + "epoch": 0.6529277069021917, + "grad_norm": 0.8808081746101379, + "learning_rate": 9.999191274366064e-05, + "loss": 0.1261, + "step": 9980 + }, + { + "epoch": 0.6535819430814525, + "grad_norm": 0.8422145247459412, + "learning_rate": 9.99917466898023e-05, + "loss": 0.112, + "step": 9990 + }, + { + "epoch": 0.6542361792607131, + "grad_norm": 0.956287682056427, + "learning_rate": 9.999157894859142e-05, + "loss": 0.1356, + "step": 10000 + }, + { + "epoch": 0.6548904154399738, + "grad_norm": 0.9550445079803467, + "learning_rate": 9.99914095200336e-05, + "loss": 0.1277, + "step": 10010 + }, + { + "epoch": 0.6555446516192346, + "grad_norm": 0.9725729823112488, + "learning_rate": 9.999123840413465e-05, + "loss": 0.1257, + "step": 10020 + }, + { + "epoch": 0.6561988877984952, + "grad_norm": 0.8097350001335144, + "learning_rate": 9.999106560090028e-05, + "loss": 0.1199, + "step": 10030 + }, + { + "epoch": 0.656853123977756, + "grad_norm": 0.9223546981811523, + "learning_rate": 9.999089111033633e-05, + "loss": 0.1223, + "step": 10040 + }, + { + "epoch": 0.6575073601570167, + "grad_norm": 0.908068060874939, + "learning_rate": 9.99907149324487e-05, + "loss": 0.1124, + "step": 10050 + }, + { + "epoch": 0.6581615963362774, + "grad_norm": 0.9873263835906982, + "learning_rate": 9.999053706724335e-05, + "loss": 0.1229, + "step": 10060 + }, + { + "epoch": 0.6588158325155381, + "grad_norm": 0.870583713054657, + "learning_rate": 9.999035751472625e-05, + "loss": 0.1113, + "step": 10070 + }, + { + "epoch": 0.6594700686947988, + "grad_norm": 0.9837010502815247, + "learning_rate": 9.99901762749035e-05, + "loss": 0.1281, + "step": 10080 + }, + { + "epoch": 0.6601243048740595, + "grad_norm": 1.0344374179840088, + "learning_rate": 9.998999334778118e-05, + "loss": 0.1355, + "step": 10090 + }, + { + "epoch": 0.6607785410533202, + "grad_norm": 1.1033861637115479, + "learning_rate": 9.99898087333655e-05, + "loss": 0.1213, + "step": 10100 + }, + { + "epoch": 0.661432777232581, + "grad_norm": 0.9438520073890686, + "learning_rate": 9.998962243166266e-05, + "loss": 0.1358, + "step": 10110 + }, + { + "epoch": 0.6620870134118417, + "grad_norm": 0.9945731163024902, + "learning_rate": 9.998943444267896e-05, + "loss": 0.1261, + "step": 10120 + }, + { + "epoch": 0.6627412495911024, + "grad_norm": 0.9855345487594604, + "learning_rate": 9.998924476642074e-05, + "loss": 0.1285, + "step": 10130 + }, + { + "epoch": 0.6633954857703631, + "grad_norm": 1.0155123472213745, + "learning_rate": 9.998905340289442e-05, + "loss": 0.1156, + "step": 10140 + }, + { + "epoch": 0.6640497219496239, + "grad_norm": 0.953682541847229, + "learning_rate": 9.998886035210643e-05, + "loss": 0.1144, + "step": 10150 + }, + { + "epoch": 0.6647039581288845, + "grad_norm": 1.0266921520233154, + "learning_rate": 9.99886656140633e-05, + "loss": 0.1202, + "step": 10160 + }, + { + "epoch": 0.6653581943081452, + "grad_norm": 0.8357493281364441, + "learning_rate": 9.998846918877162e-05, + "loss": 0.1136, + "step": 10170 + }, + { + "epoch": 0.666012430487406, + "grad_norm": 0.9822357892990112, + "learning_rate": 9.9988271076238e-05, + "loss": 0.1327, + "step": 10180 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 0.8407633304595947, + "learning_rate": 9.998807127646915e-05, + "loss": 0.1299, + "step": 10190 + }, + { + "epoch": 0.6673209028459274, + "grad_norm": 0.8804751038551331, + "learning_rate": 9.998786978947177e-05, + "loss": 0.1204, + "step": 10200 + }, + { + "epoch": 0.6679751390251881, + "grad_norm": 1.0064154863357544, + "learning_rate": 9.99876666152527e-05, + "loss": 0.1181, + "step": 10210 + }, + { + "epoch": 0.6686293752044488, + "grad_norm": 1.158360481262207, + "learning_rate": 9.998746175381879e-05, + "loss": 0.1163, + "step": 10220 + }, + { + "epoch": 0.6692836113837095, + "grad_norm": 1.060184359550476, + "learning_rate": 9.998725520517693e-05, + "loss": 0.1213, + "step": 10230 + }, + { + "epoch": 0.6699378475629703, + "grad_norm": 1.123124122619629, + "learning_rate": 9.998704696933413e-05, + "loss": 0.1184, + "step": 10240 + }, + { + "epoch": 0.6705920837422309, + "grad_norm": 0.8490195870399475, + "learning_rate": 9.998683704629739e-05, + "loss": 0.1236, + "step": 10250 + }, + { + "epoch": 0.6712463199214916, + "grad_norm": 0.8913207650184631, + "learning_rate": 9.99866254360738e-05, + "loss": 0.1195, + "step": 10260 + }, + { + "epoch": 0.6719005561007524, + "grad_norm": 0.9286946654319763, + "learning_rate": 9.998641213867051e-05, + "loss": 0.1168, + "step": 10270 + }, + { + "epoch": 0.6725547922800131, + "grad_norm": 0.9744524359703064, + "learning_rate": 9.998619715409471e-05, + "loss": 0.1134, + "step": 10280 + }, + { + "epoch": 0.6732090284592738, + "grad_norm": 0.911208987236023, + "learning_rate": 9.998598048235369e-05, + "loss": 0.1115, + "step": 10290 + }, + { + "epoch": 0.6738632646385345, + "grad_norm": 1.0315773487091064, + "learning_rate": 9.99857621234547e-05, + "loss": 0.1189, + "step": 10300 + }, + { + "epoch": 0.6745175008177953, + "grad_norm": 1.1607714891433716, + "learning_rate": 9.998554207740517e-05, + "loss": 0.1238, + "step": 10310 + }, + { + "epoch": 0.6751717369970559, + "grad_norm": 0.7840601801872253, + "learning_rate": 9.99853203442125e-05, + "loss": 0.1196, + "step": 10320 + }, + { + "epoch": 0.6758259731763167, + "grad_norm": 1.062690019607544, + "learning_rate": 9.998509692388416e-05, + "loss": 0.1227, + "step": 10330 + }, + { + "epoch": 0.6764802093555774, + "grad_norm": 1.104194164276123, + "learning_rate": 9.998487181642772e-05, + "loss": 0.1231, + "step": 10340 + }, + { + "epoch": 0.677134445534838, + "grad_norm": 1.0216797590255737, + "learning_rate": 9.998464502185076e-05, + "loss": 0.118, + "step": 10350 + }, + { + "epoch": 0.6777886817140988, + "grad_norm": 0.9924758076667786, + "learning_rate": 9.998441654016095e-05, + "loss": 0.1245, + "step": 10360 + }, + { + "epoch": 0.6784429178933595, + "grad_norm": 0.9383712410926819, + "learning_rate": 9.9984186371366e-05, + "loss": 0.116, + "step": 10370 + }, + { + "epoch": 0.6790971540726202, + "grad_norm": 0.8061214089393616, + "learning_rate": 9.998395451547367e-05, + "loss": 0.1172, + "step": 10380 + }, + { + "epoch": 0.6797513902518809, + "grad_norm": 0.9712622761726379, + "learning_rate": 9.998372097249177e-05, + "loss": 0.1206, + "step": 10390 + }, + { + "epoch": 0.6804056264311417, + "grad_norm": 0.8583802580833435, + "learning_rate": 9.998348574242821e-05, + "loss": 0.121, + "step": 10400 + }, + { + "epoch": 0.6810598626104023, + "grad_norm": 0.8582515120506287, + "learning_rate": 9.99832488252909e-05, + "loss": 0.1189, + "step": 10410 + }, + { + "epoch": 0.681714098789663, + "grad_norm": 0.9043430685997009, + "learning_rate": 9.998301022108789e-05, + "loss": 0.1108, + "step": 10420 + }, + { + "epoch": 0.6823683349689238, + "grad_norm": 0.9330876469612122, + "learning_rate": 9.998276992982717e-05, + "loss": 0.1178, + "step": 10430 + }, + { + "epoch": 0.6830225711481845, + "grad_norm": 0.9934483170509338, + "learning_rate": 9.99825279515169e-05, + "loss": 0.1206, + "step": 10440 + }, + { + "epoch": 0.6836768073274452, + "grad_norm": 0.8056890964508057, + "learning_rate": 9.998228428616523e-05, + "loss": 0.1283, + "step": 10450 + }, + { + "epoch": 0.6843310435067059, + "grad_norm": 0.9143165349960327, + "learning_rate": 9.998203893378037e-05, + "loss": 0.1322, + "step": 10460 + }, + { + "epoch": 0.6849852796859667, + "grad_norm": 0.9717006683349609, + "learning_rate": 9.998179189437062e-05, + "loss": 0.1274, + "step": 10470 + }, + { + "epoch": 0.6856395158652273, + "grad_norm": 1.0283546447753906, + "learning_rate": 9.99815431679443e-05, + "loss": 0.1193, + "step": 10480 + }, + { + "epoch": 0.6862937520444881, + "grad_norm": 0.9242172241210938, + "learning_rate": 9.998129275450983e-05, + "loss": 0.1284, + "step": 10490 + }, + { + "epoch": 0.6869479882237488, + "grad_norm": 0.8820136785507202, + "learning_rate": 9.998104065407565e-05, + "loss": 0.124, + "step": 10500 + }, + { + "epoch": 0.6876022244030094, + "grad_norm": 0.8272663354873657, + "learning_rate": 9.998078686665026e-05, + "loss": 0.116, + "step": 10510 + }, + { + "epoch": 0.6882564605822702, + "grad_norm": 0.9149497151374817, + "learning_rate": 9.998053139224224e-05, + "loss": 0.1145, + "step": 10520 + }, + { + "epoch": 0.6889106967615309, + "grad_norm": 0.8796846866607666, + "learning_rate": 9.99802742308602e-05, + "loss": 0.1186, + "step": 10530 + }, + { + "epoch": 0.6895649329407916, + "grad_norm": 1.0413655042648315, + "learning_rate": 9.998001538251282e-05, + "loss": 0.1149, + "step": 10540 + }, + { + "epoch": 0.6902191691200523, + "grad_norm": 0.9910515546798706, + "learning_rate": 9.997975484720887e-05, + "loss": 0.1258, + "step": 10550 + }, + { + "epoch": 0.6908734052993131, + "grad_norm": 0.9942857623100281, + "learning_rate": 9.997949262495709e-05, + "loss": 0.1236, + "step": 10560 + }, + { + "epoch": 0.6915276414785738, + "grad_norm": 0.8903499841690063, + "learning_rate": 9.997922871576638e-05, + "loss": 0.113, + "step": 10570 + }, + { + "epoch": 0.6921818776578345, + "grad_norm": 0.9105699062347412, + "learning_rate": 9.997896311964561e-05, + "loss": 0.1276, + "step": 10580 + }, + { + "epoch": 0.6928361138370952, + "grad_norm": 1.2962309122085571, + "learning_rate": 9.997869583660375e-05, + "loss": 0.1285, + "step": 10590 + }, + { + "epoch": 0.693490350016356, + "grad_norm": 1.1062681674957275, + "learning_rate": 9.997842686664985e-05, + "loss": 0.117, + "step": 10600 + }, + { + "epoch": 0.6941445861956166, + "grad_norm": 0.9840047955513, + "learning_rate": 9.997815620979297e-05, + "loss": 0.1329, + "step": 10610 + }, + { + "epoch": 0.6947988223748773, + "grad_norm": 0.7675462961196899, + "learning_rate": 9.997788386604224e-05, + "loss": 0.1115, + "step": 10620 + }, + { + "epoch": 0.6954530585541381, + "grad_norm": 0.9856576323509216, + "learning_rate": 9.997760983540686e-05, + "loss": 0.108, + "step": 10630 + }, + { + "epoch": 0.6961072947333987, + "grad_norm": 1.1004915237426758, + "learning_rate": 9.997733411789607e-05, + "loss": 0.1158, + "step": 10640 + }, + { + "epoch": 0.6967615309126595, + "grad_norm": 0.9109641313552856, + "learning_rate": 9.99770567135192e-05, + "loss": 0.1198, + "step": 10650 + }, + { + "epoch": 0.6974157670919202, + "grad_norm": 0.8749265074729919, + "learning_rate": 9.997677762228558e-05, + "loss": 0.1166, + "step": 10660 + }, + { + "epoch": 0.6980700032711809, + "grad_norm": 0.9863060712814331, + "learning_rate": 9.997649684420465e-05, + "loss": 0.1151, + "step": 10670 + }, + { + "epoch": 0.6987242394504416, + "grad_norm": 0.9862884879112244, + "learning_rate": 9.997621437928588e-05, + "loss": 0.1199, + "step": 10680 + }, + { + "epoch": 0.6993784756297023, + "grad_norm": 0.8637502193450928, + "learning_rate": 9.997593022753881e-05, + "loss": 0.118, + "step": 10690 + }, + { + "epoch": 0.700032711808963, + "grad_norm": 0.8936694860458374, + "learning_rate": 9.997564438897304e-05, + "loss": 0.1226, + "step": 10700 + }, + { + "epoch": 0.7006869479882237, + "grad_norm": 0.8638026714324951, + "learning_rate": 9.997535686359819e-05, + "loss": 0.1084, + "step": 10710 + }, + { + "epoch": 0.7013411841674845, + "grad_norm": 0.8587387800216675, + "learning_rate": 9.997506765142399e-05, + "loss": 0.1229, + "step": 10720 + }, + { + "epoch": 0.7019954203467452, + "grad_norm": 0.894414484500885, + "learning_rate": 9.997477675246019e-05, + "loss": 0.1046, + "step": 10730 + }, + { + "epoch": 0.7026496565260059, + "grad_norm": 0.8560695648193359, + "learning_rate": 9.997448416671661e-05, + "loss": 0.1083, + "step": 10740 + }, + { + "epoch": 0.7033038927052666, + "grad_norm": 0.9901031851768494, + "learning_rate": 9.997418989420313e-05, + "loss": 0.116, + "step": 10750 + }, + { + "epoch": 0.7039581288845274, + "grad_norm": 0.96174556016922, + "learning_rate": 9.997389393492966e-05, + "loss": 0.1185, + "step": 10760 + }, + { + "epoch": 0.704612365063788, + "grad_norm": 0.802387535572052, + "learning_rate": 9.997359628890623e-05, + "loss": 0.1202, + "step": 10770 + }, + { + "epoch": 0.7052666012430487, + "grad_norm": 0.7916472554206848, + "learning_rate": 9.997329695614286e-05, + "loss": 0.1103, + "step": 10780 + }, + { + "epoch": 0.7059208374223095, + "grad_norm": 1.1118965148925781, + "learning_rate": 9.997299593664966e-05, + "loss": 0.1263, + "step": 10790 + }, + { + "epoch": 0.7065750736015701, + "grad_norm": 1.2766551971435547, + "learning_rate": 9.997269323043678e-05, + "loss": 0.1283, + "step": 10800 + }, + { + "epoch": 0.7072293097808309, + "grad_norm": 0.9842929244041443, + "learning_rate": 9.997238883751446e-05, + "loss": 0.1167, + "step": 10810 + }, + { + "epoch": 0.7078835459600916, + "grad_norm": 0.9710572957992554, + "learning_rate": 9.997208275789294e-05, + "loss": 0.1074, + "step": 10820 + }, + { + "epoch": 0.7085377821393523, + "grad_norm": 1.014204978942871, + "learning_rate": 9.997177499158258e-05, + "loss": 0.1155, + "step": 10830 + }, + { + "epoch": 0.709192018318613, + "grad_norm": 0.851264238357544, + "learning_rate": 9.997146553859375e-05, + "loss": 0.1154, + "step": 10840 + }, + { + "epoch": 0.7098462544978738, + "grad_norm": 0.9606887102127075, + "learning_rate": 9.997115439893692e-05, + "loss": 0.1306, + "step": 10850 + }, + { + "epoch": 0.7105004906771345, + "grad_norm": 0.9389902949333191, + "learning_rate": 9.997084157262256e-05, + "loss": 0.1104, + "step": 10860 + }, + { + "epoch": 0.7111547268563951, + "grad_norm": 0.78615403175354, + "learning_rate": 9.997052705966126e-05, + "loss": 0.1112, + "step": 10870 + }, + { + "epoch": 0.7118089630356559, + "grad_norm": 0.884412407875061, + "learning_rate": 9.99702108600636e-05, + "loss": 0.1246, + "step": 10880 + }, + { + "epoch": 0.7124631992149166, + "grad_norm": 0.8281691074371338, + "learning_rate": 9.996989297384029e-05, + "loss": 0.1097, + "step": 10890 + }, + { + "epoch": 0.7131174353941773, + "grad_norm": 1.054056167602539, + "learning_rate": 9.996957340100203e-05, + "loss": 0.1206, + "step": 10900 + }, + { + "epoch": 0.713771671573438, + "grad_norm": 0.8140998482704163, + "learning_rate": 9.996925214155962e-05, + "loss": 0.1207, + "step": 10910 + }, + { + "epoch": 0.7144259077526988, + "grad_norm": 0.8529103398323059, + "learning_rate": 9.99689291955239e-05, + "loss": 0.1118, + "step": 10920 + }, + { + "epoch": 0.7150801439319594, + "grad_norm": 1.020617127418518, + "learning_rate": 9.996860456290576e-05, + "loss": 0.1215, + "step": 10930 + }, + { + "epoch": 0.7157343801112201, + "grad_norm": 0.8196330070495605, + "learning_rate": 9.996827824371618e-05, + "loss": 0.1094, + "step": 10940 + }, + { + "epoch": 0.7163886162904809, + "grad_norm": 0.9389585256576538, + "learning_rate": 9.996795023796617e-05, + "loss": 0.1175, + "step": 10950 + }, + { + "epoch": 0.7170428524697415, + "grad_norm": 0.9360471367835999, + "learning_rate": 9.996762054566679e-05, + "loss": 0.1138, + "step": 10960 + }, + { + "epoch": 0.7176970886490023, + "grad_norm": 0.9107570052146912, + "learning_rate": 9.996728916682915e-05, + "loss": 0.1124, + "step": 10970 + }, + { + "epoch": 0.718351324828263, + "grad_norm": 0.8106933236122131, + "learning_rate": 9.996695610146449e-05, + "loss": 0.124, + "step": 10980 + }, + { + "epoch": 0.7190055610075237, + "grad_norm": 1.0682817697525024, + "learning_rate": 9.9966621349584e-05, + "loss": 0.1277, + "step": 10990 + }, + { + "epoch": 0.7196597971867844, + "grad_norm": 1.0235021114349365, + "learning_rate": 9.9966284911199e-05, + "loss": 0.1229, + "step": 11000 + }, + { + "epoch": 0.7203140333660452, + "grad_norm": 1.0049972534179688, + "learning_rate": 9.996594678632085e-05, + "loss": 0.1182, + "step": 11010 + }, + { + "epoch": 0.7209682695453059, + "grad_norm": 0.8805012106895447, + "learning_rate": 9.996560697496094e-05, + "loss": 0.116, + "step": 11020 + }, + { + "epoch": 0.7216225057245665, + "grad_norm": 0.8662134408950806, + "learning_rate": 9.996526547713077e-05, + "loss": 0.117, + "step": 11030 + }, + { + "epoch": 0.7222767419038273, + "grad_norm": 1.0079435110092163, + "learning_rate": 9.996492229284185e-05, + "loss": 0.1213, + "step": 11040 + }, + { + "epoch": 0.722930978083088, + "grad_norm": 1.034039855003357, + "learning_rate": 9.996457742210576e-05, + "loss": 0.1116, + "step": 11050 + }, + { + "epoch": 0.7235852142623487, + "grad_norm": 0.9001430869102478, + "learning_rate": 9.996423086493414e-05, + "loss": 0.1155, + "step": 11060 + }, + { + "epoch": 0.7242394504416094, + "grad_norm": 0.8581667542457581, + "learning_rate": 9.996388262133869e-05, + "loss": 0.1147, + "step": 11070 + }, + { + "epoch": 0.7248936866208702, + "grad_norm": 0.9072811007499695, + "learning_rate": 9.996353269133118e-05, + "loss": 0.1164, + "step": 11080 + }, + { + "epoch": 0.7255479228001308, + "grad_norm": 0.7761887311935425, + "learning_rate": 9.99631810749234e-05, + "loss": 0.1107, + "step": 11090 + }, + { + "epoch": 0.7262021589793916, + "grad_norm": 1.0262295007705688, + "learning_rate": 9.996282777212723e-05, + "loss": 0.1209, + "step": 11100 + }, + { + "epoch": 0.7268563951586523, + "grad_norm": 0.9890956282615662, + "learning_rate": 9.996247278295458e-05, + "loss": 0.1215, + "step": 11110 + }, + { + "epoch": 0.7275106313379129, + "grad_norm": 0.8441636562347412, + "learning_rate": 9.996211610741745e-05, + "loss": 0.1076, + "step": 11120 + }, + { + "epoch": 0.7281648675171737, + "grad_norm": 0.9701690673828125, + "learning_rate": 9.996175774552788e-05, + "loss": 0.1305, + "step": 11130 + }, + { + "epoch": 0.7288191036964344, + "grad_norm": 0.8142655491828918, + "learning_rate": 9.996139769729795e-05, + "loss": 0.1164, + "step": 11140 + }, + { + "epoch": 0.7294733398756952, + "grad_norm": 1.1642175912857056, + "learning_rate": 9.99610359627398e-05, + "loss": 0.1196, + "step": 11150 + }, + { + "epoch": 0.7301275760549558, + "grad_norm": 0.864878237247467, + "learning_rate": 9.996067254186568e-05, + "loss": 0.1156, + "step": 11160 + }, + { + "epoch": 0.7307818122342166, + "grad_norm": 1.0916894674301147, + "learning_rate": 9.996030743468783e-05, + "loss": 0.1235, + "step": 11170 + }, + { + "epoch": 0.7314360484134773, + "grad_norm": 0.9421660304069519, + "learning_rate": 9.995994064121859e-05, + "loss": 0.116, + "step": 11180 + }, + { + "epoch": 0.732090284592738, + "grad_norm": 0.8942557573318481, + "learning_rate": 9.995957216147031e-05, + "loss": 0.1136, + "step": 11190 + }, + { + "epoch": 0.7327445207719987, + "grad_norm": 0.8067488670349121, + "learning_rate": 9.995920199545546e-05, + "loss": 0.112, + "step": 11200 + }, + { + "epoch": 0.7333987569512594, + "grad_norm": 0.9836673140525818, + "learning_rate": 9.99588301431865e-05, + "loss": 0.1128, + "step": 11210 + }, + { + "epoch": 0.7340529931305201, + "grad_norm": 0.7709094285964966, + "learning_rate": 9.995845660467602e-05, + "loss": 0.1205, + "step": 11220 + }, + { + "epoch": 0.7347072293097808, + "grad_norm": 0.7916396856307983, + "learning_rate": 9.99580813799366e-05, + "loss": 0.1201, + "step": 11230 + }, + { + "epoch": 0.7353614654890416, + "grad_norm": 1.0031894445419312, + "learning_rate": 9.995770446898092e-05, + "loss": 0.1181, + "step": 11240 + }, + { + "epoch": 0.7360157016683022, + "grad_norm": 1.0334042310714722, + "learning_rate": 9.995732587182168e-05, + "loss": 0.1191, + "step": 11250 + }, + { + "epoch": 0.736669937847563, + "grad_norm": 0.9869486689567566, + "learning_rate": 9.995694558847169e-05, + "loss": 0.1129, + "step": 11260 + }, + { + "epoch": 0.7373241740268237, + "grad_norm": 0.7451587915420532, + "learning_rate": 9.995656361894377e-05, + "loss": 0.1216, + "step": 11270 + }, + { + "epoch": 0.7379784102060843, + "grad_norm": 0.9288752675056458, + "learning_rate": 9.99561799632508e-05, + "loss": 0.1257, + "step": 11280 + }, + { + "epoch": 0.7386326463853451, + "grad_norm": 0.972141683101654, + "learning_rate": 9.995579462140574e-05, + "loss": 0.1124, + "step": 11290 + }, + { + "epoch": 0.7392868825646058, + "grad_norm": 1.0879883766174316, + "learning_rate": 9.995540759342161e-05, + "loss": 0.1216, + "step": 11300 + }, + { + "epoch": 0.7399411187438666, + "grad_norm": 0.9992396235466003, + "learning_rate": 9.995501887931146e-05, + "loss": 0.117, + "step": 11310 + }, + { + "epoch": 0.7405953549231272, + "grad_norm": 0.951146125793457, + "learning_rate": 9.99546284790884e-05, + "loss": 0.106, + "step": 11320 + }, + { + "epoch": 0.741249591102388, + "grad_norm": 0.9258811473846436, + "learning_rate": 9.995423639276562e-05, + "loss": 0.1253, + "step": 11330 + }, + { + "epoch": 0.7419038272816487, + "grad_norm": 0.8994291424751282, + "learning_rate": 9.995384262035637e-05, + "loss": 0.1141, + "step": 11340 + }, + { + "epoch": 0.7425580634609094, + "grad_norm": 1.0479780435562134, + "learning_rate": 9.99534471618739e-05, + "loss": 0.1228, + "step": 11350 + }, + { + "epoch": 0.7432122996401701, + "grad_norm": 0.8784400820732117, + "learning_rate": 9.99530500173316e-05, + "loss": 0.1111, + "step": 11360 + }, + { + "epoch": 0.7438665358194309, + "grad_norm": 0.9299617409706116, + "learning_rate": 9.995265118674284e-05, + "loss": 0.1127, + "step": 11370 + }, + { + "epoch": 0.7445207719986915, + "grad_norm": 0.9754354357719421, + "learning_rate": 9.99522506701211e-05, + "loss": 0.1186, + "step": 11380 + }, + { + "epoch": 0.7451750081779522, + "grad_norm": 1.0314409732818604, + "learning_rate": 9.99518484674799e-05, + "loss": 0.1145, + "step": 11390 + }, + { + "epoch": 0.745829244357213, + "grad_norm": 0.7628955841064453, + "learning_rate": 9.99514445788328e-05, + "loss": 0.1124, + "step": 11400 + }, + { + "epoch": 0.7464834805364736, + "grad_norm": 0.9075015783309937, + "learning_rate": 9.995103900419348e-05, + "loss": 0.1159, + "step": 11410 + }, + { + "epoch": 0.7471377167157344, + "grad_norm": 1.0489757061004639, + "learning_rate": 9.995063174357555e-05, + "loss": 0.1226, + "step": 11420 + }, + { + "epoch": 0.7477919528949951, + "grad_norm": 0.9378239512443542, + "learning_rate": 9.995022279699281e-05, + "loss": 0.1209, + "step": 11430 + }, + { + "epoch": 0.7484461890742558, + "grad_norm": 0.8588927388191223, + "learning_rate": 9.994981216445905e-05, + "loss": 0.1204, + "step": 11440 + }, + { + "epoch": 0.7491004252535165, + "grad_norm": 0.760901689529419, + "learning_rate": 9.994939984598813e-05, + "loss": 0.111, + "step": 11450 + }, + { + "epoch": 0.7497546614327772, + "grad_norm": 0.78853440284729, + "learning_rate": 9.994898584159397e-05, + "loss": 0.114, + "step": 11460 + }, + { + "epoch": 0.750408897612038, + "grad_norm": 0.9857150316238403, + "learning_rate": 9.994857015129056e-05, + "loss": 0.1131, + "step": 11470 + }, + { + "epoch": 0.7510631337912986, + "grad_norm": 0.8129025101661682, + "learning_rate": 9.994815277509188e-05, + "loss": 0.1038, + "step": 11480 + }, + { + "epoch": 0.7517173699705594, + "grad_norm": 0.8603448867797852, + "learning_rate": 9.994773371301207e-05, + "loss": 0.119, + "step": 11490 + }, + { + "epoch": 0.7523716061498201, + "grad_norm": 0.797516942024231, + "learning_rate": 9.994731296506525e-05, + "loss": 0.1257, + "step": 11500 + }, + { + "epoch": 0.7530258423290808, + "grad_norm": 0.7435304522514343, + "learning_rate": 9.994689053126564e-05, + "loss": 0.1089, + "step": 11510 + }, + { + "epoch": 0.7536800785083415, + "grad_norm": 0.8212321996688843, + "learning_rate": 9.994646641162745e-05, + "loss": 0.1115, + "step": 11520 + }, + { + "epoch": 0.7543343146876023, + "grad_norm": 0.8718414306640625, + "learning_rate": 9.994604060616506e-05, + "loss": 0.1215, + "step": 11530 + }, + { + "epoch": 0.7549885508668629, + "grad_norm": 0.8560036420822144, + "learning_rate": 9.99456131148928e-05, + "loss": 0.1098, + "step": 11540 + }, + { + "epoch": 0.7556427870461236, + "grad_norm": 1.0013700723648071, + "learning_rate": 9.994518393782513e-05, + "loss": 0.1199, + "step": 11550 + }, + { + "epoch": 0.7562970232253844, + "grad_norm": 0.8606430292129517, + "learning_rate": 9.994475307497649e-05, + "loss": 0.1022, + "step": 11560 + }, + { + "epoch": 0.756951259404645, + "grad_norm": 0.9482471942901611, + "learning_rate": 9.994432052636145e-05, + "loss": 0.1203, + "step": 11570 + }, + { + "epoch": 0.7576054955839058, + "grad_norm": 1.0482877492904663, + "learning_rate": 9.994388629199463e-05, + "loss": 0.1214, + "step": 11580 + }, + { + "epoch": 0.7582597317631665, + "grad_norm": 0.8290185928344727, + "learning_rate": 9.994345037189063e-05, + "loss": 0.1192, + "step": 11590 + }, + { + "epoch": 0.7589139679424273, + "grad_norm": 0.8499178886413574, + "learning_rate": 9.994301276606424e-05, + "loss": 0.1183, + "step": 11600 + }, + { + "epoch": 0.7595682041216879, + "grad_norm": 0.9083896279335022, + "learning_rate": 9.994257347453015e-05, + "loss": 0.1098, + "step": 11610 + }, + { + "epoch": 0.7602224403009487, + "grad_norm": 0.9857131242752075, + "learning_rate": 9.994213249730325e-05, + "loss": 0.1212, + "step": 11620 + }, + { + "epoch": 0.7608766764802094, + "grad_norm": 0.930366575717926, + "learning_rate": 9.99416898343984e-05, + "loss": 0.1173, + "step": 11630 + }, + { + "epoch": 0.76153091265947, + "grad_norm": 0.9344758987426758, + "learning_rate": 9.994124548583053e-05, + "loss": 0.1246, + "step": 11640 + }, + { + "epoch": 0.7621851488387308, + "grad_norm": 0.8181567192077637, + "learning_rate": 9.994079945161466e-05, + "loss": 0.1207, + "step": 11650 + }, + { + "epoch": 0.7628393850179915, + "grad_norm": 0.7763569951057434, + "learning_rate": 9.994035173176582e-05, + "loss": 0.1251, + "step": 11660 + }, + { + "epoch": 0.7634936211972522, + "grad_norm": 1.0689425468444824, + "learning_rate": 9.993990232629915e-05, + "loss": 0.136, + "step": 11670 + }, + { + "epoch": 0.7641478573765129, + "grad_norm": 0.9611626267433167, + "learning_rate": 9.993945123522978e-05, + "loss": 0.1179, + "step": 11680 + }, + { + "epoch": 0.7648020935557737, + "grad_norm": 0.9553081393241882, + "learning_rate": 9.9938998458573e-05, + "loss": 0.1153, + "step": 11690 + }, + { + "epoch": 0.7654563297350343, + "grad_norm": 0.9695801138877869, + "learning_rate": 9.993854399634402e-05, + "loss": 0.1305, + "step": 11700 + }, + { + "epoch": 0.766110565914295, + "grad_norm": 1.1359174251556396, + "learning_rate": 9.993808784855823e-05, + "loss": 0.1135, + "step": 11710 + }, + { + "epoch": 0.7667648020935558, + "grad_norm": 0.8524360060691833, + "learning_rate": 9.9937630015231e-05, + "loss": 0.1204, + "step": 11720 + }, + { + "epoch": 0.7674190382728164, + "grad_norm": 1.0540283918380737, + "learning_rate": 9.993717049637779e-05, + "loss": 0.1255, + "step": 11730 + }, + { + "epoch": 0.7680732744520772, + "grad_norm": 0.8269158601760864, + "learning_rate": 9.99367092920141e-05, + "loss": 0.1145, + "step": 11740 + }, + { + "epoch": 0.7687275106313379, + "grad_norm": 0.9928538203239441, + "learning_rate": 9.993624640215552e-05, + "loss": 0.1245, + "step": 11750 + }, + { + "epoch": 0.7693817468105987, + "grad_norm": 0.9222886562347412, + "learning_rate": 9.993578182681767e-05, + "loss": 0.1098, + "step": 11760 + }, + { + "epoch": 0.7700359829898593, + "grad_norm": 1.1276291608810425, + "learning_rate": 9.993531556601621e-05, + "loss": 0.1114, + "step": 11770 + }, + { + "epoch": 0.7706902191691201, + "grad_norm": 0.9698314070701599, + "learning_rate": 9.993484761976688e-05, + "loss": 0.1103, + "step": 11780 + }, + { + "epoch": 0.7713444553483808, + "grad_norm": 1.0448344945907593, + "learning_rate": 9.993437798808549e-05, + "loss": 0.1034, + "step": 11790 + }, + { + "epoch": 0.7719986915276414, + "grad_norm": 0.7689130902290344, + "learning_rate": 9.99339066709879e-05, + "loss": 0.112, + "step": 11800 + }, + { + "epoch": 0.7726529277069022, + "grad_norm": 0.8706762790679932, + "learning_rate": 9.993343366849e-05, + "loss": 0.1238, + "step": 11810 + }, + { + "epoch": 0.7733071638861629, + "grad_norm": 0.9579262733459473, + "learning_rate": 9.993295898060775e-05, + "loss": 0.1092, + "step": 11820 + }, + { + "epoch": 0.7739614000654236, + "grad_norm": 0.9516409039497375, + "learning_rate": 9.993248260735717e-05, + "loss": 0.1148, + "step": 11830 + }, + { + "epoch": 0.7746156362446843, + "grad_norm": 0.7520780563354492, + "learning_rate": 9.993200454875436e-05, + "loss": 0.1155, + "step": 11840 + }, + { + "epoch": 0.7752698724239451, + "grad_norm": 0.8823644518852234, + "learning_rate": 9.993152480481545e-05, + "loss": 0.1093, + "step": 11850 + }, + { + "epoch": 0.7759241086032057, + "grad_norm": 1.0025229454040527, + "learning_rate": 9.993104337555663e-05, + "loss": 0.1207, + "step": 11860 + }, + { + "epoch": 0.7765783447824665, + "grad_norm": 0.9853441715240479, + "learning_rate": 9.993056026099415e-05, + "loss": 0.1264, + "step": 11870 + }, + { + "epoch": 0.7772325809617272, + "grad_norm": 0.8879414200782776, + "learning_rate": 9.99300754611443e-05, + "loss": 0.1252, + "step": 11880 + }, + { + "epoch": 0.777886817140988, + "grad_norm": 0.8338192105293274, + "learning_rate": 9.992958897602344e-05, + "loss": 0.1187, + "step": 11890 + }, + { + "epoch": 0.7785410533202486, + "grad_norm": 0.872117280960083, + "learning_rate": 9.992910080564803e-05, + "loss": 0.1105, + "step": 11900 + }, + { + "epoch": 0.7791952894995093, + "grad_norm": 0.9396425485610962, + "learning_rate": 9.992861095003454e-05, + "loss": 0.1131, + "step": 11910 + }, + { + "epoch": 0.7798495256787701, + "grad_norm": 1.1532872915267944, + "learning_rate": 9.992811940919946e-05, + "loss": 0.1247, + "step": 11920 + }, + { + "epoch": 0.7805037618580307, + "grad_norm": 1.065436601638794, + "learning_rate": 9.992762618315942e-05, + "loss": 0.127, + "step": 11930 + }, + { + "epoch": 0.7811579980372915, + "grad_norm": 0.9578902721405029, + "learning_rate": 9.992713127193106e-05, + "loss": 0.1106, + "step": 11940 + }, + { + "epoch": 0.7818122342165522, + "grad_norm": 0.8533861637115479, + "learning_rate": 9.992663467553108e-05, + "loss": 0.1244, + "step": 11950 + }, + { + "epoch": 0.7824664703958129, + "grad_norm": 0.9745835661888123, + "learning_rate": 9.992613639397624e-05, + "loss": 0.1104, + "step": 11960 + }, + { + "epoch": 0.7831207065750736, + "grad_norm": 0.8613539338111877, + "learning_rate": 9.992563642728335e-05, + "loss": 0.1044, + "step": 11970 + }, + { + "epoch": 0.7837749427543343, + "grad_norm": 1.0119600296020508, + "learning_rate": 9.992513477546931e-05, + "loss": 0.1181, + "step": 11980 + }, + { + "epoch": 0.784429178933595, + "grad_norm": 0.856393039226532, + "learning_rate": 9.992463143855102e-05, + "loss": 0.1172, + "step": 11990 + }, + { + "epoch": 0.7850834151128557, + "grad_norm": 0.8078387379646301, + "learning_rate": 9.992412641654551e-05, + "loss": 0.1332, + "step": 12000 + }, + { + "epoch": 0.7857376512921165, + "grad_norm": 0.818047046661377, + "learning_rate": 9.99236197094698e-05, + "loss": 0.1156, + "step": 12010 + }, + { + "epoch": 0.7863918874713771, + "grad_norm": 0.8972421288490295, + "learning_rate": 9.992311131734098e-05, + "loss": 0.1162, + "step": 12020 + }, + { + "epoch": 0.7870461236506379, + "grad_norm": 0.9026692509651184, + "learning_rate": 9.992260124017623e-05, + "loss": 0.1239, + "step": 12030 + }, + { + "epoch": 0.7877003598298986, + "grad_norm": 0.8012493252754211, + "learning_rate": 9.992208947799276e-05, + "loss": 0.1224, + "step": 12040 + }, + { + "epoch": 0.7883545960091594, + "grad_norm": 0.7508116960525513, + "learning_rate": 9.992157603080785e-05, + "loss": 0.1145, + "step": 12050 + }, + { + "epoch": 0.78900883218842, + "grad_norm": 0.9431008100509644, + "learning_rate": 9.992106089863883e-05, + "loss": 0.1209, + "step": 12060 + }, + { + "epoch": 0.7896630683676807, + "grad_norm": 0.9398839473724365, + "learning_rate": 9.992054408150307e-05, + "loss": 0.1195, + "step": 12070 + }, + { + "epoch": 0.7903173045469415, + "grad_norm": 0.8729955554008484, + "learning_rate": 9.992002557941804e-05, + "loss": 0.1151, + "step": 12080 + }, + { + "epoch": 0.7909715407262021, + "grad_norm": 0.8589168787002563, + "learning_rate": 9.991950539240122e-05, + "loss": 0.1228, + "step": 12090 + }, + { + "epoch": 0.7916257769054629, + "grad_norm": 0.7919473052024841, + "learning_rate": 9.991898352047016e-05, + "loss": 0.1113, + "step": 12100 + }, + { + "epoch": 0.7922800130847236, + "grad_norm": 0.8259612321853638, + "learning_rate": 9.99184599636425e-05, + "loss": 0.1143, + "step": 12110 + }, + { + "epoch": 0.7929342492639843, + "grad_norm": 0.9801688194274902, + "learning_rate": 9.99179347219359e-05, + "loss": 0.1148, + "step": 12120 + }, + { + "epoch": 0.793588485443245, + "grad_norm": 0.9544298648834229, + "learning_rate": 9.991740779536808e-05, + "loss": 0.1276, + "step": 12130 + }, + { + "epoch": 0.7942427216225058, + "grad_norm": 0.946922242641449, + "learning_rate": 9.991687918395686e-05, + "loss": 0.1174, + "step": 12140 + }, + { + "epoch": 0.7948969578017664, + "grad_norm": 1.27000892162323, + "learning_rate": 9.991634888772003e-05, + "loss": 0.1163, + "step": 12150 + }, + { + "epoch": 0.7955511939810271, + "grad_norm": 0.7371228933334351, + "learning_rate": 9.991581690667553e-05, + "loss": 0.117, + "step": 12160 + }, + { + "epoch": 0.7962054301602879, + "grad_norm": 0.8912835717201233, + "learning_rate": 9.99152832408413e-05, + "loss": 0.1134, + "step": 12170 + }, + { + "epoch": 0.7968596663395485, + "grad_norm": 0.8078799843788147, + "learning_rate": 9.991474789023534e-05, + "loss": 0.1217, + "step": 12180 + }, + { + "epoch": 0.7975139025188093, + "grad_norm": 0.8640681505203247, + "learning_rate": 9.991421085487573e-05, + "loss": 0.1029, + "step": 12190 + }, + { + "epoch": 0.79816813869807, + "grad_norm": 1.075920820236206, + "learning_rate": 9.991367213478062e-05, + "loss": 0.1155, + "step": 12200 + }, + { + "epoch": 0.7988223748773308, + "grad_norm": 1.051973581314087, + "learning_rate": 9.991313172996815e-05, + "loss": 0.1233, + "step": 12210 + }, + { + "epoch": 0.7994766110565914, + "grad_norm": 0.9649173617362976, + "learning_rate": 9.991258964045659e-05, + "loss": 0.1247, + "step": 12220 + }, + { + "epoch": 0.8001308472358521, + "grad_norm": 0.9157887697219849, + "learning_rate": 9.991204586626424e-05, + "loss": 0.1288, + "step": 12230 + }, + { + "epoch": 0.8007850834151129, + "grad_norm": 0.8497806787490845, + "learning_rate": 9.991150040740944e-05, + "loss": 0.1091, + "step": 12240 + }, + { + "epoch": 0.8014393195943735, + "grad_norm": 1.007943034172058, + "learning_rate": 9.99109532639106e-05, + "loss": 0.1104, + "step": 12250 + }, + { + "epoch": 0.8020935557736343, + "grad_norm": 0.8589887022972107, + "learning_rate": 9.991040443578618e-05, + "loss": 0.1219, + "step": 12260 + }, + { + "epoch": 0.802747791952895, + "grad_norm": 1.0197535753250122, + "learning_rate": 9.990985392305473e-05, + "loss": 0.1156, + "step": 12270 + }, + { + "epoch": 0.8034020281321557, + "grad_norm": 0.8951653242111206, + "learning_rate": 9.99093017257348e-05, + "loss": 0.111, + "step": 12280 + }, + { + "epoch": 0.8040562643114164, + "grad_norm": 1.0193089246749878, + "learning_rate": 9.990874784384506e-05, + "loss": 0.1139, + "step": 12290 + }, + { + "epoch": 0.8047105004906772, + "grad_norm": 0.8544113636016846, + "learning_rate": 9.990819227740418e-05, + "loss": 0.1092, + "step": 12300 + }, + { + "epoch": 0.8053647366699378, + "grad_norm": 0.9851740598678589, + "learning_rate": 9.990763502643094e-05, + "loss": 0.1163, + "step": 12310 + }, + { + "epoch": 0.8060189728491985, + "grad_norm": 1.2490851879119873, + "learning_rate": 9.990707609094412e-05, + "loss": 0.1177, + "step": 12320 + }, + { + "epoch": 0.8066732090284593, + "grad_norm": 0.8917890787124634, + "learning_rate": 9.990651547096259e-05, + "loss": 0.1218, + "step": 12330 + }, + { + "epoch": 0.80732744520772, + "grad_norm": 1.0979591608047485, + "learning_rate": 9.990595316650528e-05, + "loss": 0.114, + "step": 12340 + }, + { + "epoch": 0.8079816813869807, + "grad_norm": 0.89876788854599, + "learning_rate": 9.990538917759117e-05, + "loss": 0.1308, + "step": 12350 + }, + { + "epoch": 0.8086359175662414, + "grad_norm": 0.9439699053764343, + "learning_rate": 9.990482350423929e-05, + "loss": 0.1131, + "step": 12360 + }, + { + "epoch": 0.8092901537455022, + "grad_norm": 0.9236170053482056, + "learning_rate": 9.990425614646874e-05, + "loss": 0.114, + "step": 12370 + }, + { + "epoch": 0.8099443899247628, + "grad_norm": 1.0061261653900146, + "learning_rate": 9.990368710429866e-05, + "loss": 0.1216, + "step": 12380 + }, + { + "epoch": 0.8105986261040236, + "grad_norm": 0.9143770933151245, + "learning_rate": 9.990311637774827e-05, + "loss": 0.1159, + "step": 12390 + }, + { + "epoch": 0.8112528622832843, + "grad_norm": 0.8878925442695618, + "learning_rate": 9.990254396683683e-05, + "loss": 0.1078, + "step": 12400 + }, + { + "epoch": 0.8119070984625449, + "grad_norm": 1.0045684576034546, + "learning_rate": 9.990196987158364e-05, + "loss": 0.1112, + "step": 12410 + }, + { + "epoch": 0.8125613346418057, + "grad_norm": 0.7909841537475586, + "learning_rate": 9.990139409200812e-05, + "loss": 0.1122, + "step": 12420 + }, + { + "epoch": 0.8132155708210664, + "grad_norm": 0.8620671629905701, + "learning_rate": 9.990081662812966e-05, + "loss": 0.121, + "step": 12430 + }, + { + "epoch": 0.8138698070003271, + "grad_norm": 0.8675190210342407, + "learning_rate": 9.990023747996777e-05, + "loss": 0.1225, + "step": 12440 + }, + { + "epoch": 0.8145240431795878, + "grad_norm": 1.0223960876464844, + "learning_rate": 9.9899656647542e-05, + "loss": 0.1217, + "step": 12450 + }, + { + "epoch": 0.8151782793588486, + "grad_norm": 0.850107729434967, + "learning_rate": 9.989907413087196e-05, + "loss": 0.1081, + "step": 12460 + }, + { + "epoch": 0.8158325155381092, + "grad_norm": 0.8945133090019226, + "learning_rate": 9.98984899299773e-05, + "loss": 0.1133, + "step": 12470 + }, + { + "epoch": 0.81648675171737, + "grad_norm": 0.8991668820381165, + "learning_rate": 9.989790404487773e-05, + "loss": 0.1104, + "step": 12480 + }, + { + "epoch": 0.8171409878966307, + "grad_norm": 0.7645007967948914, + "learning_rate": 9.989731647559304e-05, + "loss": 0.1179, + "step": 12490 + }, + { + "epoch": 0.8177952240758914, + "grad_norm": 0.7892476916313171, + "learning_rate": 9.989672722214307e-05, + "loss": 0.1097, + "step": 12500 + }, + { + "epoch": 0.8184494602551521, + "grad_norm": 0.882279098033905, + "learning_rate": 9.989613628454769e-05, + "loss": 0.1278, + "step": 12510 + }, + { + "epoch": 0.8191036964344128, + "grad_norm": 1.140187382698059, + "learning_rate": 9.989554366282684e-05, + "loss": 0.1163, + "step": 12520 + }, + { + "epoch": 0.8197579326136736, + "grad_norm": 0.8200512528419495, + "learning_rate": 9.989494935700054e-05, + "loss": 0.1209, + "step": 12530 + }, + { + "epoch": 0.8204121687929342, + "grad_norm": 0.9112234711647034, + "learning_rate": 9.989435336708886e-05, + "loss": 0.1156, + "step": 12540 + }, + { + "epoch": 0.821066404972195, + "grad_norm": 0.7766229510307312, + "learning_rate": 9.98937556931119e-05, + "loss": 0.1101, + "step": 12550 + }, + { + "epoch": 0.8217206411514557, + "grad_norm": 1.0036638975143433, + "learning_rate": 9.989315633508983e-05, + "loss": 0.1233, + "step": 12560 + }, + { + "epoch": 0.8223748773307163, + "grad_norm": 0.9545724391937256, + "learning_rate": 9.989255529304287e-05, + "loss": 0.1198, + "step": 12570 + }, + { + "epoch": 0.8230291135099771, + "grad_norm": 0.8219319581985474, + "learning_rate": 9.989195256699133e-05, + "loss": 0.1183, + "step": 12580 + }, + { + "epoch": 0.8236833496892378, + "grad_norm": 0.7603002190589905, + "learning_rate": 9.989134815695556e-05, + "loss": 0.1088, + "step": 12590 + }, + { + "epoch": 0.8243375858684985, + "grad_norm": 0.7731127738952637, + "learning_rate": 9.989074206295592e-05, + "loss": 0.1181, + "step": 12600 + }, + { + "epoch": 0.8249918220477592, + "grad_norm": 1.0608632564544678, + "learning_rate": 9.98901342850129e-05, + "loss": 0.1248, + "step": 12610 + }, + { + "epoch": 0.82564605822702, + "grad_norm": 0.9366574287414551, + "learning_rate": 9.988952482314702e-05, + "loss": 0.1197, + "step": 12620 + }, + { + "epoch": 0.8263002944062807, + "grad_norm": 0.8143541812896729, + "learning_rate": 9.988891367737882e-05, + "loss": 0.1033, + "step": 12630 + }, + { + "epoch": 0.8269545305855414, + "grad_norm": 1.157392144203186, + "learning_rate": 9.988830084772896e-05, + "loss": 0.1086, + "step": 12640 + }, + { + "epoch": 0.8276087667648021, + "grad_norm": 0.9264316558837891, + "learning_rate": 9.98876863342181e-05, + "loss": 0.11, + "step": 12650 + }, + { + "epoch": 0.8282630029440629, + "grad_norm": 0.9766997694969177, + "learning_rate": 9.988707013686698e-05, + "loss": 0.11, + "step": 12660 + }, + { + "epoch": 0.8289172391233235, + "grad_norm": 0.9637208580970764, + "learning_rate": 9.988645225569643e-05, + "loss": 0.1112, + "step": 12670 + }, + { + "epoch": 0.8295714753025842, + "grad_norm": 0.9603047370910645, + "learning_rate": 9.988583269072727e-05, + "loss": 0.1092, + "step": 12680 + }, + { + "epoch": 0.830225711481845, + "grad_norm": 0.8850484490394592, + "learning_rate": 9.988521144198043e-05, + "loss": 0.1286, + "step": 12690 + }, + { + "epoch": 0.8308799476611056, + "grad_norm": 1.1651179790496826, + "learning_rate": 9.988458850947689e-05, + "loss": 0.1156, + "step": 12700 + }, + { + "epoch": 0.8315341838403664, + "grad_norm": 1.13690185546875, + "learning_rate": 9.988396389323764e-05, + "loss": 0.1165, + "step": 12710 + }, + { + "epoch": 0.8321884200196271, + "grad_norm": 0.8526743054389954, + "learning_rate": 9.988333759328379e-05, + "loss": 0.1144, + "step": 12720 + }, + { + "epoch": 0.8328426561988878, + "grad_norm": 0.8829309344291687, + "learning_rate": 9.988270960963648e-05, + "loss": 0.1118, + "step": 12730 + }, + { + "epoch": 0.8334968923781485, + "grad_norm": 0.8683731555938721, + "learning_rate": 9.988207994231689e-05, + "loss": 0.1123, + "step": 12740 + }, + { + "epoch": 0.8341511285574092, + "grad_norm": 0.9223302602767944, + "learning_rate": 9.988144859134627e-05, + "loss": 0.1257, + "step": 12750 + }, + { + "epoch": 0.8348053647366699, + "grad_norm": 1.061397910118103, + "learning_rate": 9.988081555674596e-05, + "loss": 0.125, + "step": 12760 + }, + { + "epoch": 0.8354596009159306, + "grad_norm": 1.0193485021591187, + "learning_rate": 9.98801808385373e-05, + "loss": 0.1168, + "step": 12770 + }, + { + "epoch": 0.8361138370951914, + "grad_norm": 0.9661241769790649, + "learning_rate": 9.987954443674173e-05, + "loss": 0.1085, + "step": 12780 + }, + { + "epoch": 0.8367680732744521, + "grad_norm": 0.9959526658058167, + "learning_rate": 9.98789063513807e-05, + "loss": 0.1082, + "step": 12790 + }, + { + "epoch": 0.8374223094537128, + "grad_norm": 0.7342641353607178, + "learning_rate": 9.987826658247579e-05, + "loss": 0.1187, + "step": 12800 + }, + { + "epoch": 0.8380765456329735, + "grad_norm": 0.9340495467185974, + "learning_rate": 9.987762513004856e-05, + "loss": 0.1022, + "step": 12810 + }, + { + "epoch": 0.8387307818122343, + "grad_norm": 0.9463670253753662, + "learning_rate": 9.98769819941207e-05, + "loss": 0.1246, + "step": 12820 + }, + { + "epoch": 0.8393850179914949, + "grad_norm": 0.8426292538642883, + "learning_rate": 9.987633717471385e-05, + "loss": 0.1077, + "step": 12830 + }, + { + "epoch": 0.8400392541707556, + "grad_norm": 1.0582283735275269, + "learning_rate": 9.987569067184983e-05, + "loss": 0.1215, + "step": 12840 + }, + { + "epoch": 0.8406934903500164, + "grad_norm": 0.9140653610229492, + "learning_rate": 9.987504248555047e-05, + "loss": 0.1193, + "step": 12850 + }, + { + "epoch": 0.841347726529277, + "grad_norm": 0.843436598777771, + "learning_rate": 9.98743926158376e-05, + "loss": 0.1243, + "step": 12860 + }, + { + "epoch": 0.8420019627085378, + "grad_norm": 0.8201360702514648, + "learning_rate": 9.987374106273318e-05, + "loss": 0.1106, + "step": 12870 + }, + { + "epoch": 0.8426561988877985, + "grad_norm": 1.0431125164031982, + "learning_rate": 9.987308782625919e-05, + "loss": 0.1182, + "step": 12880 + }, + { + "epoch": 0.8433104350670592, + "grad_norm": 0.8388211727142334, + "learning_rate": 9.98724329064377e-05, + "loss": 0.1093, + "step": 12890 + }, + { + "epoch": 0.8439646712463199, + "grad_norm": 0.8879048228263855, + "learning_rate": 9.987177630329081e-05, + "loss": 0.1004, + "step": 12900 + }, + { + "epoch": 0.8446189074255807, + "grad_norm": 0.857945442199707, + "learning_rate": 9.987111801684068e-05, + "loss": 0.1107, + "step": 12910 + }, + { + "epoch": 0.8452731436048413, + "grad_norm": 0.7848942875862122, + "learning_rate": 9.987045804710951e-05, + "loss": 0.1112, + "step": 12920 + }, + { + "epoch": 0.845927379784102, + "grad_norm": 0.8898484706878662, + "learning_rate": 9.98697963941196e-05, + "loss": 0.114, + "step": 12930 + }, + { + "epoch": 0.8465816159633628, + "grad_norm": 1.1552084684371948, + "learning_rate": 9.986913305789328e-05, + "loss": 0.1143, + "step": 12940 + }, + { + "epoch": 0.8472358521426235, + "grad_norm": 0.8362104296684265, + "learning_rate": 9.986846803845291e-05, + "loss": 0.1142, + "step": 12950 + }, + { + "epoch": 0.8478900883218842, + "grad_norm": 1.0865123271942139, + "learning_rate": 9.9867801335821e-05, + "loss": 0.1147, + "step": 12960 + }, + { + "epoch": 0.8485443245011449, + "grad_norm": 0.8024462461471558, + "learning_rate": 9.986713295001997e-05, + "loss": 0.1101, + "step": 12970 + }, + { + "epoch": 0.8491985606804057, + "grad_norm": 0.960501492023468, + "learning_rate": 9.986646288107243e-05, + "loss": 0.1215, + "step": 12980 + }, + { + "epoch": 0.8498527968596663, + "grad_norm": 0.7977132797241211, + "learning_rate": 9.9865791129001e-05, + "loss": 0.1175, + "step": 12990 + }, + { + "epoch": 0.850507033038927, + "grad_norm": 0.913974404335022, + "learning_rate": 9.986511769382834e-05, + "loss": 0.1115, + "step": 13000 + }, + { + "epoch": 0.8511612692181878, + "grad_norm": 0.86736661195755, + "learning_rate": 9.986444257557717e-05, + "loss": 0.1113, + "step": 13010 + }, + { + "epoch": 0.8518155053974484, + "grad_norm": 0.8728740811347961, + "learning_rate": 9.98637657742703e-05, + "loss": 0.1113, + "step": 13020 + }, + { + "epoch": 0.8524697415767092, + "grad_norm": 0.8732495903968811, + "learning_rate": 9.986308728993056e-05, + "loss": 0.1129, + "step": 13030 + }, + { + "epoch": 0.8531239777559699, + "grad_norm": 0.9430647492408752, + "learning_rate": 9.986240712258085e-05, + "loss": 0.1065, + "step": 13040 + }, + { + "epoch": 0.8537782139352306, + "grad_norm": 0.849603533744812, + "learning_rate": 9.986172527224413e-05, + "loss": 0.1198, + "step": 13050 + }, + { + "epoch": 0.8544324501144913, + "grad_norm": 0.75860595703125, + "learning_rate": 9.986104173894342e-05, + "loss": 0.1063, + "step": 13060 + }, + { + "epoch": 0.8550866862937521, + "grad_norm": 0.916583240032196, + "learning_rate": 9.986035652270178e-05, + "loss": 0.1179, + "step": 13070 + }, + { + "epoch": 0.8557409224730128, + "grad_norm": 0.7773154377937317, + "learning_rate": 9.985966962354235e-05, + "loss": 0.1102, + "step": 13080 + }, + { + "epoch": 0.8563951586522734, + "grad_norm": 0.7246621251106262, + "learning_rate": 9.985898104148831e-05, + "loss": 0.1108, + "step": 13090 + }, + { + "epoch": 0.8570493948315342, + "grad_norm": 0.7535557746887207, + "learning_rate": 9.985829077656291e-05, + "loss": 0.1099, + "step": 13100 + }, + { + "epoch": 0.8577036310107949, + "grad_norm": 0.8509471416473389, + "learning_rate": 9.98575988287894e-05, + "loss": 0.0995, + "step": 13110 + }, + { + "epoch": 0.8583578671900556, + "grad_norm": 0.7079533934593201, + "learning_rate": 9.98569051981912e-05, + "loss": 0.1189, + "step": 13120 + }, + { + "epoch": 0.8590121033693163, + "grad_norm": 0.9748969674110413, + "learning_rate": 9.985620988479169e-05, + "loss": 0.1168, + "step": 13130 + }, + { + "epoch": 0.8596663395485771, + "grad_norm": 0.9905074834823608, + "learning_rate": 9.985551288861435e-05, + "loss": 0.1155, + "step": 13140 + }, + { + "epoch": 0.8603205757278377, + "grad_norm": 0.7859392762184143, + "learning_rate": 9.98548142096827e-05, + "loss": 0.1111, + "step": 13150 + }, + { + "epoch": 0.8609748119070985, + "grad_norm": 0.7785911560058594, + "learning_rate": 9.985411384802031e-05, + "loss": 0.1159, + "step": 13160 + }, + { + "epoch": 0.8616290480863592, + "grad_norm": 1.0877043008804321, + "learning_rate": 9.985341180365084e-05, + "loss": 0.1197, + "step": 13170 + }, + { + "epoch": 0.8622832842656198, + "grad_norm": 0.8900846838951111, + "learning_rate": 9.985270807659798e-05, + "loss": 0.1181, + "step": 13180 + }, + { + "epoch": 0.8629375204448806, + "grad_norm": 0.7488992810249329, + "learning_rate": 9.985200266688546e-05, + "loss": 0.1058, + "step": 13190 + }, + { + "epoch": 0.8635917566241413, + "grad_norm": 0.7488667964935303, + "learning_rate": 9.985129557453713e-05, + "loss": 0.1136, + "step": 13200 + }, + { + "epoch": 0.864245992803402, + "grad_norm": 1.060916543006897, + "learning_rate": 9.985058679957681e-05, + "loss": 0.1169, + "step": 13210 + }, + { + "epoch": 0.8649002289826627, + "grad_norm": 1.009759783744812, + "learning_rate": 9.984987634202847e-05, + "loss": 0.1154, + "step": 13220 + }, + { + "epoch": 0.8655544651619235, + "grad_norm": 1.0468559265136719, + "learning_rate": 9.984916420191607e-05, + "loss": 0.1205, + "step": 13230 + }, + { + "epoch": 0.8662087013411842, + "grad_norm": 0.9158945083618164, + "learning_rate": 9.984845037926362e-05, + "loss": 0.1076, + "step": 13240 + }, + { + "epoch": 0.8668629375204449, + "grad_norm": 0.893125057220459, + "learning_rate": 9.984773487409527e-05, + "loss": 0.1189, + "step": 13250 + }, + { + "epoch": 0.8675171736997056, + "grad_norm": 0.9089499711990356, + "learning_rate": 9.984701768643512e-05, + "loss": 0.1033, + "step": 13260 + }, + { + "epoch": 0.8681714098789663, + "grad_norm": 0.9404613375663757, + "learning_rate": 9.984629881630738e-05, + "loss": 0.1225, + "step": 13270 + }, + { + "epoch": 0.868825646058227, + "grad_norm": 0.8523469567298889, + "learning_rate": 9.984557826373635e-05, + "loss": 0.1146, + "step": 13280 + }, + { + "epoch": 0.8694798822374877, + "grad_norm": 0.9582688212394714, + "learning_rate": 9.984485602874632e-05, + "loss": 0.1278, + "step": 13290 + }, + { + "epoch": 0.8701341184167485, + "grad_norm": 0.8448380827903748, + "learning_rate": 9.984413211136167e-05, + "loss": 0.1224, + "step": 13300 + }, + { + "epoch": 0.8707883545960091, + "grad_norm": 0.8751958608627319, + "learning_rate": 9.984340651160685e-05, + "loss": 0.1088, + "step": 13310 + }, + { + "epoch": 0.8714425907752699, + "grad_norm": 0.7682243585586548, + "learning_rate": 9.984267922950634e-05, + "loss": 0.1048, + "step": 13320 + }, + { + "epoch": 0.8720968269545306, + "grad_norm": 0.9508857727050781, + "learning_rate": 9.984195026508469e-05, + "loss": 0.1124, + "step": 13330 + }, + { + "epoch": 0.8727510631337912, + "grad_norm": 1.0506585836410522, + "learning_rate": 9.98412196183665e-05, + "loss": 0.1266, + "step": 13340 + }, + { + "epoch": 0.873405299313052, + "grad_norm": 1.0263482332229614, + "learning_rate": 9.984048728937643e-05, + "loss": 0.1313, + "step": 13350 + }, + { + "epoch": 0.8740595354923127, + "grad_norm": 0.8746235370635986, + "learning_rate": 9.98397532781392e-05, + "loss": 0.1066, + "step": 13360 + }, + { + "epoch": 0.8747137716715735, + "grad_norm": 0.797562301158905, + "learning_rate": 9.98390175846796e-05, + "loss": 0.1052, + "step": 13370 + }, + { + "epoch": 0.8753680078508341, + "grad_norm": 1.0147572755813599, + "learning_rate": 9.983828020902244e-05, + "loss": 0.1073, + "step": 13380 + }, + { + "epoch": 0.8760222440300949, + "grad_norm": 1.0348169803619385, + "learning_rate": 9.983754115119261e-05, + "loss": 0.1168, + "step": 13390 + }, + { + "epoch": 0.8766764802093556, + "grad_norm": 0.8402387499809265, + "learning_rate": 9.983680041121509e-05, + "loss": 0.1213, + "step": 13400 + }, + { + "epoch": 0.8773307163886163, + "grad_norm": 0.8802344799041748, + "learning_rate": 9.983605798911484e-05, + "loss": 0.1121, + "step": 13410 + }, + { + "epoch": 0.877984952567877, + "grad_norm": 0.7464808821678162, + "learning_rate": 9.983531388491691e-05, + "loss": 0.1184, + "step": 13420 + }, + { + "epoch": 0.8786391887471378, + "grad_norm": 0.8094585537910461, + "learning_rate": 9.983456809864646e-05, + "loss": 0.1138, + "step": 13430 + }, + { + "epoch": 0.8792934249263984, + "grad_norm": 0.7302922010421753, + "learning_rate": 9.983382063032864e-05, + "loss": 0.1124, + "step": 13440 + }, + { + "epoch": 0.8799476611056591, + "grad_norm": 0.8307011723518372, + "learning_rate": 9.983307147998868e-05, + "loss": 0.1155, + "step": 13450 + }, + { + "epoch": 0.8806018972849199, + "grad_norm": 0.8752557635307312, + "learning_rate": 9.983232064765187e-05, + "loss": 0.1097, + "step": 13460 + }, + { + "epoch": 0.8812561334641805, + "grad_norm": 0.9539555907249451, + "learning_rate": 9.983156813334354e-05, + "loss": 0.1194, + "step": 13470 + }, + { + "epoch": 0.8819103696434413, + "grad_norm": 0.9456961750984192, + "learning_rate": 9.983081393708911e-05, + "loss": 0.1129, + "step": 13480 + }, + { + "epoch": 0.882564605822702, + "grad_norm": 1.089903473854065, + "learning_rate": 9.983005805891401e-05, + "loss": 0.1114, + "step": 13490 + }, + { + "epoch": 0.8832188420019627, + "grad_norm": 0.9316903948783875, + "learning_rate": 9.982930049884377e-05, + "loss": 0.1132, + "step": 13500 + }, + { + "epoch": 0.8838730781812234, + "grad_norm": 0.8528069853782654, + "learning_rate": 9.982854125690395e-05, + "loss": 0.1188, + "step": 13510 + }, + { + "epoch": 0.8845273143604842, + "grad_norm": 0.8563151955604553, + "learning_rate": 9.982778033312019e-05, + "loss": 0.1122, + "step": 13520 + }, + { + "epoch": 0.8851815505397449, + "grad_norm": 0.7810546159744263, + "learning_rate": 9.982701772751816e-05, + "loss": 0.1076, + "step": 13530 + }, + { + "epoch": 0.8858357867190055, + "grad_norm": 0.7881696820259094, + "learning_rate": 9.982625344012361e-05, + "loss": 0.1293, + "step": 13540 + }, + { + "epoch": 0.8864900228982663, + "grad_norm": 0.7264038324356079, + "learning_rate": 9.982548747096235e-05, + "loss": 0.1156, + "step": 13550 + }, + { + "epoch": 0.887144259077527, + "grad_norm": 1.067252516746521, + "learning_rate": 9.982471982006019e-05, + "loss": 0.1142, + "step": 13560 + }, + { + "epoch": 0.8877984952567877, + "grad_norm": 0.9019017815589905, + "learning_rate": 9.982395048744307e-05, + "loss": 0.1066, + "step": 13570 + }, + { + "epoch": 0.8884527314360484, + "grad_norm": 1.020104169845581, + "learning_rate": 9.982317947313695e-05, + "loss": 0.1193, + "step": 13580 + }, + { + "epoch": 0.8891069676153092, + "grad_norm": 0.8307511806488037, + "learning_rate": 9.982240677716788e-05, + "loss": 0.1249, + "step": 13590 + }, + { + "epoch": 0.8897612037945698, + "grad_norm": 0.9394966959953308, + "learning_rate": 9.98216323995619e-05, + "loss": 0.1158, + "step": 13600 + }, + { + "epoch": 0.8904154399738305, + "grad_norm": 0.9355982542037964, + "learning_rate": 9.982085634034515e-05, + "loss": 0.1243, + "step": 13610 + }, + { + "epoch": 0.8910696761530913, + "grad_norm": 0.9257426857948303, + "learning_rate": 9.982007859954386e-05, + "loss": 0.1079, + "step": 13620 + }, + { + "epoch": 0.8917239123323519, + "grad_norm": 1.074717402458191, + "learning_rate": 9.981929917718426e-05, + "loss": 0.1144, + "step": 13630 + }, + { + "epoch": 0.8923781485116127, + "grad_norm": 0.8805826902389526, + "learning_rate": 9.981851807329264e-05, + "loss": 0.117, + "step": 13640 + }, + { + "epoch": 0.8930323846908734, + "grad_norm": 1.0279000997543335, + "learning_rate": 9.98177352878954e-05, + "loss": 0.1132, + "step": 13650 + }, + { + "epoch": 0.8936866208701341, + "grad_norm": 1.029099941253662, + "learning_rate": 9.981695082101893e-05, + "loss": 0.1125, + "step": 13660 + }, + { + "epoch": 0.8943408570493948, + "grad_norm": 1.044093132019043, + "learning_rate": 9.981616467268973e-05, + "loss": 0.1163, + "step": 13670 + }, + { + "epoch": 0.8949950932286556, + "grad_norm": 0.9173268675804138, + "learning_rate": 9.98153768429343e-05, + "loss": 0.1133, + "step": 13680 + }, + { + "epoch": 0.8956493294079163, + "grad_norm": 0.917991578578949, + "learning_rate": 9.981458733177928e-05, + "loss": 0.1043, + "step": 13690 + }, + { + "epoch": 0.8963035655871769, + "grad_norm": 0.9324220418930054, + "learning_rate": 9.981379613925129e-05, + "loss": 0.1145, + "step": 13700 + }, + { + "epoch": 0.8969578017664377, + "grad_norm": 0.9457996487617493, + "learning_rate": 9.981300326537704e-05, + "loss": 0.1145, + "step": 13710 + }, + { + "epoch": 0.8976120379456984, + "grad_norm": 0.9472202062606812, + "learning_rate": 9.981220871018329e-05, + "loss": 0.1176, + "step": 13720 + }, + { + "epoch": 0.8982662741249591, + "grad_norm": 0.8660651445388794, + "learning_rate": 9.981141247369685e-05, + "loss": 0.1161, + "step": 13730 + }, + { + "epoch": 0.8989205103042198, + "grad_norm": 0.9130904674530029, + "learning_rate": 9.981061455594461e-05, + "loss": 0.1095, + "step": 13740 + }, + { + "epoch": 0.8995747464834806, + "grad_norm": 1.0127148628234863, + "learning_rate": 9.980981495695349e-05, + "loss": 0.1115, + "step": 13750 + }, + { + "epoch": 0.9002289826627412, + "grad_norm": 0.8180867433547974, + "learning_rate": 9.980901367675048e-05, + "loss": 0.1051, + "step": 13760 + }, + { + "epoch": 0.900883218842002, + "grad_norm": 0.8807599544525146, + "learning_rate": 9.980821071536264e-05, + "loss": 0.1317, + "step": 13770 + }, + { + "epoch": 0.9015374550212627, + "grad_norm": 0.8612924218177795, + "learning_rate": 9.980740607281707e-05, + "loss": 0.1147, + "step": 13780 + }, + { + "epoch": 0.9021916912005233, + "grad_norm": 0.9025987386703491, + "learning_rate": 9.980659974914091e-05, + "loss": 0.1103, + "step": 13790 + }, + { + "epoch": 0.9028459273797841, + "grad_norm": 0.8930261731147766, + "learning_rate": 9.980579174436138e-05, + "loss": 0.1101, + "step": 13800 + }, + { + "epoch": 0.9035001635590448, + "grad_norm": 0.8868409395217896, + "learning_rate": 9.980498205850577e-05, + "loss": 0.1085, + "step": 13810 + }, + { + "epoch": 0.9041543997383056, + "grad_norm": 0.7958750128746033, + "learning_rate": 9.980417069160139e-05, + "loss": 0.1128, + "step": 13820 + }, + { + "epoch": 0.9048086359175662, + "grad_norm": 1.0320643186569214, + "learning_rate": 9.980335764367563e-05, + "loss": 0.1179, + "step": 13830 + }, + { + "epoch": 0.905462872096827, + "grad_norm": 0.9353842735290527, + "learning_rate": 9.980254291475595e-05, + "loss": 0.1012, + "step": 13840 + }, + { + "epoch": 0.9061171082760877, + "grad_norm": 0.8081122636795044, + "learning_rate": 9.980172650486983e-05, + "loss": 0.1129, + "step": 13850 + }, + { + "epoch": 0.9067713444553483, + "grad_norm": 0.7655879259109497, + "learning_rate": 9.980090841404482e-05, + "loss": 0.1043, + "step": 13860 + }, + { + "epoch": 0.9074255806346091, + "grad_norm": 0.886113703250885, + "learning_rate": 9.980008864230854e-05, + "loss": 0.1072, + "step": 13870 + }, + { + "epoch": 0.9080798168138698, + "grad_norm": 0.9072959423065186, + "learning_rate": 9.979926718968868e-05, + "loss": 0.1132, + "step": 13880 + }, + { + "epoch": 0.9087340529931305, + "grad_norm": 1.0498781204223633, + "learning_rate": 9.979844405621295e-05, + "loss": 0.1179, + "step": 13890 + }, + { + "epoch": 0.9093882891723912, + "grad_norm": 0.9316992163658142, + "learning_rate": 9.979761924190911e-05, + "loss": 0.1127, + "step": 13900 + }, + { + "epoch": 0.910042525351652, + "grad_norm": 0.9296565055847168, + "learning_rate": 9.979679274680504e-05, + "loss": 0.11, + "step": 13910 + }, + { + "epoch": 0.9106967615309126, + "grad_norm": 1.0889675617218018, + "learning_rate": 9.979596457092861e-05, + "loss": 0.1027, + "step": 13920 + }, + { + "epoch": 0.9113509977101734, + "grad_norm": 0.742987871170044, + "learning_rate": 9.979513471430779e-05, + "loss": 0.1047, + "step": 13930 + }, + { + "epoch": 0.9120052338894341, + "grad_norm": 0.8663235902786255, + "learning_rate": 9.979430317697056e-05, + "loss": 0.1226, + "step": 13940 + }, + { + "epoch": 0.9126594700686947, + "grad_norm": 1.0001239776611328, + "learning_rate": 9.979346995894504e-05, + "loss": 0.1174, + "step": 13950 + }, + { + "epoch": 0.9133137062479555, + "grad_norm": 1.0531165599822998, + "learning_rate": 9.979263506025929e-05, + "loss": 0.1148, + "step": 13960 + }, + { + "epoch": 0.9139679424272162, + "grad_norm": 0.8810727000236511, + "learning_rate": 9.979179848094153e-05, + "loss": 0.1153, + "step": 13970 + }, + { + "epoch": 0.914622178606477, + "grad_norm": 0.8835552930831909, + "learning_rate": 9.979096022102e-05, + "loss": 0.1089, + "step": 13980 + }, + { + "epoch": 0.9152764147857376, + "grad_norm": 0.9073536396026611, + "learning_rate": 9.979012028052297e-05, + "loss": 0.1254, + "step": 13990 + }, + { + "epoch": 0.9159306509649984, + "grad_norm": 0.8263078331947327, + "learning_rate": 9.97892786594788e-05, + "loss": 0.1104, + "step": 14000 + }, + { + "epoch": 0.9165848871442591, + "grad_norm": 0.9409777522087097, + "learning_rate": 9.978843535791588e-05, + "loss": 0.1119, + "step": 14010 + }, + { + "epoch": 0.9172391233235198, + "grad_norm": 0.8402562737464905, + "learning_rate": 9.978759037586272e-05, + "loss": 0.1134, + "step": 14020 + }, + { + "epoch": 0.9178933595027805, + "grad_norm": 0.8777570724487305, + "learning_rate": 9.978674371334782e-05, + "loss": 0.1077, + "step": 14030 + }, + { + "epoch": 0.9185475956820413, + "grad_norm": 1.0120798349380493, + "learning_rate": 9.978589537039972e-05, + "loss": 0.1179, + "step": 14040 + }, + { + "epoch": 0.9192018318613019, + "grad_norm": 0.7693895101547241, + "learning_rate": 9.97850453470471e-05, + "loss": 0.1248, + "step": 14050 + }, + { + "epoch": 0.9198560680405626, + "grad_norm": 0.8541615009307861, + "learning_rate": 9.978419364331863e-05, + "loss": 0.1226, + "step": 14060 + }, + { + "epoch": 0.9205103042198234, + "grad_norm": 1.0362035036087036, + "learning_rate": 9.978334025924307e-05, + "loss": 0.1274, + "step": 14070 + }, + { + "epoch": 0.921164540399084, + "grad_norm": 0.9056493639945984, + "learning_rate": 9.97824851948492e-05, + "loss": 0.1163, + "step": 14080 + }, + { + "epoch": 0.9218187765783448, + "grad_norm": 0.9539541602134705, + "learning_rate": 9.97816284501659e-05, + "loss": 0.1117, + "step": 14090 + }, + { + "epoch": 0.9224730127576055, + "grad_norm": 0.8763441443443298, + "learning_rate": 9.978077002522208e-05, + "loss": 0.1187, + "step": 14100 + }, + { + "epoch": 0.9231272489368663, + "grad_norm": 1.0391874313354492, + "learning_rate": 9.977990992004672e-05, + "loss": 0.1131, + "step": 14110 + }, + { + "epoch": 0.9237814851161269, + "grad_norm": 0.8817654252052307, + "learning_rate": 9.977904813466885e-05, + "loss": 0.1106, + "step": 14120 + }, + { + "epoch": 0.9244357212953876, + "grad_norm": 0.7447488903999329, + "learning_rate": 9.977818466911754e-05, + "loss": 0.1042, + "step": 14130 + }, + { + "epoch": 0.9250899574746484, + "grad_norm": 0.9143378734588623, + "learning_rate": 9.977731952342198e-05, + "loss": 0.1299, + "step": 14140 + }, + { + "epoch": 0.925744193653909, + "grad_norm": 0.9226405024528503, + "learning_rate": 9.97764526976113e-05, + "loss": 0.1058, + "step": 14150 + }, + { + "epoch": 0.9263984298331698, + "grad_norm": 0.7661489844322205, + "learning_rate": 9.977558419171485e-05, + "loss": 0.1109, + "step": 14160 + }, + { + "epoch": 0.9270526660124305, + "grad_norm": 0.8209551572799683, + "learning_rate": 9.977471400576185e-05, + "loss": 0.1158, + "step": 14170 + }, + { + "epoch": 0.9277069021916912, + "grad_norm": 0.9053903818130493, + "learning_rate": 9.977384213978173e-05, + "loss": 0.1041, + "step": 14180 + }, + { + "epoch": 0.9283611383709519, + "grad_norm": 0.9400367140769958, + "learning_rate": 9.97729685938039e-05, + "loss": 0.1077, + "step": 14190 + }, + { + "epoch": 0.9290153745502127, + "grad_norm": 0.9870936274528503, + "learning_rate": 9.977209336785783e-05, + "loss": 0.1053, + "step": 14200 + }, + { + "epoch": 0.9296696107294733, + "grad_norm": 0.9150540828704834, + "learning_rate": 9.977121646197309e-05, + "loss": 0.1115, + "step": 14210 + }, + { + "epoch": 0.930323846908734, + "grad_norm": 1.1972026824951172, + "learning_rate": 9.977033787617927e-05, + "loss": 0.1049, + "step": 14220 + }, + { + "epoch": 0.9309780830879948, + "grad_norm": 0.9222517013549805, + "learning_rate": 9.9769457610506e-05, + "loss": 0.1094, + "step": 14230 + }, + { + "epoch": 0.9316323192672554, + "grad_norm": 0.763886034488678, + "learning_rate": 9.976857566498303e-05, + "loss": 0.1083, + "step": 14240 + }, + { + "epoch": 0.9322865554465162, + "grad_norm": 0.8632875084877014, + "learning_rate": 9.976769203964011e-05, + "loss": 0.1082, + "step": 14250 + }, + { + "epoch": 0.9329407916257769, + "grad_norm": 1.0797144174575806, + "learning_rate": 9.976680673450704e-05, + "loss": 0.1119, + "step": 14260 + }, + { + "epoch": 0.9335950278050377, + "grad_norm": 0.7785837054252625, + "learning_rate": 9.976591974961376e-05, + "loss": 0.1061, + "step": 14270 + }, + { + "epoch": 0.9342492639842983, + "grad_norm": 0.9709704518318176, + "learning_rate": 9.976503108499014e-05, + "loss": 0.1245, + "step": 14280 + }, + { + "epoch": 0.934903500163559, + "grad_norm": 0.8645631670951843, + "learning_rate": 9.976414074066622e-05, + "loss": 0.1041, + "step": 14290 + }, + { + "epoch": 0.9355577363428198, + "grad_norm": 0.8737794756889343, + "learning_rate": 9.976324871667204e-05, + "loss": 0.1068, + "step": 14300 + }, + { + "epoch": 0.9362119725220804, + "grad_norm": 0.9471864104270935, + "learning_rate": 9.97623550130377e-05, + "loss": 0.1132, + "step": 14310 + }, + { + "epoch": 0.9368662087013412, + "grad_norm": 0.9491965770721436, + "learning_rate": 9.976145962979337e-05, + "loss": 0.1086, + "step": 14320 + }, + { + "epoch": 0.9375204448806019, + "grad_norm": 1.008070468902588, + "learning_rate": 9.976056256696928e-05, + "loss": 0.1133, + "step": 14330 + }, + { + "epoch": 0.9381746810598626, + "grad_norm": 0.8952319025993347, + "learning_rate": 9.975966382459572e-05, + "loss": 0.1053, + "step": 14340 + }, + { + "epoch": 0.9388289172391233, + "grad_norm": 0.8078186511993408, + "learning_rate": 9.975876340270298e-05, + "loss": 0.1079, + "step": 14350 + }, + { + "epoch": 0.9394831534183841, + "grad_norm": 1.273657202720642, + "learning_rate": 9.975786130132148e-05, + "loss": 0.1111, + "step": 14360 + }, + { + "epoch": 0.9401373895976447, + "grad_norm": 0.9750173091888428, + "learning_rate": 9.975695752048168e-05, + "loss": 0.1207, + "step": 14370 + }, + { + "epoch": 0.9407916257769054, + "grad_norm": 0.7701619267463684, + "learning_rate": 9.975605206021406e-05, + "loss": 0.1098, + "step": 14380 + }, + { + "epoch": 0.9414458619561662, + "grad_norm": 0.8776541352272034, + "learning_rate": 9.97551449205492e-05, + "loss": 0.1126, + "step": 14390 + }, + { + "epoch": 0.9421000981354269, + "grad_norm": 0.8586710691452026, + "learning_rate": 9.975423610151771e-05, + "loss": 0.1052, + "step": 14400 + }, + { + "epoch": 0.9427543343146876, + "grad_norm": 0.9918507933616638, + "learning_rate": 9.975332560315026e-05, + "loss": 0.1105, + "step": 14410 + }, + { + "epoch": 0.9434085704939483, + "grad_norm": 0.8050119280815125, + "learning_rate": 9.97524134254776e-05, + "loss": 0.1009, + "step": 14420 + }, + { + "epoch": 0.9440628066732091, + "grad_norm": 0.8901785612106323, + "learning_rate": 9.975149956853049e-05, + "loss": 0.1104, + "step": 14430 + }, + { + "epoch": 0.9447170428524697, + "grad_norm": 1.129379391670227, + "learning_rate": 9.975058403233981e-05, + "loss": 0.122, + "step": 14440 + }, + { + "epoch": 0.9453712790317305, + "grad_norm": 0.946667492389679, + "learning_rate": 9.974966681693642e-05, + "loss": 0.1093, + "step": 14450 + }, + { + "epoch": 0.9460255152109912, + "grad_norm": 1.037529468536377, + "learning_rate": 9.974874792235131e-05, + "loss": 0.12, + "step": 14460 + }, + { + "epoch": 0.9466797513902518, + "grad_norm": 0.9789792895317078, + "learning_rate": 9.97478273486155e-05, + "loss": 0.1238, + "step": 14470 + }, + { + "epoch": 0.9473339875695126, + "grad_norm": 0.9181520938873291, + "learning_rate": 9.974690509576005e-05, + "loss": 0.1214, + "step": 14480 + }, + { + "epoch": 0.9479882237487733, + "grad_norm": 0.8259325623512268, + "learning_rate": 9.974598116381608e-05, + "loss": 0.114, + "step": 14490 + }, + { + "epoch": 0.948642459928034, + "grad_norm": 0.8979782462120056, + "learning_rate": 9.974505555281476e-05, + "loss": 0.117, + "step": 14500 + }, + { + "epoch": 0.9492966961072947, + "grad_norm": 0.9124277234077454, + "learning_rate": 9.974412826278738e-05, + "loss": 0.1052, + "step": 14510 + }, + { + "epoch": 0.9499509322865555, + "grad_norm": 1.0142040252685547, + "learning_rate": 9.974319929376522e-05, + "loss": 0.1166, + "step": 14520 + }, + { + "epoch": 0.9506051684658161, + "grad_norm": 0.973535418510437, + "learning_rate": 9.974226864577961e-05, + "loss": 0.1059, + "step": 14530 + }, + { + "epoch": 0.9512594046450769, + "grad_norm": 0.8721091151237488, + "learning_rate": 9.974133631886198e-05, + "loss": 0.1119, + "step": 14540 + }, + { + "epoch": 0.9519136408243376, + "grad_norm": 1.0169627666473389, + "learning_rate": 9.97404023130438e-05, + "loss": 0.1231, + "step": 14550 + }, + { + "epoch": 0.9525678770035984, + "grad_norm": 0.861275315284729, + "learning_rate": 9.973946662835658e-05, + "loss": 0.1046, + "step": 14560 + }, + { + "epoch": 0.953222113182859, + "grad_norm": 1.012182593345642, + "learning_rate": 9.973852926483194e-05, + "loss": 0.112, + "step": 14570 + }, + { + "epoch": 0.9538763493621197, + "grad_norm": 0.804041862487793, + "learning_rate": 9.973759022250147e-05, + "loss": 0.1076, + "step": 14580 + }, + { + "epoch": 0.9545305855413805, + "grad_norm": 0.799505889415741, + "learning_rate": 9.97366495013969e-05, + "loss": 0.1137, + "step": 14590 + }, + { + "epoch": 0.9551848217206411, + "grad_norm": 0.8227983713150024, + "learning_rate": 9.973570710154998e-05, + "loss": 0.0982, + "step": 14600 + }, + { + "epoch": 0.9558390578999019, + "grad_norm": 0.9317682385444641, + "learning_rate": 9.973476302299249e-05, + "loss": 0.1103, + "step": 14610 + }, + { + "epoch": 0.9564932940791626, + "grad_norm": 0.8401086926460266, + "learning_rate": 9.973381726575632e-05, + "loss": 0.1188, + "step": 14620 + }, + { + "epoch": 0.9571475302584233, + "grad_norm": 0.9187564849853516, + "learning_rate": 9.97328698298734e-05, + "loss": 0.1242, + "step": 14630 + }, + { + "epoch": 0.957801766437684, + "grad_norm": 1.0764808654785156, + "learning_rate": 9.973192071537567e-05, + "loss": 0.112, + "step": 14640 + }, + { + "epoch": 0.9584560026169447, + "grad_norm": 0.947796642780304, + "learning_rate": 9.97309699222952e-05, + "loss": 0.1048, + "step": 14650 + }, + { + "epoch": 0.9591102387962054, + "grad_norm": 0.8203085660934448, + "learning_rate": 9.973001745066408e-05, + "loss": 0.1156, + "step": 14660 + }, + { + "epoch": 0.9597644749754661, + "grad_norm": 0.7854580879211426, + "learning_rate": 9.972906330051444e-05, + "loss": 0.1097, + "step": 14670 + }, + { + "epoch": 0.9604187111547269, + "grad_norm": 0.904597818851471, + "learning_rate": 9.97281074718785e-05, + "loss": 0.1062, + "step": 14680 + }, + { + "epoch": 0.9610729473339875, + "grad_norm": 0.8809137940406799, + "learning_rate": 9.972714996478851e-05, + "loss": 0.1027, + "step": 14690 + }, + { + "epoch": 0.9617271835132483, + "grad_norm": 0.8244183659553528, + "learning_rate": 9.972619077927679e-05, + "loss": 0.1032, + "step": 14700 + }, + { + "epoch": 0.962381419692509, + "grad_norm": 0.7864908576011658, + "learning_rate": 9.972522991537573e-05, + "loss": 0.0975, + "step": 14710 + }, + { + "epoch": 0.9630356558717698, + "grad_norm": 0.8401934504508972, + "learning_rate": 9.972426737311774e-05, + "loss": 0.1166, + "step": 14720 + }, + { + "epoch": 0.9636898920510304, + "grad_norm": 0.8961406350135803, + "learning_rate": 9.972330315253534e-05, + "loss": 0.1121, + "step": 14730 + }, + { + "epoch": 0.9643441282302911, + "grad_norm": 0.9105174541473389, + "learning_rate": 9.972233725366102e-05, + "loss": 0.1047, + "step": 14740 + }, + { + "epoch": 0.9649983644095519, + "grad_norm": 0.9146085977554321, + "learning_rate": 9.972136967652746e-05, + "loss": 0.1086, + "step": 14750 + }, + { + "epoch": 0.9656526005888125, + "grad_norm": 0.8616661429405212, + "learning_rate": 9.972040042116724e-05, + "loss": 0.1113, + "step": 14760 + }, + { + "epoch": 0.9663068367680733, + "grad_norm": 0.9188449382781982, + "learning_rate": 9.971942948761313e-05, + "loss": 0.1171, + "step": 14770 + }, + { + "epoch": 0.966961072947334, + "grad_norm": 0.9797625541687012, + "learning_rate": 9.971845687589786e-05, + "loss": 0.1142, + "step": 14780 + }, + { + "epoch": 0.9676153091265947, + "grad_norm": 0.9933854341506958, + "learning_rate": 9.97174825860543e-05, + "loss": 0.1118, + "step": 14790 + }, + { + "epoch": 0.9682695453058554, + "grad_norm": 0.9022259712219238, + "learning_rate": 9.971650661811529e-05, + "loss": 0.1073, + "step": 14800 + }, + { + "epoch": 0.9689237814851162, + "grad_norm": 0.824038028717041, + "learning_rate": 9.971552897211381e-05, + "loss": 0.1161, + "step": 14810 + }, + { + "epoch": 0.9695780176643768, + "grad_norm": 0.7811712622642517, + "learning_rate": 9.971454964808284e-05, + "loss": 0.1043, + "step": 14820 + }, + { + "epoch": 0.9702322538436375, + "grad_norm": 1.1899689435958862, + "learning_rate": 9.971356864605544e-05, + "loss": 0.1191, + "step": 14830 + }, + { + "epoch": 0.9708864900228983, + "grad_norm": 0.9993957281112671, + "learning_rate": 9.971258596606472e-05, + "loss": 0.1146, + "step": 14840 + }, + { + "epoch": 0.971540726202159, + "grad_norm": 0.8771776556968689, + "learning_rate": 9.971160160814386e-05, + "loss": 0.1151, + "step": 14850 + }, + { + "epoch": 0.9721949623814197, + "grad_norm": 0.9334481358528137, + "learning_rate": 9.971061557232606e-05, + "loss": 0.1068, + "step": 14860 + }, + { + "epoch": 0.9728491985606804, + "grad_norm": 0.9948738813400269, + "learning_rate": 9.970962785864461e-05, + "loss": 0.1052, + "step": 14870 + }, + { + "epoch": 0.9735034347399412, + "grad_norm": 0.8575040102005005, + "learning_rate": 9.970863846713286e-05, + "loss": 0.1025, + "step": 14880 + }, + { + "epoch": 0.9741576709192018, + "grad_norm": 1.019413948059082, + "learning_rate": 9.970764739782419e-05, + "loss": 0.1091, + "step": 14890 + }, + { + "epoch": 0.9748119070984625, + "grad_norm": 0.8023679852485657, + "learning_rate": 9.970665465075205e-05, + "loss": 0.1044, + "step": 14900 + }, + { + "epoch": 0.9754661432777233, + "grad_norm": 0.8333026766777039, + "learning_rate": 9.970566022594996e-05, + "loss": 0.1055, + "step": 14910 + }, + { + "epoch": 0.9761203794569839, + "grad_norm": 0.9163589477539062, + "learning_rate": 9.97046641234515e-05, + "loss": 0.0995, + "step": 14920 + }, + { + "epoch": 0.9767746156362447, + "grad_norm": 0.90887850522995, + "learning_rate": 9.970366634329024e-05, + "loss": 0.1066, + "step": 14930 + }, + { + "epoch": 0.9774288518155054, + "grad_norm": 1.0364896059036255, + "learning_rate": 9.970266688549991e-05, + "loss": 0.1136, + "step": 14940 + }, + { + "epoch": 0.9780830879947661, + "grad_norm": 1.5073118209838867, + "learning_rate": 9.970166575011422e-05, + "loss": 0.1048, + "step": 14950 + }, + { + "epoch": 0.9787373241740268, + "grad_norm": 0.8328016400337219, + "learning_rate": 9.970066293716695e-05, + "loss": 0.1149, + "step": 14960 + }, + { + "epoch": 0.9793915603532876, + "grad_norm": 0.8772666454315186, + "learning_rate": 9.969965844669197e-05, + "loss": 0.1238, + "step": 14970 + }, + { + "epoch": 0.9800457965325482, + "grad_norm": 1.083786964416504, + "learning_rate": 9.969865227872317e-05, + "loss": 0.1016, + "step": 14980 + }, + { + "epoch": 0.9807000327118089, + "grad_norm": 0.9459251165390015, + "learning_rate": 9.969764443329452e-05, + "loss": 0.1047, + "step": 14990 + }, + { + "epoch": 0.9813542688910697, + "grad_norm": 0.8524743914604187, + "learning_rate": 9.969663491044003e-05, + "loss": 0.1097, + "step": 15000 + }, + { + "epoch": 0.9820085050703304, + "grad_norm": 0.9077591896057129, + "learning_rate": 9.969562371019379e-05, + "loss": 0.1087, + "step": 15010 + }, + { + "epoch": 0.9826627412495911, + "grad_norm": 0.9452090263366699, + "learning_rate": 9.969461083258991e-05, + "loss": 0.11, + "step": 15020 + }, + { + "epoch": 0.9833169774288518, + "grad_norm": 0.9411560297012329, + "learning_rate": 9.969359627766258e-05, + "loss": 0.1065, + "step": 15030 + }, + { + "epoch": 0.9839712136081126, + "grad_norm": 0.8572501540184021, + "learning_rate": 9.969258004544606e-05, + "loss": 0.1103, + "step": 15040 + }, + { + "epoch": 0.9846254497873732, + "grad_norm": 1.1027193069458008, + "learning_rate": 9.969156213597464e-05, + "loss": 0.1198, + "step": 15050 + }, + { + "epoch": 0.985279685966634, + "grad_norm": 0.780197024345398, + "learning_rate": 9.969054254928267e-05, + "loss": 0.1102, + "step": 15060 + }, + { + "epoch": 0.9859339221458947, + "grad_norm": 0.8345792889595032, + "learning_rate": 9.968952128540456e-05, + "loss": 0.1166, + "step": 15070 + }, + { + "epoch": 0.9865881583251553, + "grad_norm": 0.8777825236320496, + "learning_rate": 9.968849834437481e-05, + "loss": 0.1066, + "step": 15080 + }, + { + "epoch": 0.9872423945044161, + "grad_norm": 0.933706521987915, + "learning_rate": 9.968747372622793e-05, + "loss": 0.1241, + "step": 15090 + }, + { + "epoch": 0.9878966306836768, + "grad_norm": 0.7910727858543396, + "learning_rate": 9.968644743099848e-05, + "loss": 0.1132, + "step": 15100 + }, + { + "epoch": 0.9885508668629375, + "grad_norm": 0.7466284036636353, + "learning_rate": 9.968541945872114e-05, + "loss": 0.1079, + "step": 15110 + }, + { + "epoch": 0.9892051030421982, + "grad_norm": 0.8247948884963989, + "learning_rate": 9.968438980943057e-05, + "loss": 0.1025, + "step": 15120 + }, + { + "epoch": 0.989859339221459, + "grad_norm": 0.8794564008712769, + "learning_rate": 9.968335848316157e-05, + "loss": 0.1155, + "step": 15130 + }, + { + "epoch": 0.9905135754007197, + "grad_norm": 0.794762909412384, + "learning_rate": 9.968232547994891e-05, + "loss": 0.1075, + "step": 15140 + }, + { + "epoch": 0.9911678115799804, + "grad_norm": 0.8396781086921692, + "learning_rate": 9.968129079982747e-05, + "loss": 0.1137, + "step": 15150 + }, + { + "epoch": 0.9918220477592411, + "grad_norm": 0.7761490941047668, + "learning_rate": 9.968025444283215e-05, + "loss": 0.1065, + "step": 15160 + }, + { + "epoch": 0.9924762839385018, + "grad_norm": 1.025817632675171, + "learning_rate": 9.967921640899797e-05, + "loss": 0.1023, + "step": 15170 + }, + { + "epoch": 0.9931305201177625, + "grad_norm": 0.8451741337776184, + "learning_rate": 9.967817669835995e-05, + "loss": 0.1135, + "step": 15180 + }, + { + "epoch": 0.9937847562970232, + "grad_norm": 0.7876104712486267, + "learning_rate": 9.967713531095317e-05, + "loss": 0.1145, + "step": 15190 + }, + { + "epoch": 0.994438992476284, + "grad_norm": 1.0193179845809937, + "learning_rate": 9.967609224681281e-05, + "loss": 0.1023, + "step": 15200 + }, + { + "epoch": 0.9950932286555446, + "grad_norm": 0.8599629998207092, + "learning_rate": 9.967504750597405e-05, + "loss": 0.1134, + "step": 15210 + }, + { + "epoch": 0.9957474648348054, + "grad_norm": 0.9450410604476929, + "learning_rate": 9.967400108847213e-05, + "loss": 0.1128, + "step": 15220 + }, + { + "epoch": 0.9964017010140661, + "grad_norm": 0.841467022895813, + "learning_rate": 9.967295299434243e-05, + "loss": 0.1072, + "step": 15230 + }, + { + "epoch": 0.9970559371933267, + "grad_norm": 0.7316820621490479, + "learning_rate": 9.967190322362029e-05, + "loss": 0.1059, + "step": 15240 + }, + { + "epoch": 0.9977101733725875, + "grad_norm": 0.8536782264709473, + "learning_rate": 9.967085177634115e-05, + "loss": 0.1026, + "step": 15250 + }, + { + "epoch": 0.9983644095518482, + "grad_norm": 0.8215930461883545, + "learning_rate": 9.966979865254047e-05, + "loss": 0.1131, + "step": 15260 + }, + { + "epoch": 0.9990186457311089, + "grad_norm": 1.0290292501449585, + "learning_rate": 9.966874385225385e-05, + "loss": 0.1151, + "step": 15270 + }, + { + "epoch": 0.9996728819103696, + "grad_norm": 1.0046546459197998, + "learning_rate": 9.966768737551685e-05, + "loss": 0.1078, + "step": 15280 + }, + { + "epoch": 1.0003271180896303, + "grad_norm": 1.0391733646392822, + "learning_rate": 9.966662922236515e-05, + "loss": 0.1175, + "step": 15290 + }, + { + "epoch": 1.000981354268891, + "grad_norm": 0.8953946828842163, + "learning_rate": 9.966556939283445e-05, + "loss": 0.1119, + "step": 15300 + }, + { + "epoch": 1.0016355904481518, + "grad_norm": 0.8209472298622131, + "learning_rate": 9.966450788696053e-05, + "loss": 0.1263, + "step": 15310 + }, + { + "epoch": 1.0022898266274125, + "grad_norm": 0.9228883981704712, + "learning_rate": 9.966344470477922e-05, + "loss": 0.1051, + "step": 15320 + }, + { + "epoch": 1.0029440628066733, + "grad_norm": 0.8755295276641846, + "learning_rate": 9.966237984632641e-05, + "loss": 0.105, + "step": 15330 + }, + { + "epoch": 1.003598298985934, + "grad_norm": 0.727872371673584, + "learning_rate": 9.966131331163803e-05, + "loss": 0.1011, + "step": 15340 + }, + { + "epoch": 1.0042525351651945, + "grad_norm": 0.8583409190177917, + "learning_rate": 9.96602451007501e-05, + "loss": 0.1096, + "step": 15350 + }, + { + "epoch": 1.0049067713444553, + "grad_norm": 0.971300482749939, + "learning_rate": 9.965917521369865e-05, + "loss": 0.1031, + "step": 15360 + }, + { + "epoch": 1.005561007523716, + "grad_norm": 0.8533384799957275, + "learning_rate": 9.96581036505198e-05, + "loss": 0.1199, + "step": 15370 + }, + { + "epoch": 1.0062152437029768, + "grad_norm": 0.8822680711746216, + "learning_rate": 9.96570304112497e-05, + "loss": 0.1087, + "step": 15380 + }, + { + "epoch": 1.0068694798822375, + "grad_norm": 0.714950442314148, + "learning_rate": 9.965595549592462e-05, + "loss": 0.112, + "step": 15390 + }, + { + "epoch": 1.0075237160614983, + "grad_norm": 0.9944178462028503, + "learning_rate": 9.96548789045808e-05, + "loss": 0.1072, + "step": 15400 + }, + { + "epoch": 1.008177952240759, + "grad_norm": 0.8967353105545044, + "learning_rate": 9.96538006372546e-05, + "loss": 0.1119, + "step": 15410 + }, + { + "epoch": 1.0088321884200195, + "grad_norm": 0.7786975502967834, + "learning_rate": 9.96527206939824e-05, + "loss": 0.1114, + "step": 15420 + }, + { + "epoch": 1.0094864245992803, + "grad_norm": 0.8993393182754517, + "learning_rate": 9.965163907480066e-05, + "loss": 0.1047, + "step": 15430 + }, + { + "epoch": 1.010140660778541, + "grad_norm": 0.8233774304389954, + "learning_rate": 9.965055577974588e-05, + "loss": 0.1033, + "step": 15440 + }, + { + "epoch": 1.0107948969578018, + "grad_norm": 0.7405421733856201, + "learning_rate": 9.964947080885464e-05, + "loss": 0.1214, + "step": 15450 + }, + { + "epoch": 1.0114491331370625, + "grad_norm": 0.7713890075683594, + "learning_rate": 9.964838416216354e-05, + "loss": 0.1069, + "step": 15460 + }, + { + "epoch": 1.0121033693163233, + "grad_norm": 0.7736397981643677, + "learning_rate": 9.964729583970927e-05, + "loss": 0.1251, + "step": 15470 + }, + { + "epoch": 1.0127576054955838, + "grad_norm": 0.7739368081092834, + "learning_rate": 9.964620584152857e-05, + "loss": 0.105, + "step": 15480 + }, + { + "epoch": 1.0134118416748445, + "grad_norm": 0.8225721120834351, + "learning_rate": 9.964511416765821e-05, + "loss": 0.1196, + "step": 15490 + }, + { + "epoch": 1.0140660778541053, + "grad_norm": 0.7423897981643677, + "learning_rate": 9.964402081813504e-05, + "loss": 0.1159, + "step": 15500 + }, + { + "epoch": 1.014720314033366, + "grad_norm": 0.8205366730690002, + "learning_rate": 9.9642925792996e-05, + "loss": 0.1092, + "step": 15510 + }, + { + "epoch": 1.0153745502126268, + "grad_norm": 0.9027040600776672, + "learning_rate": 9.964182909227799e-05, + "loss": 0.1122, + "step": 15520 + }, + { + "epoch": 1.0160287863918875, + "grad_norm": 0.7824887037277222, + "learning_rate": 9.964073071601808e-05, + "loss": 0.1046, + "step": 15530 + }, + { + "epoch": 1.0166830225711483, + "grad_norm": 0.9918166995048523, + "learning_rate": 9.963963066425331e-05, + "loss": 0.1141, + "step": 15540 + }, + { + "epoch": 1.0173372587504088, + "grad_norm": 0.9670895934104919, + "learning_rate": 9.963852893702081e-05, + "loss": 0.1055, + "step": 15550 + }, + { + "epoch": 1.0179914949296696, + "grad_norm": 0.8161757588386536, + "learning_rate": 9.96374255343578e-05, + "loss": 0.109, + "step": 15560 + }, + { + "epoch": 1.0186457311089303, + "grad_norm": 0.9068933129310608, + "learning_rate": 9.963632045630147e-05, + "loss": 0.1047, + "step": 15570 + }, + { + "epoch": 1.019299967288191, + "grad_norm": 0.902734100818634, + "learning_rate": 9.963521370288917e-05, + "loss": 0.1019, + "step": 15580 + }, + { + "epoch": 1.0199542034674518, + "grad_norm": 0.9012474417686462, + "learning_rate": 9.963410527415823e-05, + "loss": 0.1052, + "step": 15590 + }, + { + "epoch": 1.0206084396467126, + "grad_norm": 0.7533280253410339, + "learning_rate": 9.963299517014608e-05, + "loss": 0.1101, + "step": 15600 + }, + { + "epoch": 1.021262675825973, + "grad_norm": 0.8915255069732666, + "learning_rate": 9.963188339089015e-05, + "loss": 0.0986, + "step": 15610 + }, + { + "epoch": 1.0219169120052338, + "grad_norm": 0.8015055060386658, + "learning_rate": 9.963076993642802e-05, + "loss": 0.1187, + "step": 15620 + }, + { + "epoch": 1.0225711481844946, + "grad_norm": 0.9138506054878235, + "learning_rate": 9.962965480679721e-05, + "loss": 0.106, + "step": 15630 + }, + { + "epoch": 1.0232253843637553, + "grad_norm": 0.8131011724472046, + "learning_rate": 9.962853800203541e-05, + "loss": 0.1278, + "step": 15640 + }, + { + "epoch": 1.023879620543016, + "grad_norm": 0.8767716288566589, + "learning_rate": 9.96274195221803e-05, + "loss": 0.1054, + "step": 15650 + }, + { + "epoch": 1.0245338567222768, + "grad_norm": 0.8304327726364136, + "learning_rate": 9.962629936726962e-05, + "loss": 0.0943, + "step": 15660 + }, + { + "epoch": 1.0251880929015376, + "grad_norm": 0.8598764538764954, + "learning_rate": 9.96251775373412e-05, + "loss": 0.108, + "step": 15670 + }, + { + "epoch": 1.025842329080798, + "grad_norm": 0.8592802286148071, + "learning_rate": 9.962405403243287e-05, + "loss": 0.1186, + "step": 15680 + }, + { + "epoch": 1.0264965652600588, + "grad_norm": 0.9450241327285767, + "learning_rate": 9.962292885258259e-05, + "loss": 0.1196, + "step": 15690 + }, + { + "epoch": 1.0271508014393196, + "grad_norm": 0.8318309187889099, + "learning_rate": 9.962180199782831e-05, + "loss": 0.1173, + "step": 15700 + }, + { + "epoch": 1.0278050376185803, + "grad_norm": 0.8451339602470398, + "learning_rate": 9.962067346820808e-05, + "loss": 0.119, + "step": 15710 + }, + { + "epoch": 1.028459273797841, + "grad_norm": 0.8618002533912659, + "learning_rate": 9.961954326375998e-05, + "loss": 0.1103, + "step": 15720 + }, + { + "epoch": 1.0291135099771018, + "grad_norm": 0.7853378653526306, + "learning_rate": 9.961841138452217e-05, + "loss": 0.1144, + "step": 15730 + }, + { + "epoch": 1.0297677461563624, + "grad_norm": 0.8953865766525269, + "learning_rate": 9.961727783053285e-05, + "loss": 0.1148, + "step": 15740 + }, + { + "epoch": 1.030421982335623, + "grad_norm": 0.9555432200431824, + "learning_rate": 9.961614260183028e-05, + "loss": 0.1188, + "step": 15750 + }, + { + "epoch": 1.0310762185148838, + "grad_norm": 0.9691207408905029, + "learning_rate": 9.961500569845275e-05, + "loss": 0.1037, + "step": 15760 + }, + { + "epoch": 1.0317304546941446, + "grad_norm": 0.9222931265830994, + "learning_rate": 9.961386712043868e-05, + "loss": 0.1007, + "step": 15770 + }, + { + "epoch": 1.0323846908734053, + "grad_norm": 0.838117241859436, + "learning_rate": 9.961272686782646e-05, + "loss": 0.1034, + "step": 15780 + }, + { + "epoch": 1.033038927052666, + "grad_norm": 1.437838077545166, + "learning_rate": 9.961158494065461e-05, + "loss": 0.1285, + "step": 15790 + }, + { + "epoch": 1.0336931632319266, + "grad_norm": 0.8757314085960388, + "learning_rate": 9.961044133896166e-05, + "loss": 0.1098, + "step": 15800 + }, + { + "epoch": 1.0343473994111874, + "grad_norm": 0.8826759457588196, + "learning_rate": 9.96092960627862e-05, + "loss": 0.1102, + "step": 15810 + }, + { + "epoch": 1.035001635590448, + "grad_norm": 0.820978581905365, + "learning_rate": 9.96081491121669e-05, + "loss": 0.1053, + "step": 15820 + }, + { + "epoch": 1.0356558717697089, + "grad_norm": 0.8541067242622375, + "learning_rate": 9.960700048714244e-05, + "loss": 0.1115, + "step": 15830 + }, + { + "epoch": 1.0363101079489696, + "grad_norm": 0.7648676633834839, + "learning_rate": 9.960585018775164e-05, + "loss": 0.1015, + "step": 15840 + }, + { + "epoch": 1.0369643441282304, + "grad_norm": 0.8757931590080261, + "learning_rate": 9.960469821403329e-05, + "loss": 0.1006, + "step": 15850 + }, + { + "epoch": 1.037618580307491, + "grad_norm": 0.9553504586219788, + "learning_rate": 9.960354456602628e-05, + "loss": 0.1076, + "step": 15860 + }, + { + "epoch": 1.0382728164867516, + "grad_norm": 1.0022895336151123, + "learning_rate": 9.960238924376954e-05, + "loss": 0.1076, + "step": 15870 + }, + { + "epoch": 1.0389270526660124, + "grad_norm": 0.8987383842468262, + "learning_rate": 9.96012322473021e-05, + "loss": 0.112, + "step": 15880 + }, + { + "epoch": 1.0395812888452731, + "grad_norm": 0.8711419105529785, + "learning_rate": 9.960007357666297e-05, + "loss": 0.1046, + "step": 15890 + }, + { + "epoch": 1.0402355250245339, + "grad_norm": 0.6669361591339111, + "learning_rate": 9.95989132318913e-05, + "loss": 0.1021, + "step": 15900 + }, + { + "epoch": 1.0408897612037946, + "grad_norm": 0.7434557676315308, + "learning_rate": 9.959775121302621e-05, + "loss": 0.1104, + "step": 15910 + }, + { + "epoch": 1.0415439973830554, + "grad_norm": 0.8554294109344482, + "learning_rate": 9.959658752010695e-05, + "loss": 0.104, + "step": 15920 + }, + { + "epoch": 1.042198233562316, + "grad_norm": 0.9452110528945923, + "learning_rate": 9.959542215317278e-05, + "loss": 0.1057, + "step": 15930 + }, + { + "epoch": 1.0428524697415766, + "grad_norm": 0.9260740876197815, + "learning_rate": 9.959425511226304e-05, + "loss": 0.1065, + "step": 15940 + }, + { + "epoch": 1.0435067059208374, + "grad_norm": 0.939108669757843, + "learning_rate": 9.959308639741714e-05, + "loss": 0.1098, + "step": 15950 + }, + { + "epoch": 1.0441609421000981, + "grad_norm": 0.7533574104309082, + "learning_rate": 9.95919160086745e-05, + "loss": 0.11, + "step": 15960 + }, + { + "epoch": 1.0448151782793589, + "grad_norm": 0.7909335494041443, + "learning_rate": 9.959074394607464e-05, + "loss": 0.1032, + "step": 15970 + }, + { + "epoch": 1.0454694144586196, + "grad_norm": 0.6894845366477966, + "learning_rate": 9.958957020965712e-05, + "loss": 0.0998, + "step": 15980 + }, + { + "epoch": 1.0461236506378804, + "grad_norm": 0.8080494403839111, + "learning_rate": 9.958839479946154e-05, + "loss": 0.1073, + "step": 15990 + }, + { + "epoch": 1.046777886817141, + "grad_norm": 0.8611446022987366, + "learning_rate": 9.958721771552759e-05, + "loss": 0.1055, + "step": 16000 + }, + { + "epoch": 1.0474321229964016, + "grad_norm": 0.9711137413978577, + "learning_rate": 9.958603895789501e-05, + "loss": 0.1015, + "step": 16010 + }, + { + "epoch": 1.0480863591756624, + "grad_norm": 0.9332404732704163, + "learning_rate": 9.958485852660356e-05, + "loss": 0.1092, + "step": 16020 + }, + { + "epoch": 1.0487405953549231, + "grad_norm": 0.7202958464622498, + "learning_rate": 9.958367642169308e-05, + "loss": 0.0976, + "step": 16030 + }, + { + "epoch": 1.049394831534184, + "grad_norm": 0.9648202657699585, + "learning_rate": 9.958249264320349e-05, + "loss": 0.1103, + "step": 16040 + }, + { + "epoch": 1.0500490677134446, + "grad_norm": 0.8752630949020386, + "learning_rate": 9.958130719117475e-05, + "loss": 0.1095, + "step": 16050 + }, + { + "epoch": 1.0507033038927052, + "grad_norm": 1.0610864162445068, + "learning_rate": 9.958012006564686e-05, + "loss": 0.1026, + "step": 16060 + }, + { + "epoch": 1.051357540071966, + "grad_norm": 0.9164272546768188, + "learning_rate": 9.957893126665987e-05, + "loss": 0.112, + "step": 16070 + }, + { + "epoch": 1.0520117762512267, + "grad_norm": 0.8324456214904785, + "learning_rate": 9.957774079425395e-05, + "loss": 0.1134, + "step": 16080 + }, + { + "epoch": 1.0526660124304874, + "grad_norm": 1.0304049253463745, + "learning_rate": 9.957654864846924e-05, + "loss": 0.101, + "step": 16090 + }, + { + "epoch": 1.0533202486097482, + "grad_norm": 0.9455142021179199, + "learning_rate": 9.9575354829346e-05, + "loss": 0.1024, + "step": 16100 + }, + { + "epoch": 1.053974484789009, + "grad_norm": 1.337161660194397, + "learning_rate": 9.95741593369245e-05, + "loss": 0.1067, + "step": 16110 + }, + { + "epoch": 1.0546287209682697, + "grad_norm": 0.9535005688667297, + "learning_rate": 9.957296217124513e-05, + "loss": 0.1078, + "step": 16120 + }, + { + "epoch": 1.0552829571475302, + "grad_norm": 0.849690854549408, + "learning_rate": 9.957176333234828e-05, + "loss": 0.1083, + "step": 16130 + }, + { + "epoch": 1.055937193326791, + "grad_norm": 0.8453637361526489, + "learning_rate": 9.957056282027439e-05, + "loss": 0.105, + "step": 16140 + }, + { + "epoch": 1.0565914295060517, + "grad_norm": 0.9851493835449219, + "learning_rate": 9.956936063506402e-05, + "loss": 0.1029, + "step": 16150 + }, + { + "epoch": 1.0572456656853124, + "grad_norm": 0.8924856781959534, + "learning_rate": 9.956815677675772e-05, + "loss": 0.1031, + "step": 16160 + }, + { + "epoch": 1.0578999018645732, + "grad_norm": 1.1166882514953613, + "learning_rate": 9.956695124539613e-05, + "loss": 0.1058, + "step": 16170 + }, + { + "epoch": 1.058554138043834, + "grad_norm": 0.7543265223503113, + "learning_rate": 9.956574404101994e-05, + "loss": 0.0973, + "step": 16180 + }, + { + "epoch": 1.0592083742230944, + "grad_norm": 0.7494108080863953, + "learning_rate": 9.95645351636699e-05, + "loss": 0.104, + "step": 16190 + }, + { + "epoch": 1.0598626104023552, + "grad_norm": 0.9787102937698364, + "learning_rate": 9.956332461338683e-05, + "loss": 0.1027, + "step": 16200 + }, + { + "epoch": 1.060516846581616, + "grad_norm": 0.957390546798706, + "learning_rate": 9.956211239021154e-05, + "loss": 0.1086, + "step": 16210 + }, + { + "epoch": 1.0611710827608767, + "grad_norm": 1.1353367567062378, + "learning_rate": 9.9560898494185e-05, + "loss": 0.1035, + "step": 16220 + }, + { + "epoch": 1.0618253189401374, + "grad_norm": 1.0974490642547607, + "learning_rate": 9.955968292534814e-05, + "loss": 0.1087, + "step": 16230 + }, + { + "epoch": 1.0624795551193982, + "grad_norm": 1.101291537284851, + "learning_rate": 9.955846568374201e-05, + "loss": 0.1154, + "step": 16240 + }, + { + "epoch": 1.0631337912986587, + "grad_norm": 0.8823967576026917, + "learning_rate": 9.955724676940769e-05, + "loss": 0.1043, + "step": 16250 + }, + { + "epoch": 1.0637880274779195, + "grad_norm": 0.7924269437789917, + "learning_rate": 9.955602618238633e-05, + "loss": 0.1018, + "step": 16260 + }, + { + "epoch": 1.0644422636571802, + "grad_norm": 1.0062264204025269, + "learning_rate": 9.955480392271911e-05, + "loss": 0.1101, + "step": 16270 + }, + { + "epoch": 1.065096499836441, + "grad_norm": 0.8188719749450684, + "learning_rate": 9.95535799904473e-05, + "loss": 0.0963, + "step": 16280 + }, + { + "epoch": 1.0657507360157017, + "grad_norm": 1.1326643228530884, + "learning_rate": 9.955235438561222e-05, + "loss": 0.1052, + "step": 16290 + }, + { + "epoch": 1.0664049721949624, + "grad_norm": 0.7720603942871094, + "learning_rate": 9.95511271082552e-05, + "loss": 0.1034, + "step": 16300 + }, + { + "epoch": 1.0670592083742232, + "grad_norm": 0.8074955344200134, + "learning_rate": 9.954989815841771e-05, + "loss": 0.1159, + "step": 16310 + }, + { + "epoch": 1.0677134445534837, + "grad_norm": 0.7520632147789001, + "learning_rate": 9.954866753614118e-05, + "loss": 0.1035, + "step": 16320 + }, + { + "epoch": 1.0683676807327445, + "grad_norm": 0.8165426254272461, + "learning_rate": 9.95474352414672e-05, + "loss": 0.1079, + "step": 16330 + }, + { + "epoch": 1.0690219169120052, + "grad_norm": 0.8366930484771729, + "learning_rate": 9.954620127443733e-05, + "loss": 0.1116, + "step": 16340 + }, + { + "epoch": 1.069676153091266, + "grad_norm": 0.7152531147003174, + "learning_rate": 9.954496563509323e-05, + "loss": 0.0996, + "step": 16350 + }, + { + "epoch": 1.0703303892705267, + "grad_norm": 0.9950698614120483, + "learning_rate": 9.954372832347661e-05, + "loss": 0.1074, + "step": 16360 + }, + { + "epoch": 1.0709846254497875, + "grad_norm": 0.9378706812858582, + "learning_rate": 9.954248933962919e-05, + "loss": 0.1058, + "step": 16370 + }, + { + "epoch": 1.0716388616290482, + "grad_norm": 0.8824446201324463, + "learning_rate": 9.954124868359287e-05, + "loss": 0.1022, + "step": 16380 + }, + { + "epoch": 1.0722930978083087, + "grad_norm": 0.8328707218170166, + "learning_rate": 9.954000635540946e-05, + "loss": 0.1081, + "step": 16390 + }, + { + "epoch": 1.0729473339875695, + "grad_norm": 0.9733028411865234, + "learning_rate": 9.953876235512091e-05, + "loss": 0.1012, + "step": 16400 + }, + { + "epoch": 1.0736015701668302, + "grad_norm": 0.8833745718002319, + "learning_rate": 9.953751668276921e-05, + "loss": 0.1173, + "step": 16410 + }, + { + "epoch": 1.074255806346091, + "grad_norm": 0.9558156728744507, + "learning_rate": 9.953626933839641e-05, + "loss": 0.1014, + "step": 16420 + }, + { + "epoch": 1.0749100425253517, + "grad_norm": 0.9810806512832642, + "learning_rate": 9.95350203220446e-05, + "loss": 0.1074, + "step": 16430 + }, + { + "epoch": 1.0755642787046125, + "grad_norm": 0.810119092464447, + "learning_rate": 9.953376963375596e-05, + "loss": 0.1046, + "step": 16440 + }, + { + "epoch": 1.076218514883873, + "grad_norm": 0.9398297071456909, + "learning_rate": 9.953251727357267e-05, + "loss": 0.1014, + "step": 16450 + }, + { + "epoch": 1.0768727510631337, + "grad_norm": 0.7809659838676453, + "learning_rate": 9.953126324153701e-05, + "loss": 0.1171, + "step": 16460 + }, + { + "epoch": 1.0775269872423945, + "grad_norm": 1.0472731590270996, + "learning_rate": 9.953000753769135e-05, + "loss": 0.1062, + "step": 16470 + }, + { + "epoch": 1.0781812234216552, + "grad_norm": 1.0119749307632446, + "learning_rate": 9.9528750162078e-05, + "loss": 0.0979, + "step": 16480 + }, + { + "epoch": 1.078835459600916, + "grad_norm": 0.8965128064155579, + "learning_rate": 9.952749111473946e-05, + "loss": 0.1093, + "step": 16490 + }, + { + "epoch": 1.0794896957801767, + "grad_norm": 0.9507614970207214, + "learning_rate": 9.95262303957182e-05, + "loss": 0.0978, + "step": 16500 + }, + { + "epoch": 1.0801439319594373, + "grad_norm": 0.728331446647644, + "learning_rate": 9.952496800505679e-05, + "loss": 0.1016, + "step": 16510 + }, + { + "epoch": 1.080798168138698, + "grad_norm": 0.896195113658905, + "learning_rate": 9.952370394279781e-05, + "loss": 0.1106, + "step": 16520 + }, + { + "epoch": 1.0814524043179587, + "grad_norm": 0.9140453338623047, + "learning_rate": 9.952243820898395e-05, + "loss": 0.1173, + "step": 16530 + }, + { + "epoch": 1.0821066404972195, + "grad_norm": 0.798197329044342, + "learning_rate": 9.95211708036579e-05, + "loss": 0.095, + "step": 16540 + }, + { + "epoch": 1.0827608766764802, + "grad_norm": 0.7951470613479614, + "learning_rate": 9.951990172686248e-05, + "loss": 0.1061, + "step": 16550 + }, + { + "epoch": 1.083415112855741, + "grad_norm": 0.809076189994812, + "learning_rate": 9.951863097864052e-05, + "loss": 0.0965, + "step": 16560 + }, + { + "epoch": 1.0840693490350017, + "grad_norm": 0.8439357876777649, + "learning_rate": 9.951735855903488e-05, + "loss": 0.1074, + "step": 16570 + }, + { + "epoch": 1.0847235852142623, + "grad_norm": 1.051507830619812, + "learning_rate": 9.951608446808852e-05, + "loss": 0.1089, + "step": 16580 + }, + { + "epoch": 1.085377821393523, + "grad_norm": 0.9256449937820435, + "learning_rate": 9.951480870584445e-05, + "loss": 0.1052, + "step": 16590 + }, + { + "epoch": 1.0860320575727838, + "grad_norm": 0.7667525410652161, + "learning_rate": 9.951353127234574e-05, + "loss": 0.1128, + "step": 16600 + }, + { + "epoch": 1.0866862937520445, + "grad_norm": 0.821887731552124, + "learning_rate": 9.951225216763549e-05, + "loss": 0.1031, + "step": 16610 + }, + { + "epoch": 1.0873405299313053, + "grad_norm": 0.8057878017425537, + "learning_rate": 9.951097139175687e-05, + "loss": 0.1135, + "step": 16620 + }, + { + "epoch": 1.087994766110566, + "grad_norm": 0.9914917349815369, + "learning_rate": 9.950968894475313e-05, + "loss": 0.0971, + "step": 16630 + }, + { + "epoch": 1.0886490022898265, + "grad_norm": 0.8095641136169434, + "learning_rate": 9.950840482666755e-05, + "loss": 0.1118, + "step": 16640 + }, + { + "epoch": 1.0893032384690873, + "grad_norm": 0.8370475172996521, + "learning_rate": 9.950711903754345e-05, + "loss": 0.1098, + "step": 16650 + }, + { + "epoch": 1.089957474648348, + "grad_norm": 0.8619472980499268, + "learning_rate": 9.950583157742426e-05, + "loss": 0.1036, + "step": 16660 + }, + { + "epoch": 1.0906117108276088, + "grad_norm": 0.9595767855644226, + "learning_rate": 9.950454244635341e-05, + "loss": 0.1139, + "step": 16670 + }, + { + "epoch": 1.0912659470068695, + "grad_norm": 1.2058511972427368, + "learning_rate": 9.950325164437442e-05, + "loss": 0.1025, + "step": 16680 + }, + { + "epoch": 1.0919201831861303, + "grad_norm": 0.9937551617622375, + "learning_rate": 9.950195917153086e-05, + "loss": 0.1148, + "step": 16690 + }, + { + "epoch": 1.0925744193653908, + "grad_norm": 0.8375882506370544, + "learning_rate": 9.950066502786637e-05, + "loss": 0.0937, + "step": 16700 + }, + { + "epoch": 1.0932286555446515, + "grad_norm": 0.9841185212135315, + "learning_rate": 9.94993692134246e-05, + "loss": 0.1113, + "step": 16710 + }, + { + "epoch": 1.0938828917239123, + "grad_norm": 1.1531639099121094, + "learning_rate": 9.949807172824929e-05, + "loss": 0.1074, + "step": 16720 + }, + { + "epoch": 1.094537127903173, + "grad_norm": 0.8746184706687927, + "learning_rate": 9.949677257238428e-05, + "loss": 0.1053, + "step": 16730 + }, + { + "epoch": 1.0951913640824338, + "grad_norm": 1.0337003469467163, + "learning_rate": 9.949547174587337e-05, + "loss": 0.1127, + "step": 16740 + }, + { + "epoch": 1.0958456002616945, + "grad_norm": 0.9409461617469788, + "learning_rate": 9.949416924876047e-05, + "loss": 0.1085, + "step": 16750 + }, + { + "epoch": 1.0964998364409553, + "grad_norm": 0.7191810011863708, + "learning_rate": 9.949286508108957e-05, + "loss": 0.096, + "step": 16760 + }, + { + "epoch": 1.0971540726202158, + "grad_norm": 0.7731035351753235, + "learning_rate": 9.949155924290466e-05, + "loss": 0.102, + "step": 16770 + }, + { + "epoch": 1.0978083087994766, + "grad_norm": 0.9170941710472107, + "learning_rate": 9.949025173424984e-05, + "loss": 0.1042, + "step": 16780 + }, + { + "epoch": 1.0984625449787373, + "grad_norm": 0.8782985210418701, + "learning_rate": 9.948894255516923e-05, + "loss": 0.099, + "step": 16790 + }, + { + "epoch": 1.099116781157998, + "grad_norm": 0.9650111794471741, + "learning_rate": 9.948763170570702e-05, + "loss": 0.1106, + "step": 16800 + }, + { + "epoch": 1.0997710173372588, + "grad_norm": 1.0858937501907349, + "learning_rate": 9.948631918590746e-05, + "loss": 0.1097, + "step": 16810 + }, + { + "epoch": 1.1004252535165195, + "grad_norm": 0.8748927712440491, + "learning_rate": 9.948500499581484e-05, + "loss": 0.1124, + "step": 16820 + }, + { + "epoch": 1.1010794896957803, + "grad_norm": 0.9151651859283447, + "learning_rate": 9.94836891354735e-05, + "loss": 0.1082, + "step": 16830 + }, + { + "epoch": 1.1017337258750408, + "grad_norm": 0.9073200225830078, + "learning_rate": 9.948237160492791e-05, + "loss": 0.1106, + "step": 16840 + }, + { + "epoch": 1.1023879620543016, + "grad_norm": 0.8182057738304138, + "learning_rate": 9.948105240422247e-05, + "loss": 0.1098, + "step": 16850 + }, + { + "epoch": 1.1030421982335623, + "grad_norm": 0.7684637904167175, + "learning_rate": 9.947973153340178e-05, + "loss": 0.1042, + "step": 16860 + }, + { + "epoch": 1.103696434412823, + "grad_norm": 0.8614751100540161, + "learning_rate": 9.947840899251036e-05, + "loss": 0.0928, + "step": 16870 + }, + { + "epoch": 1.1043506705920838, + "grad_norm": 1.0662237405776978, + "learning_rate": 9.947708478159288e-05, + "loss": 0.1048, + "step": 16880 + }, + { + "epoch": 1.1050049067713446, + "grad_norm": 0.8045850396156311, + "learning_rate": 9.947575890069404e-05, + "loss": 0.1133, + "step": 16890 + }, + { + "epoch": 1.105659142950605, + "grad_norm": 0.9639503955841064, + "learning_rate": 9.947443134985857e-05, + "loss": 0.1188, + "step": 16900 + }, + { + "epoch": 1.1063133791298658, + "grad_norm": 0.8630536794662476, + "learning_rate": 9.94731021291313e-05, + "loss": 0.0985, + "step": 16910 + }, + { + "epoch": 1.1069676153091266, + "grad_norm": 0.8716172575950623, + "learning_rate": 9.947177123855708e-05, + "loss": 0.1138, + "step": 16920 + }, + { + "epoch": 1.1076218514883873, + "grad_norm": 0.8983464241027832, + "learning_rate": 9.947043867818084e-05, + "loss": 0.1056, + "step": 16930 + }, + { + "epoch": 1.108276087667648, + "grad_norm": 0.889821469783783, + "learning_rate": 9.946910444804755e-05, + "loss": 0.1008, + "step": 16940 + }, + { + "epoch": 1.1089303238469088, + "grad_norm": 0.8643634915351868, + "learning_rate": 9.946776854820224e-05, + "loss": 0.1147, + "step": 16950 + }, + { + "epoch": 1.1095845600261693, + "grad_norm": 0.8867340683937073, + "learning_rate": 9.946643097869002e-05, + "loss": 0.096, + "step": 16960 + }, + { + "epoch": 1.11023879620543, + "grad_norm": 0.859140932559967, + "learning_rate": 9.946509173955603e-05, + "loss": 0.1037, + "step": 16970 + }, + { + "epoch": 1.1108930323846908, + "grad_norm": 1.0756253004074097, + "learning_rate": 9.946375083084545e-05, + "loss": 0.11, + "step": 16980 + }, + { + "epoch": 1.1115472685639516, + "grad_norm": 0.7562744617462158, + "learning_rate": 9.946240825260356e-05, + "loss": 0.1174, + "step": 16990 + }, + { + "epoch": 1.1122015047432123, + "grad_norm": 0.8000214099884033, + "learning_rate": 9.946106400487567e-05, + "loss": 0.1084, + "step": 17000 + }, + { + "epoch": 1.112855740922473, + "grad_norm": 0.9482758641242981, + "learning_rate": 9.945971808770716e-05, + "loss": 0.1032, + "step": 17010 + }, + { + "epoch": 1.1135099771017338, + "grad_norm": 0.8377864360809326, + "learning_rate": 9.945837050114345e-05, + "loss": 0.0981, + "step": 17020 + }, + { + "epoch": 1.1141642132809944, + "grad_norm": 0.9127662181854248, + "learning_rate": 9.945702124523002e-05, + "loss": 0.1082, + "step": 17030 + }, + { + "epoch": 1.114818449460255, + "grad_norm": 1.0239959955215454, + "learning_rate": 9.945567032001243e-05, + "loss": 0.1188, + "step": 17040 + }, + { + "epoch": 1.1154726856395158, + "grad_norm": 1.2458797693252563, + "learning_rate": 9.945431772553626e-05, + "loss": 0.122, + "step": 17050 + }, + { + "epoch": 1.1161269218187766, + "grad_norm": 1.0552146434783936, + "learning_rate": 9.945296346184716e-05, + "loss": 0.1069, + "step": 17060 + }, + { + "epoch": 1.1167811579980373, + "grad_norm": 0.7637792825698853, + "learning_rate": 9.945160752899085e-05, + "loss": 0.0978, + "step": 17070 + }, + { + "epoch": 1.117435394177298, + "grad_norm": 1.0093464851379395, + "learning_rate": 9.94502499270131e-05, + "loss": 0.121, + "step": 17080 + }, + { + "epoch": 1.1180896303565586, + "grad_norm": 0.7700073719024658, + "learning_rate": 9.944889065595972e-05, + "loss": 0.1018, + "step": 17090 + }, + { + "epoch": 1.1187438665358194, + "grad_norm": 0.8178618550300598, + "learning_rate": 9.94475297158766e-05, + "loss": 0.096, + "step": 17100 + }, + { + "epoch": 1.1193981027150801, + "grad_norm": 0.8623060584068298, + "learning_rate": 9.944616710680967e-05, + "loss": 0.1043, + "step": 17110 + }, + { + "epoch": 1.1200523388943409, + "grad_norm": 0.7678028345108032, + "learning_rate": 9.944480282880493e-05, + "loss": 0.1027, + "step": 17120 + }, + { + "epoch": 1.1207065750736016, + "grad_norm": 0.92671138048172, + "learning_rate": 9.944343688190842e-05, + "loss": 0.0993, + "step": 17130 + }, + { + "epoch": 1.1213608112528624, + "grad_norm": 0.9388012290000916, + "learning_rate": 9.944206926616624e-05, + "loss": 0.1094, + "step": 17140 + }, + { + "epoch": 1.1220150474321229, + "grad_norm": 0.7402985095977783, + "learning_rate": 9.944069998162455e-05, + "loss": 0.1004, + "step": 17150 + }, + { + "epoch": 1.1226692836113836, + "grad_norm": 0.8221508860588074, + "learning_rate": 9.943932902832959e-05, + "loss": 0.1123, + "step": 17160 + }, + { + "epoch": 1.1233235197906444, + "grad_norm": 0.8503665924072266, + "learning_rate": 9.94379564063276e-05, + "loss": 0.1153, + "step": 17170 + }, + { + "epoch": 1.1239777559699051, + "grad_norm": 0.7901976704597473, + "learning_rate": 9.943658211566493e-05, + "loss": 0.0984, + "step": 17180 + }, + { + "epoch": 1.1246319921491659, + "grad_norm": 0.8279184103012085, + "learning_rate": 9.943520615638797e-05, + "loss": 0.116, + "step": 17190 + }, + { + "epoch": 1.1252862283284266, + "grad_norm": 0.9072709679603577, + "learning_rate": 9.943382852854313e-05, + "loss": 0.1076, + "step": 17200 + }, + { + "epoch": 1.1259404645076874, + "grad_norm": 0.9476553797721863, + "learning_rate": 9.943244923217695e-05, + "loss": 0.102, + "step": 17210 + }, + { + "epoch": 1.126594700686948, + "grad_norm": 0.8165683746337891, + "learning_rate": 9.943106826733597e-05, + "loss": 0.1099, + "step": 17220 + }, + { + "epoch": 1.1272489368662086, + "grad_norm": 0.9307758212089539, + "learning_rate": 9.942968563406679e-05, + "loss": 0.0967, + "step": 17230 + }, + { + "epoch": 1.1279031730454694, + "grad_norm": 0.8825275897979736, + "learning_rate": 9.942830133241609e-05, + "loss": 0.1106, + "step": 17240 + }, + { + "epoch": 1.1285574092247301, + "grad_norm": 0.6875970959663391, + "learning_rate": 9.942691536243058e-05, + "loss": 0.1083, + "step": 17250 + }, + { + "epoch": 1.1292116454039909, + "grad_norm": 0.8216149806976318, + "learning_rate": 9.942552772415706e-05, + "loss": 0.1088, + "step": 17260 + }, + { + "epoch": 1.1298658815832516, + "grad_norm": 0.7268308997154236, + "learning_rate": 9.942413841764235e-05, + "loss": 0.0968, + "step": 17270 + }, + { + "epoch": 1.1305201177625124, + "grad_norm": 0.8889786601066589, + "learning_rate": 9.942274744293336e-05, + "loss": 0.0991, + "step": 17280 + }, + { + "epoch": 1.131174353941773, + "grad_norm": 0.8577876091003418, + "learning_rate": 9.942135480007701e-05, + "loss": 0.1016, + "step": 17290 + }, + { + "epoch": 1.1318285901210337, + "grad_norm": 0.9688170552253723, + "learning_rate": 9.941996048912035e-05, + "loss": 0.106, + "step": 17300 + }, + { + "epoch": 1.1324828263002944, + "grad_norm": 0.9323129653930664, + "learning_rate": 9.941856451011039e-05, + "loss": 0.1081, + "step": 17310 + }, + { + "epoch": 1.1331370624795551, + "grad_norm": 0.8865740895271301, + "learning_rate": 9.941716686309428e-05, + "loss": 0.1051, + "step": 17320 + }, + { + "epoch": 1.133791298658816, + "grad_norm": 0.912921667098999, + "learning_rate": 9.941576754811919e-05, + "loss": 0.1075, + "step": 17330 + }, + { + "epoch": 1.1344455348380766, + "grad_norm": 0.7206995487213135, + "learning_rate": 9.941436656523236e-05, + "loss": 0.107, + "step": 17340 + }, + { + "epoch": 1.1350997710173372, + "grad_norm": 0.8259555697441101, + "learning_rate": 9.941296391448103e-05, + "loss": 0.1092, + "step": 17350 + }, + { + "epoch": 1.135754007196598, + "grad_norm": 0.7500777244567871, + "learning_rate": 9.94115595959126e-05, + "loss": 0.1002, + "step": 17360 + }, + { + "epoch": 1.1364082433758587, + "grad_norm": 1.0846604108810425, + "learning_rate": 9.941015360957445e-05, + "loss": 0.1082, + "step": 17370 + }, + { + "epoch": 1.1370624795551194, + "grad_norm": 0.8614764213562012, + "learning_rate": 9.940874595551404e-05, + "loss": 0.1099, + "step": 17380 + }, + { + "epoch": 1.1377167157343802, + "grad_norm": 1.0448957681655884, + "learning_rate": 9.940733663377885e-05, + "loss": 0.1172, + "step": 17390 + }, + { + "epoch": 1.138370951913641, + "grad_norm": 0.8377976417541504, + "learning_rate": 9.940592564441649e-05, + "loss": 0.1061, + "step": 17400 + }, + { + "epoch": 1.1390251880929014, + "grad_norm": 1.1926016807556152, + "learning_rate": 9.940451298747456e-05, + "loss": 0.1077, + "step": 17410 + }, + { + "epoch": 1.1396794242721622, + "grad_norm": 0.9103050231933594, + "learning_rate": 9.940309866300075e-05, + "loss": 0.1148, + "step": 17420 + }, + { + "epoch": 1.140333660451423, + "grad_norm": 0.8122014999389648, + "learning_rate": 9.940168267104279e-05, + "loss": 0.0984, + "step": 17430 + }, + { + "epoch": 1.1409878966306837, + "grad_norm": 0.8694342374801636, + "learning_rate": 9.94002650116485e-05, + "loss": 0.0997, + "step": 17440 + }, + { + "epoch": 1.1416421328099444, + "grad_norm": 0.8882526159286499, + "learning_rate": 9.939884568486571e-05, + "loss": 0.1021, + "step": 17450 + }, + { + "epoch": 1.1422963689892052, + "grad_norm": 0.85523521900177, + "learning_rate": 9.939742469074229e-05, + "loss": 0.12, + "step": 17460 + }, + { + "epoch": 1.142950605168466, + "grad_norm": 0.9386106729507446, + "learning_rate": 9.939600202932626e-05, + "loss": 0.11, + "step": 17470 + }, + { + "epoch": 1.1436048413477264, + "grad_norm": 1.0228506326675415, + "learning_rate": 9.939457770066563e-05, + "loss": 0.0956, + "step": 17480 + }, + { + "epoch": 1.1442590775269872, + "grad_norm": 0.8513170480728149, + "learning_rate": 9.939315170480843e-05, + "loss": 0.1032, + "step": 17490 + }, + { + "epoch": 1.144913313706248, + "grad_norm": 0.8414477109909058, + "learning_rate": 9.939172404180284e-05, + "loss": 0.1, + "step": 17500 + }, + { + "epoch": 1.1455675498855087, + "grad_norm": 0.9836424589157104, + "learning_rate": 9.939029471169703e-05, + "loss": 0.0972, + "step": 17510 + }, + { + "epoch": 1.1462217860647694, + "grad_norm": 0.6796431541442871, + "learning_rate": 9.938886371453924e-05, + "loss": 0.103, + "step": 17520 + }, + { + "epoch": 1.1468760222440302, + "grad_norm": 0.7944294810295105, + "learning_rate": 9.938743105037777e-05, + "loss": 0.1015, + "step": 17530 + }, + { + "epoch": 1.147530258423291, + "grad_norm": 0.9507654309272766, + "learning_rate": 9.938599671926097e-05, + "loss": 0.1041, + "step": 17540 + }, + { + "epoch": 1.1481844946025515, + "grad_norm": 0.8638071417808533, + "learning_rate": 9.938456072123727e-05, + "loss": 0.1065, + "step": 17550 + }, + { + "epoch": 1.1488387307818122, + "grad_norm": 0.8793585896492004, + "learning_rate": 9.938312305635514e-05, + "loss": 0.0983, + "step": 17560 + }, + { + "epoch": 1.149492966961073, + "grad_norm": 0.8482651114463806, + "learning_rate": 9.938168372466307e-05, + "loss": 0.1042, + "step": 17570 + }, + { + "epoch": 1.1501472031403337, + "grad_norm": 0.889570415019989, + "learning_rate": 9.93802427262097e-05, + "loss": 0.1028, + "step": 17580 + }, + { + "epoch": 1.1508014393195944, + "grad_norm": 1.002482295036316, + "learning_rate": 9.93788000610436e-05, + "loss": 0.1081, + "step": 17590 + }, + { + "epoch": 1.151455675498855, + "grad_norm": 0.8683956265449524, + "learning_rate": 9.937735572921352e-05, + "loss": 0.1026, + "step": 17600 + }, + { + "epoch": 1.1521099116781157, + "grad_norm": 0.8563847541809082, + "learning_rate": 9.937590973076818e-05, + "loss": 0.1024, + "step": 17610 + }, + { + "epoch": 1.1527641478573765, + "grad_norm": 1.01363205909729, + "learning_rate": 9.937446206575639e-05, + "loss": 0.1075, + "step": 17620 + }, + { + "epoch": 1.1534183840366372, + "grad_norm": 0.998652458190918, + "learning_rate": 9.937301273422703e-05, + "loss": 0.1069, + "step": 17630 + }, + { + "epoch": 1.154072620215898, + "grad_norm": 0.9310935139656067, + "learning_rate": 9.937156173622899e-05, + "loss": 0.1146, + "step": 17640 + }, + { + "epoch": 1.1547268563951587, + "grad_norm": 0.9854075312614441, + "learning_rate": 9.937010907181125e-05, + "loss": 0.1147, + "step": 17650 + }, + { + "epoch": 1.1553810925744195, + "grad_norm": 1.0847511291503906, + "learning_rate": 9.936865474102289e-05, + "loss": 0.1066, + "step": 17660 + }, + { + "epoch": 1.15603532875368, + "grad_norm": 1.0146312713623047, + "learning_rate": 9.936719874391291e-05, + "loss": 0.1077, + "step": 17670 + }, + { + "epoch": 1.1566895649329407, + "grad_norm": 1.1995619535446167, + "learning_rate": 9.936574108053054e-05, + "loss": 0.1099, + "step": 17680 + }, + { + "epoch": 1.1573438011122015, + "grad_norm": 0.8147319555282593, + "learning_rate": 9.936428175092491e-05, + "loss": 0.1107, + "step": 17690 + }, + { + "epoch": 1.1579980372914622, + "grad_norm": 1.276146650314331, + "learning_rate": 9.936282075514534e-05, + "loss": 0.1206, + "step": 17700 + }, + { + "epoch": 1.158652273470723, + "grad_norm": 0.8899917602539062, + "learning_rate": 9.93613580932411e-05, + "loss": 0.1001, + "step": 17710 + }, + { + "epoch": 1.1593065096499837, + "grad_norm": 1.1056793928146362, + "learning_rate": 9.935989376526156e-05, + "loss": 0.1106, + "step": 17720 + }, + { + "epoch": 1.1599607458292445, + "grad_norm": 0.9014807343482971, + "learning_rate": 9.935842777125615e-05, + "loss": 0.1002, + "step": 17730 + }, + { + "epoch": 1.160614982008505, + "grad_norm": 0.9449447393417358, + "learning_rate": 9.935696011127438e-05, + "loss": 0.1039, + "step": 17740 + }, + { + "epoch": 1.1612692181877657, + "grad_norm": 0.8191094994544983, + "learning_rate": 9.935549078536574e-05, + "loss": 0.1099, + "step": 17750 + }, + { + "epoch": 1.1619234543670265, + "grad_norm": 0.7725281715393066, + "learning_rate": 9.935401979357986e-05, + "loss": 0.096, + "step": 17760 + }, + { + "epoch": 1.1625776905462872, + "grad_norm": 0.8660596609115601, + "learning_rate": 9.935254713596637e-05, + "loss": 0.1046, + "step": 17770 + }, + { + "epoch": 1.163231926725548, + "grad_norm": 1.0237655639648438, + "learning_rate": 9.935107281257498e-05, + "loss": 0.1121, + "step": 17780 + }, + { + "epoch": 1.1638861629048087, + "grad_norm": 1.306152582168579, + "learning_rate": 9.934959682345546e-05, + "loss": 0.1007, + "step": 17790 + }, + { + "epoch": 1.1645403990840693, + "grad_norm": 0.8070068359375, + "learning_rate": 9.934811916865763e-05, + "loss": 0.1075, + "step": 17800 + }, + { + "epoch": 1.16519463526333, + "grad_norm": 0.9843347668647766, + "learning_rate": 9.934663984823133e-05, + "loss": 0.1066, + "step": 17810 + }, + { + "epoch": 1.1658488714425908, + "grad_norm": 0.8091020584106445, + "learning_rate": 9.934515886222655e-05, + "loss": 0.0987, + "step": 17820 + }, + { + "epoch": 1.1665031076218515, + "grad_norm": 0.9831781983375549, + "learning_rate": 9.934367621069322e-05, + "loss": 0.1053, + "step": 17830 + }, + { + "epoch": 1.1671573438011122, + "grad_norm": 0.9934841394424438, + "learning_rate": 9.934219189368143e-05, + "loss": 0.1143, + "step": 17840 + }, + { + "epoch": 1.167811579980373, + "grad_norm": 0.9186519980430603, + "learning_rate": 9.934070591124127e-05, + "loss": 0.108, + "step": 17850 + }, + { + "epoch": 1.1684658161596335, + "grad_norm": 0.9979751706123352, + "learning_rate": 9.933921826342286e-05, + "loss": 0.1146, + "step": 17860 + }, + { + "epoch": 1.1691200523388943, + "grad_norm": 0.8182001709938049, + "learning_rate": 9.933772895027644e-05, + "loss": 0.0996, + "step": 17870 + }, + { + "epoch": 1.169774288518155, + "grad_norm": 0.9773245453834534, + "learning_rate": 9.933623797185228e-05, + "loss": 0.1056, + "step": 17880 + }, + { + "epoch": 1.1704285246974158, + "grad_norm": 0.8775014281272888, + "learning_rate": 9.933474532820071e-05, + "loss": 0.1121, + "step": 17890 + }, + { + "epoch": 1.1710827608766765, + "grad_norm": 0.8223180174827576, + "learning_rate": 9.933325101937207e-05, + "loss": 0.0976, + "step": 17900 + }, + { + "epoch": 1.1717369970559373, + "grad_norm": 0.8393526673316956, + "learning_rate": 9.933175504541686e-05, + "loss": 0.1034, + "step": 17910 + }, + { + "epoch": 1.172391233235198, + "grad_norm": 0.8224334716796875, + "learning_rate": 9.933025740638551e-05, + "loss": 0.1118, + "step": 17920 + }, + { + "epoch": 1.1730454694144585, + "grad_norm": 0.7815266847610474, + "learning_rate": 9.932875810232863e-05, + "loss": 0.1002, + "step": 17930 + }, + { + "epoch": 1.1736997055937193, + "grad_norm": 0.9696399569511414, + "learning_rate": 9.932725713329678e-05, + "loss": 0.1071, + "step": 17940 + }, + { + "epoch": 1.17435394177298, + "grad_norm": 0.9797826409339905, + "learning_rate": 9.932575449934062e-05, + "loss": 0.1058, + "step": 17950 + }, + { + "epoch": 1.1750081779522408, + "grad_norm": 0.90691077709198, + "learning_rate": 9.93242502005109e-05, + "loss": 0.1041, + "step": 17960 + }, + { + "epoch": 1.1756624141315015, + "grad_norm": 0.8655257821083069, + "learning_rate": 9.932274423685839e-05, + "loss": 0.1105, + "step": 17970 + }, + { + "epoch": 1.1763166503107623, + "grad_norm": 0.9878515005111694, + "learning_rate": 9.932123660843389e-05, + "loss": 0.119, + "step": 17980 + }, + { + "epoch": 1.176970886490023, + "grad_norm": 1.0736156702041626, + "learning_rate": 9.931972731528831e-05, + "loss": 0.1112, + "step": 17990 + }, + { + "epoch": 1.1776251226692835, + "grad_norm": 0.9092381596565247, + "learning_rate": 9.931821635747259e-05, + "loss": 0.1154, + "step": 18000 + }, + { + "epoch": 1.1782793588485443, + "grad_norm": 0.9012355208396912, + "learning_rate": 9.931670373503771e-05, + "loss": 0.1123, + "step": 18010 + }, + { + "epoch": 1.178933595027805, + "grad_norm": 0.8360759019851685, + "learning_rate": 9.931518944803477e-05, + "loss": 0.1086, + "step": 18020 + }, + { + "epoch": 1.1795878312070658, + "grad_norm": 1.0164974927902222, + "learning_rate": 9.931367349651484e-05, + "loss": 0.1137, + "step": 18030 + }, + { + "epoch": 1.1802420673863265, + "grad_norm": 0.7645466327667236, + "learning_rate": 9.93121558805291e-05, + "loss": 0.0971, + "step": 18040 + }, + { + "epoch": 1.180896303565587, + "grad_norm": 0.8536872863769531, + "learning_rate": 9.931063660012875e-05, + "loss": 0.1113, + "step": 18050 + }, + { + "epoch": 1.1815505397448478, + "grad_norm": 0.6680562496185303, + "learning_rate": 9.930911565536513e-05, + "loss": 0.0963, + "step": 18060 + }, + { + "epoch": 1.1822047759241086, + "grad_norm": 0.9181721210479736, + "learning_rate": 9.930759304628952e-05, + "loss": 0.1102, + "step": 18070 + }, + { + "epoch": 1.1828590121033693, + "grad_norm": 0.9609587788581848, + "learning_rate": 9.930606877295334e-05, + "loss": 0.0972, + "step": 18080 + }, + { + "epoch": 1.18351324828263, + "grad_norm": 0.8478082418441772, + "learning_rate": 9.930454283540802e-05, + "loss": 0.0914, + "step": 18090 + }, + { + "epoch": 1.1841674844618908, + "grad_norm": 0.7738621830940247, + "learning_rate": 9.930301523370507e-05, + "loss": 0.11, + "step": 18100 + }, + { + "epoch": 1.1848217206411515, + "grad_norm": 0.8491454720497131, + "learning_rate": 9.930148596789605e-05, + "loss": 0.1067, + "step": 18110 + }, + { + "epoch": 1.185475956820412, + "grad_norm": 0.9434249401092529, + "learning_rate": 9.92999550380326e-05, + "loss": 0.1046, + "step": 18120 + }, + { + "epoch": 1.1861301929996728, + "grad_norm": 1.3511039018630981, + "learning_rate": 9.929842244416636e-05, + "loss": 0.1087, + "step": 18130 + }, + { + "epoch": 1.1867844291789336, + "grad_norm": 0.9221027493476868, + "learning_rate": 9.929688818634907e-05, + "loss": 0.1059, + "step": 18140 + }, + { + "epoch": 1.1874386653581943, + "grad_norm": 0.8466002941131592, + "learning_rate": 9.929535226463253e-05, + "loss": 0.1008, + "step": 18150 + }, + { + "epoch": 1.188092901537455, + "grad_norm": 0.9640662670135498, + "learning_rate": 9.929381467906858e-05, + "loss": 0.1137, + "step": 18160 + }, + { + "epoch": 1.1887471377167158, + "grad_norm": 0.9665100574493408, + "learning_rate": 9.929227542970908e-05, + "loss": 0.0976, + "step": 18170 + }, + { + "epoch": 1.1894013738959766, + "grad_norm": 1.027797818183899, + "learning_rate": 9.929073451660602e-05, + "loss": 0.101, + "step": 18180 + }, + { + "epoch": 1.190055610075237, + "grad_norm": 0.7733204364776611, + "learning_rate": 9.928919193981141e-05, + "loss": 0.105, + "step": 18190 + }, + { + "epoch": 1.1907098462544978, + "grad_norm": 0.874467670917511, + "learning_rate": 9.928764769937729e-05, + "loss": 0.105, + "step": 18200 + }, + { + "epoch": 1.1913640824337586, + "grad_norm": 0.8329676985740662, + "learning_rate": 9.92861017953558e-05, + "loss": 0.094, + "step": 18210 + }, + { + "epoch": 1.1920183186130193, + "grad_norm": 0.9815132021903992, + "learning_rate": 9.928455422779913e-05, + "loss": 0.1056, + "step": 18220 + }, + { + "epoch": 1.19267255479228, + "grad_norm": 0.8479218482971191, + "learning_rate": 9.92830049967595e-05, + "loss": 0.1049, + "step": 18230 + }, + { + "epoch": 1.1933267909715408, + "grad_norm": 1.3466743230819702, + "learning_rate": 9.928145410228919e-05, + "loss": 0.1031, + "step": 18240 + }, + { + "epoch": 1.1939810271508013, + "grad_norm": 0.9225918650627136, + "learning_rate": 9.927990154444056e-05, + "loss": 0.1104, + "step": 18250 + }, + { + "epoch": 1.194635263330062, + "grad_norm": 1.143968939781189, + "learning_rate": 9.927834732326602e-05, + "loss": 0.1122, + "step": 18260 + }, + { + "epoch": 1.1952894995093228, + "grad_norm": 0.9355697631835938, + "learning_rate": 9.927679143881802e-05, + "loss": 0.0969, + "step": 18270 + }, + { + "epoch": 1.1959437356885836, + "grad_norm": 0.9907440543174744, + "learning_rate": 9.927523389114908e-05, + "loss": 0.1212, + "step": 18280 + }, + { + "epoch": 1.1965979718678443, + "grad_norm": 0.8513557314872742, + "learning_rate": 9.927367468031175e-05, + "loss": 0.0925, + "step": 18290 + }, + { + "epoch": 1.197252208047105, + "grad_norm": 0.790827751159668, + "learning_rate": 9.927211380635869e-05, + "loss": 0.0964, + "step": 18300 + }, + { + "epoch": 1.1979064442263656, + "grad_norm": 1.2124277353286743, + "learning_rate": 9.927055126934256e-05, + "loss": 0.0996, + "step": 18310 + }, + { + "epoch": 1.1985606804056264, + "grad_norm": 1.0649062395095825, + "learning_rate": 9.92689870693161e-05, + "loss": 0.0976, + "step": 18320 + }, + { + "epoch": 1.199214916584887, + "grad_norm": 1.0294406414031982, + "learning_rate": 9.926742120633214e-05, + "loss": 0.1023, + "step": 18330 + }, + { + "epoch": 1.1998691527641479, + "grad_norm": 0.8996325135231018, + "learning_rate": 9.92658536804435e-05, + "loss": 0.0976, + "step": 18340 + }, + { + "epoch": 1.2005233889434086, + "grad_norm": 0.8989105820655823, + "learning_rate": 9.926428449170308e-05, + "loss": 0.1096, + "step": 18350 + }, + { + "epoch": 1.2011776251226693, + "grad_norm": 0.7884949445724487, + "learning_rate": 9.926271364016386e-05, + "loss": 0.1116, + "step": 18360 + }, + { + "epoch": 1.20183186130193, + "grad_norm": 0.8274450302124023, + "learning_rate": 9.926114112587886e-05, + "loss": 0.1015, + "step": 18370 + }, + { + "epoch": 1.2024860974811906, + "grad_norm": 1.003177285194397, + "learning_rate": 9.925956694890115e-05, + "loss": 0.1233, + "step": 18380 + }, + { + "epoch": 1.2031403336604514, + "grad_norm": 0.9113197326660156, + "learning_rate": 9.925799110928388e-05, + "loss": 0.1148, + "step": 18390 + }, + { + "epoch": 1.2037945698397121, + "grad_norm": 0.8547518849372864, + "learning_rate": 9.925641360708021e-05, + "loss": 0.1136, + "step": 18400 + }, + { + "epoch": 1.2044488060189729, + "grad_norm": 0.7965877056121826, + "learning_rate": 9.925483444234341e-05, + "loss": 0.0907, + "step": 18410 + }, + { + "epoch": 1.2051030421982336, + "grad_norm": 0.9823962450027466, + "learning_rate": 9.925325361512679e-05, + "loss": 0.0995, + "step": 18420 + }, + { + "epoch": 1.2057572783774944, + "grad_norm": 0.8240478038787842, + "learning_rate": 9.925167112548365e-05, + "loss": 0.1053, + "step": 18430 + }, + { + "epoch": 1.206411514556755, + "grad_norm": 0.8721191883087158, + "learning_rate": 9.925008697346748e-05, + "loss": 0.1069, + "step": 18440 + }, + { + "epoch": 1.2070657507360156, + "grad_norm": 0.9270473718643188, + "learning_rate": 9.924850115913169e-05, + "loss": 0.1148, + "step": 18450 + }, + { + "epoch": 1.2077199869152764, + "grad_norm": 0.7241986989974976, + "learning_rate": 9.924691368252983e-05, + "loss": 0.1004, + "step": 18460 + }, + { + "epoch": 1.2083742230945371, + "grad_norm": 0.8268777132034302, + "learning_rate": 9.924532454371549e-05, + "loss": 0.0935, + "step": 18470 + }, + { + "epoch": 1.2090284592737979, + "grad_norm": 0.8865098357200623, + "learning_rate": 9.924373374274228e-05, + "loss": 0.1058, + "step": 18480 + }, + { + "epoch": 1.2096826954530586, + "grad_norm": 0.8308011293411255, + "learning_rate": 9.924214127966391e-05, + "loss": 0.1022, + "step": 18490 + }, + { + "epoch": 1.2103369316323191, + "grad_norm": 0.8910857439041138, + "learning_rate": 9.924054715453414e-05, + "loss": 0.0998, + "step": 18500 + }, + { + "epoch": 1.21099116781158, + "grad_norm": 0.8274952173233032, + "learning_rate": 9.923895136740676e-05, + "loss": 0.1031, + "step": 18510 + }, + { + "epoch": 1.2116454039908406, + "grad_norm": 1.0838192701339722, + "learning_rate": 9.923735391833564e-05, + "loss": 0.1024, + "step": 18520 + }, + { + "epoch": 1.2122996401701014, + "grad_norm": 0.8322728276252747, + "learning_rate": 9.92357548073747e-05, + "loss": 0.1234, + "step": 18530 + }, + { + "epoch": 1.2129538763493621, + "grad_norm": 0.8210560083389282, + "learning_rate": 9.923415403457789e-05, + "loss": 0.1051, + "step": 18540 + }, + { + "epoch": 1.2136081125286229, + "grad_norm": 0.8708292245864868, + "learning_rate": 9.92325515999993e-05, + "loss": 0.0952, + "step": 18550 + }, + { + "epoch": 1.2142623487078836, + "grad_norm": 0.8271515965461731, + "learning_rate": 9.923094750369293e-05, + "loss": 0.1003, + "step": 18560 + }, + { + "epoch": 1.2149165848871442, + "grad_norm": 0.7365519404411316, + "learning_rate": 9.9229341745713e-05, + "loss": 0.1061, + "step": 18570 + }, + { + "epoch": 1.215570821066405, + "grad_norm": 1.1527734994888306, + "learning_rate": 9.922773432611366e-05, + "loss": 0.1127, + "step": 18580 + }, + { + "epoch": 1.2162250572456657, + "grad_norm": 0.7652072310447693, + "learning_rate": 9.922612524494919e-05, + "loss": 0.1048, + "step": 18590 + }, + { + "epoch": 1.2168792934249264, + "grad_norm": 0.8632088899612427, + "learning_rate": 9.92245145022739e-05, + "loss": 0.101, + "step": 18600 + }, + { + "epoch": 1.2175335296041871, + "grad_norm": 0.8901262283325195, + "learning_rate": 9.922290209814214e-05, + "loss": 0.0977, + "step": 18610 + }, + { + "epoch": 1.218187765783448, + "grad_norm": 0.8626736402511597, + "learning_rate": 9.922128803260833e-05, + "loss": 0.0963, + "step": 18620 + }, + { + "epoch": 1.2188420019627086, + "grad_norm": 0.8524816632270813, + "learning_rate": 9.921967230572699e-05, + "loss": 0.1004, + "step": 18630 + }, + { + "epoch": 1.2194962381419692, + "grad_norm": 0.6984155774116516, + "learning_rate": 9.92180549175526e-05, + "loss": 0.1046, + "step": 18640 + }, + { + "epoch": 1.22015047432123, + "grad_norm": 0.9021031856536865, + "learning_rate": 9.92164358681398e-05, + "loss": 0.096, + "step": 18650 + }, + { + "epoch": 1.2208047105004907, + "grad_norm": 0.8669722676277161, + "learning_rate": 9.921481515754321e-05, + "loss": 0.1071, + "step": 18660 + }, + { + "epoch": 1.2214589466797514, + "grad_norm": 0.8371648788452148, + "learning_rate": 9.921319278581752e-05, + "loss": 0.1051, + "step": 18670 + }, + { + "epoch": 1.2221131828590122, + "grad_norm": 0.8591510653495789, + "learning_rate": 9.921156875301751e-05, + "loss": 0.0984, + "step": 18680 + }, + { + "epoch": 1.222767419038273, + "grad_norm": 0.7590886950492859, + "learning_rate": 9.920994305919801e-05, + "loss": 0.1065, + "step": 18690 + }, + { + "epoch": 1.2234216552175334, + "grad_norm": 0.9001472592353821, + "learning_rate": 9.920831570441387e-05, + "loss": 0.0941, + "step": 18700 + }, + { + "epoch": 1.2240758913967942, + "grad_norm": 1.0100491046905518, + "learning_rate": 9.920668668872002e-05, + "loss": 0.0984, + "step": 18710 + }, + { + "epoch": 1.224730127576055, + "grad_norm": 0.9002352952957153, + "learning_rate": 9.920505601217143e-05, + "loss": 0.1018, + "step": 18720 + }, + { + "epoch": 1.2253843637553157, + "grad_norm": 0.8079519867897034, + "learning_rate": 9.920342367482318e-05, + "loss": 0.1088, + "step": 18730 + }, + { + "epoch": 1.2260385999345764, + "grad_norm": 0.9378504157066345, + "learning_rate": 9.920178967673031e-05, + "loss": 0.1105, + "step": 18740 + }, + { + "epoch": 1.2266928361138372, + "grad_norm": 0.9143635034561157, + "learning_rate": 9.920015401794803e-05, + "loss": 0.0919, + "step": 18750 + }, + { + "epoch": 1.2273470722930977, + "grad_norm": 0.8441551923751831, + "learning_rate": 9.91985166985315e-05, + "loss": 0.097, + "step": 18760 + }, + { + "epoch": 1.2280013084723584, + "grad_norm": 0.8805584907531738, + "learning_rate": 9.919687771853601e-05, + "loss": 0.1003, + "step": 18770 + }, + { + "epoch": 1.2286555446516192, + "grad_norm": 1.0052324533462524, + "learning_rate": 9.919523707801687e-05, + "loss": 0.1009, + "step": 18780 + }, + { + "epoch": 1.22930978083088, + "grad_norm": 0.9868443608283997, + "learning_rate": 9.919359477702948e-05, + "loss": 0.1163, + "step": 18790 + }, + { + "epoch": 1.2299640170101407, + "grad_norm": 1.144353985786438, + "learning_rate": 9.919195081562924e-05, + "loss": 0.1081, + "step": 18800 + }, + { + "epoch": 1.2306182531894014, + "grad_norm": 1.0622575283050537, + "learning_rate": 9.919030519387164e-05, + "loss": 0.0981, + "step": 18810 + }, + { + "epoch": 1.2312724893686622, + "grad_norm": 0.8485022783279419, + "learning_rate": 9.918865791181224e-05, + "loss": 0.1029, + "step": 18820 + }, + { + "epoch": 1.2319267255479227, + "grad_norm": 1.022147536277771, + "learning_rate": 9.918700896950664e-05, + "loss": 0.1053, + "step": 18830 + }, + { + "epoch": 1.2325809617271835, + "grad_norm": 0.8381478190422058, + "learning_rate": 9.918535836701047e-05, + "loss": 0.1024, + "step": 18840 + }, + { + "epoch": 1.2332351979064442, + "grad_norm": 0.7995995879173279, + "learning_rate": 9.918370610437948e-05, + "loss": 0.0961, + "step": 18850 + }, + { + "epoch": 1.233889434085705, + "grad_norm": 1.0238806009292603, + "learning_rate": 9.91820521816694e-05, + "loss": 0.1049, + "step": 18860 + }, + { + "epoch": 1.2345436702649657, + "grad_norm": 1.0174421072006226, + "learning_rate": 9.91803965989361e-05, + "loss": 0.0893, + "step": 18870 + }, + { + "epoch": 1.2351979064442264, + "grad_norm": 1.1415332555770874, + "learning_rate": 9.917873935623542e-05, + "loss": 0.1049, + "step": 18880 + }, + { + "epoch": 1.2358521426234872, + "grad_norm": 0.7645314931869507, + "learning_rate": 9.91770804536233e-05, + "loss": 0.1065, + "step": 18890 + }, + { + "epoch": 1.2365063788027477, + "grad_norm": 0.9801765084266663, + "learning_rate": 9.917541989115578e-05, + "loss": 0.1038, + "step": 18900 + }, + { + "epoch": 1.2371606149820085, + "grad_norm": 0.9121055603027344, + "learning_rate": 9.917375766888883e-05, + "loss": 0.1095, + "step": 18910 + }, + { + "epoch": 1.2378148511612692, + "grad_norm": 0.7746009230613708, + "learning_rate": 9.917209378687862e-05, + "loss": 0.0968, + "step": 18920 + }, + { + "epoch": 1.23846908734053, + "grad_norm": 0.8752132058143616, + "learning_rate": 9.917042824518127e-05, + "loss": 0.1014, + "step": 18930 + }, + { + "epoch": 1.2391233235197907, + "grad_norm": 0.8438636660575867, + "learning_rate": 9.916876104385303e-05, + "loss": 0.1035, + "step": 18940 + }, + { + "epoch": 1.2397775596990512, + "grad_norm": 1.1780970096588135, + "learning_rate": 9.916709218295015e-05, + "loss": 0.1077, + "step": 18950 + }, + { + "epoch": 1.240431795878312, + "grad_norm": 0.9799401164054871, + "learning_rate": 9.916542166252895e-05, + "loss": 0.1137, + "step": 18960 + }, + { + "epoch": 1.2410860320575727, + "grad_norm": 0.6996291875839233, + "learning_rate": 9.916374948264584e-05, + "loss": 0.1007, + "step": 18970 + }, + { + "epoch": 1.2417402682368335, + "grad_norm": 0.7507869601249695, + "learning_rate": 9.916207564335726e-05, + "loss": 0.0997, + "step": 18980 + }, + { + "epoch": 1.2423945044160942, + "grad_norm": 0.8153190016746521, + "learning_rate": 9.916040014471968e-05, + "loss": 0.0974, + "step": 18990 + }, + { + "epoch": 1.243048740595355, + "grad_norm": 0.7431339621543884, + "learning_rate": 9.915872298678966e-05, + "loss": 0.092, + "step": 19000 + }, + { + "epoch": 1.2437029767746157, + "grad_norm": 1.1183733940124512, + "learning_rate": 9.915704416962382e-05, + "loss": 0.1045, + "step": 19010 + }, + { + "epoch": 1.2443572129538762, + "grad_norm": 1.058430790901184, + "learning_rate": 9.915536369327883e-05, + "loss": 0.1204, + "step": 19020 + }, + { + "epoch": 1.245011449133137, + "grad_norm": 0.9192705750465393, + "learning_rate": 9.91536815578114e-05, + "loss": 0.113, + "step": 19030 + }, + { + "epoch": 1.2456656853123977, + "grad_norm": 0.899825394153595, + "learning_rate": 9.91519977632783e-05, + "loss": 0.0965, + "step": 19040 + }, + { + "epoch": 1.2463199214916585, + "grad_norm": 0.8516882061958313, + "learning_rate": 9.915031230973637e-05, + "loss": 0.1042, + "step": 19050 + }, + { + "epoch": 1.2469741576709192, + "grad_norm": 0.8238911032676697, + "learning_rate": 9.914862519724251e-05, + "loss": 0.1051, + "step": 19060 + }, + { + "epoch": 1.24762839385018, + "grad_norm": 0.8177777528762817, + "learning_rate": 9.914693642585364e-05, + "loss": 0.1074, + "step": 19070 + }, + { + "epoch": 1.2482826300294407, + "grad_norm": 1.2330362796783447, + "learning_rate": 9.914524599562677e-05, + "loss": 0.1078, + "step": 19080 + }, + { + "epoch": 1.2489368662087013, + "grad_norm": 1.126373529434204, + "learning_rate": 9.914355390661896e-05, + "loss": 0.1013, + "step": 19090 + }, + { + "epoch": 1.249591102387962, + "grad_norm": 0.8344969153404236, + "learning_rate": 9.914186015888733e-05, + "loss": 0.1009, + "step": 19100 + }, + { + "epoch": 1.2502453385672228, + "grad_norm": 0.8182376027107239, + "learning_rate": 9.914016475248904e-05, + "loss": 0.0968, + "step": 19110 + }, + { + "epoch": 1.2508995747464835, + "grad_norm": 0.9014137387275696, + "learning_rate": 9.913846768748132e-05, + "loss": 0.1089, + "step": 19120 + }, + { + "epoch": 1.2515538109257442, + "grad_norm": 0.9868068695068359, + "learning_rate": 9.913676896392144e-05, + "loss": 0.1141, + "step": 19130 + }, + { + "epoch": 1.2522080471050048, + "grad_norm": 1.0246328115463257, + "learning_rate": 9.913506858186673e-05, + "loss": 0.105, + "step": 19140 + }, + { + "epoch": 1.2528622832842657, + "grad_norm": 0.783169686794281, + "learning_rate": 9.913336654137459e-05, + "loss": 0.0996, + "step": 19150 + }, + { + "epoch": 1.2535165194635263, + "grad_norm": 0.8389064073562622, + "learning_rate": 9.913166284250251e-05, + "loss": 0.089, + "step": 19160 + }, + { + "epoch": 1.254170755642787, + "grad_norm": 0.8309929966926575, + "learning_rate": 9.912995748530792e-05, + "loss": 0.0968, + "step": 19170 + }, + { + "epoch": 1.2548249918220478, + "grad_norm": 0.9536809325218201, + "learning_rate": 9.912825046984842e-05, + "loss": 0.1027, + "step": 19180 + }, + { + "epoch": 1.2554792280013085, + "grad_norm": 1.0172020196914673, + "learning_rate": 9.912654179618164e-05, + "loss": 0.1111, + "step": 19190 + }, + { + "epoch": 1.2561334641805693, + "grad_norm": 0.8524767160415649, + "learning_rate": 9.912483146436522e-05, + "loss": 0.0941, + "step": 19200 + }, + { + "epoch": 1.2567877003598298, + "grad_norm": 0.9486467242240906, + "learning_rate": 9.912311947445692e-05, + "loss": 0.1148, + "step": 19210 + }, + { + "epoch": 1.2574419365390905, + "grad_norm": 0.9282219409942627, + "learning_rate": 9.91214058265145e-05, + "loss": 0.1087, + "step": 19220 + }, + { + "epoch": 1.2580961727183513, + "grad_norm": 0.9173193573951721, + "learning_rate": 9.911969052059579e-05, + "loss": 0.1129, + "step": 19230 + }, + { + "epoch": 1.258750408897612, + "grad_norm": 0.8629287481307983, + "learning_rate": 9.911797355675874e-05, + "loss": 0.109, + "step": 19240 + }, + { + "epoch": 1.2594046450768728, + "grad_norm": 0.8385807871818542, + "learning_rate": 9.911625493506124e-05, + "loss": 0.1052, + "step": 19250 + }, + { + "epoch": 1.2600588812561335, + "grad_norm": 0.7391451597213745, + "learning_rate": 9.911453465556133e-05, + "loss": 0.1045, + "step": 19260 + }, + { + "epoch": 1.2607131174353943, + "grad_norm": 0.8053876757621765, + "learning_rate": 9.911281271831707e-05, + "loss": 0.1016, + "step": 19270 + }, + { + "epoch": 1.2613673536146548, + "grad_norm": 0.8911042809486389, + "learning_rate": 9.911108912338657e-05, + "loss": 0.1077, + "step": 19280 + }, + { + "epoch": 1.2620215897939155, + "grad_norm": 0.8589571118354797, + "learning_rate": 9.910936387082802e-05, + "loss": 0.1093, + "step": 19290 + }, + { + "epoch": 1.2626758259731763, + "grad_norm": 0.8094229102134705, + "learning_rate": 9.910763696069965e-05, + "loss": 0.1096, + "step": 19300 + }, + { + "epoch": 1.263330062152437, + "grad_norm": 0.8509666323661804, + "learning_rate": 9.910590839305973e-05, + "loss": 0.1072, + "step": 19310 + }, + { + "epoch": 1.2639842983316978, + "grad_norm": 0.9664101600646973, + "learning_rate": 9.910417816796662e-05, + "loss": 0.1019, + "step": 19320 + }, + { + "epoch": 1.2646385345109585, + "grad_norm": 0.8401138782501221, + "learning_rate": 9.910244628547872e-05, + "loss": 0.1015, + "step": 19330 + }, + { + "epoch": 1.2652927706902193, + "grad_norm": 0.7719675898551941, + "learning_rate": 9.910071274565449e-05, + "loss": 0.1116, + "step": 19340 + }, + { + "epoch": 1.2659470068694798, + "grad_norm": 1.0611908435821533, + "learning_rate": 9.909897754855242e-05, + "loss": 0.0963, + "step": 19350 + }, + { + "epoch": 1.2666012430487406, + "grad_norm": 0.9017995595932007, + "learning_rate": 9.90972406942311e-05, + "loss": 0.1046, + "step": 19360 + }, + { + "epoch": 1.2672554792280013, + "grad_norm": 0.7480419278144836, + "learning_rate": 9.909550218274915e-05, + "loss": 0.0987, + "step": 19370 + }, + { + "epoch": 1.267909715407262, + "grad_norm": 0.949092447757721, + "learning_rate": 9.909376201416522e-05, + "loss": 0.1129, + "step": 19380 + }, + { + "epoch": 1.2685639515865228, + "grad_norm": 0.8834240436553955, + "learning_rate": 9.909202018853809e-05, + "loss": 0.1007, + "step": 19390 + }, + { + "epoch": 1.2692181877657833, + "grad_norm": 0.8566089272499084, + "learning_rate": 9.909027670592652e-05, + "loss": 0.1074, + "step": 19400 + }, + { + "epoch": 1.2698724239450443, + "grad_norm": 0.8526421785354614, + "learning_rate": 9.908853156638937e-05, + "loss": 0.116, + "step": 19410 + }, + { + "epoch": 1.2705266601243048, + "grad_norm": 0.8203496932983398, + "learning_rate": 9.908678476998555e-05, + "loss": 0.098, + "step": 19420 + }, + { + "epoch": 1.2711808963035656, + "grad_norm": 0.8378643989562988, + "learning_rate": 9.908503631677399e-05, + "loss": 0.1081, + "step": 19430 + }, + { + "epoch": 1.2718351324828263, + "grad_norm": 1.0013712644577026, + "learning_rate": 9.908328620681373e-05, + "loss": 0.1006, + "step": 19440 + }, + { + "epoch": 1.272489368662087, + "grad_norm": 0.9886232018470764, + "learning_rate": 9.908153444016385e-05, + "loss": 0.118, + "step": 19450 + }, + { + "epoch": 1.2731436048413478, + "grad_norm": 0.9862875938415527, + "learning_rate": 9.907978101688344e-05, + "loss": 0.1166, + "step": 19460 + }, + { + "epoch": 1.2737978410206083, + "grad_norm": 0.935799241065979, + "learning_rate": 9.907802593703173e-05, + "loss": 0.0968, + "step": 19470 + }, + { + "epoch": 1.274452077199869, + "grad_norm": 0.8719044923782349, + "learning_rate": 9.90762692006679e-05, + "loss": 0.0947, + "step": 19480 + }, + { + "epoch": 1.2751063133791298, + "grad_norm": 0.7218495607376099, + "learning_rate": 9.90745108078513e-05, + "loss": 0.0911, + "step": 19490 + }, + { + "epoch": 1.2757605495583906, + "grad_norm": 0.8420484066009521, + "learning_rate": 9.907275075864127e-05, + "loss": 0.1092, + "step": 19500 + }, + { + "epoch": 1.2764147857376513, + "grad_norm": 0.808154821395874, + "learning_rate": 9.907098905309718e-05, + "loss": 0.0971, + "step": 19510 + }, + { + "epoch": 1.277069021916912, + "grad_norm": 0.7025526165962219, + "learning_rate": 9.906922569127853e-05, + "loss": 0.108, + "step": 19520 + }, + { + "epoch": 1.2777232580961728, + "grad_norm": 0.9783757925033569, + "learning_rate": 9.906746067324481e-05, + "loss": 0.1065, + "step": 19530 + }, + { + "epoch": 1.2783774942754333, + "grad_norm": 0.8030864000320435, + "learning_rate": 9.906569399905561e-05, + "loss": 0.1054, + "step": 19540 + }, + { + "epoch": 1.279031730454694, + "grad_norm": 0.8669215440750122, + "learning_rate": 9.906392566877057e-05, + "loss": 0.0978, + "step": 19550 + }, + { + "epoch": 1.2796859666339548, + "grad_norm": 0.8483967781066895, + "learning_rate": 9.906215568244935e-05, + "loss": 0.1098, + "step": 19560 + }, + { + "epoch": 1.2803402028132156, + "grad_norm": 0.7597243785858154, + "learning_rate": 9.906038404015171e-05, + "loss": 0.106, + "step": 19570 + }, + { + "epoch": 1.2809944389924763, + "grad_norm": 0.9336119294166565, + "learning_rate": 9.905861074193745e-05, + "loss": 0.0995, + "step": 19580 + }, + { + "epoch": 1.2816486751717369, + "grad_norm": 0.8763530254364014, + "learning_rate": 9.905683578786641e-05, + "loss": 0.0976, + "step": 19590 + }, + { + "epoch": 1.2823029113509978, + "grad_norm": 0.9109959602355957, + "learning_rate": 9.905505917799851e-05, + "loss": 0.1021, + "step": 19600 + }, + { + "epoch": 1.2829571475302584, + "grad_norm": 1.0181946754455566, + "learning_rate": 9.90532809123937e-05, + "loss": 0.1142, + "step": 19610 + }, + { + "epoch": 1.283611383709519, + "grad_norm": 1.0679359436035156, + "learning_rate": 9.905150099111202e-05, + "loss": 0.1152, + "step": 19620 + }, + { + "epoch": 1.2842656198887799, + "grad_norm": 0.8650368452072144, + "learning_rate": 9.904971941421355e-05, + "loss": 0.0966, + "step": 19630 + }, + { + "epoch": 1.2849198560680406, + "grad_norm": 1.0523502826690674, + "learning_rate": 9.90479361817584e-05, + "loss": 0.104, + "step": 19640 + }, + { + "epoch": 1.2855740922473013, + "grad_norm": 0.976112961769104, + "learning_rate": 9.904615129380676e-05, + "loss": 0.0967, + "step": 19650 + }, + { + "epoch": 1.2862283284265619, + "grad_norm": 0.8600479364395142, + "learning_rate": 9.904436475041891e-05, + "loss": 0.105, + "step": 19660 + }, + { + "epoch": 1.2868825646058226, + "grad_norm": 1.066514015197754, + "learning_rate": 9.904257655165512e-05, + "loss": 0.1022, + "step": 19670 + }, + { + "epoch": 1.2875368007850834, + "grad_norm": 0.9571858644485474, + "learning_rate": 9.904078669757575e-05, + "loss": 0.1039, + "step": 19680 + }, + { + "epoch": 1.2881910369643441, + "grad_norm": 0.8673845529556274, + "learning_rate": 9.90389951882412e-05, + "loss": 0.0921, + "step": 19690 + }, + { + "epoch": 1.2888452731436049, + "grad_norm": 0.995274007320404, + "learning_rate": 9.903720202371198e-05, + "loss": 0.1015, + "step": 19700 + }, + { + "epoch": 1.2894995093228656, + "grad_norm": 0.9427167773246765, + "learning_rate": 9.903540720404856e-05, + "loss": 0.0934, + "step": 19710 + }, + { + "epoch": 1.2901537455021264, + "grad_norm": 0.8194208145141602, + "learning_rate": 9.903361072931156e-05, + "loss": 0.0943, + "step": 19720 + }, + { + "epoch": 1.2908079816813869, + "grad_norm": 0.8158044815063477, + "learning_rate": 9.903181259956161e-05, + "loss": 0.1123, + "step": 19730 + }, + { + "epoch": 1.2914622178606476, + "grad_norm": 0.7874143123626709, + "learning_rate": 9.903001281485937e-05, + "loss": 0.0954, + "step": 19740 + }, + { + "epoch": 1.2921164540399084, + "grad_norm": 1.0694737434387207, + "learning_rate": 9.902821137526564e-05, + "loss": 0.1028, + "step": 19750 + }, + { + "epoch": 1.2927706902191691, + "grad_norm": 0.9189092516899109, + "learning_rate": 9.902640828084118e-05, + "loss": 0.1016, + "step": 19760 + }, + { + "epoch": 1.2934249263984299, + "grad_norm": 0.7503056526184082, + "learning_rate": 9.902460353164687e-05, + "loss": 0.0958, + "step": 19770 + }, + { + "epoch": 1.2940791625776906, + "grad_norm": 1.0281898975372314, + "learning_rate": 9.90227971277436e-05, + "loss": 0.0997, + "step": 19780 + }, + { + "epoch": 1.2947333987569514, + "grad_norm": 0.817094624042511, + "learning_rate": 9.902098906919239e-05, + "loss": 0.1076, + "step": 19790 + }, + { + "epoch": 1.295387634936212, + "grad_norm": 0.7925615906715393, + "learning_rate": 9.901917935605423e-05, + "loss": 0.0986, + "step": 19800 + }, + { + "epoch": 1.2960418711154726, + "grad_norm": 0.8204017877578735, + "learning_rate": 9.901736798839018e-05, + "loss": 0.0976, + "step": 19810 + }, + { + "epoch": 1.2966961072947334, + "grad_norm": 0.7727102637290955, + "learning_rate": 9.901555496626145e-05, + "loss": 0.0922, + "step": 19820 + }, + { + "epoch": 1.2973503434739941, + "grad_norm": 0.9235280156135559, + "learning_rate": 9.901374028972916e-05, + "loss": 0.1103, + "step": 19830 + }, + { + "epoch": 1.2980045796532549, + "grad_norm": 0.8759993314743042, + "learning_rate": 9.901192395885461e-05, + "loss": 0.0985, + "step": 19840 + }, + { + "epoch": 1.2986588158325154, + "grad_norm": 0.7741413116455078, + "learning_rate": 9.901010597369907e-05, + "loss": 0.0985, + "step": 19850 + }, + { + "epoch": 1.2993130520117764, + "grad_norm": 0.8373982906341553, + "learning_rate": 9.900828633432393e-05, + "loss": 0.1054, + "step": 19860 + }, + { + "epoch": 1.299967288191037, + "grad_norm": 0.708233654499054, + "learning_rate": 9.90064650407906e-05, + "loss": 0.1, + "step": 19870 + }, + { + "epoch": 1.3006215243702977, + "grad_norm": 0.8219735026359558, + "learning_rate": 9.900464209316054e-05, + "loss": 0.1052, + "step": 19880 + }, + { + "epoch": 1.3012757605495584, + "grad_norm": 0.7485021352767944, + "learning_rate": 9.900281749149531e-05, + "loss": 0.0986, + "step": 19890 + }, + { + "epoch": 1.3019299967288191, + "grad_norm": 0.8410215973854065, + "learning_rate": 9.900099123585646e-05, + "loss": 0.1012, + "step": 19900 + }, + { + "epoch": 1.30258423290808, + "grad_norm": 0.9873031377792358, + "learning_rate": 9.899916332630565e-05, + "loss": 0.1031, + "step": 19910 + }, + { + "epoch": 1.3032384690873404, + "grad_norm": 0.7580830454826355, + "learning_rate": 9.899733376290458e-05, + "loss": 0.1084, + "step": 19920 + }, + { + "epoch": 1.3038927052666012, + "grad_norm": 0.8297830820083618, + "learning_rate": 9.899550254571499e-05, + "loss": 0.1034, + "step": 19930 + }, + { + "epoch": 1.304546941445862, + "grad_norm": 0.888883113861084, + "learning_rate": 9.899366967479868e-05, + "loss": 0.0951, + "step": 19940 + }, + { + "epoch": 1.3052011776251227, + "grad_norm": 0.7784373760223389, + "learning_rate": 9.899183515021755e-05, + "loss": 0.107, + "step": 19950 + }, + { + "epoch": 1.3058554138043834, + "grad_norm": 0.7872931361198425, + "learning_rate": 9.898999897203347e-05, + "loss": 0.0941, + "step": 19960 + }, + { + "epoch": 1.3065096499836442, + "grad_norm": 0.8830298781394958, + "learning_rate": 9.898816114030846e-05, + "loss": 0.1035, + "step": 19970 + }, + { + "epoch": 1.307163886162905, + "grad_norm": 0.8231805562973022, + "learning_rate": 9.898632165510455e-05, + "loss": 0.1061, + "step": 19980 + }, + { + "epoch": 1.3078181223421654, + "grad_norm": 0.8644198179244995, + "learning_rate": 9.89844805164838e-05, + "loss": 0.1068, + "step": 19990 + }, + { + "epoch": 1.3084723585214262, + "grad_norm": 0.7423153519630432, + "learning_rate": 9.898263772450836e-05, + "loss": 0.102, + "step": 20000 + }, + { + "epoch": 1.309126594700687, + "grad_norm": 0.9147112965583801, + "learning_rate": 9.898079327924044e-05, + "loss": 0.1079, + "step": 20010 + }, + { + "epoch": 1.3097808308799477, + "grad_norm": 0.8351669907569885, + "learning_rate": 9.897894718074229e-05, + "loss": 0.1015, + "step": 20020 + }, + { + "epoch": 1.3104350670592084, + "grad_norm": 0.9719964861869812, + "learning_rate": 9.897709942907623e-05, + "loss": 0.096, + "step": 20030 + }, + { + "epoch": 1.311089303238469, + "grad_norm": 1.0228232145309448, + "learning_rate": 9.897525002430458e-05, + "loss": 0.1035, + "step": 20040 + }, + { + "epoch": 1.31174353941773, + "grad_norm": 0.8931130766868591, + "learning_rate": 9.897339896648985e-05, + "loss": 0.1063, + "step": 20050 + }, + { + "epoch": 1.3123977755969904, + "grad_norm": 0.992167592048645, + "learning_rate": 9.897154625569443e-05, + "loss": 0.0942, + "step": 20060 + }, + { + "epoch": 1.3130520117762512, + "grad_norm": 0.7140622735023499, + "learning_rate": 9.89696918919809e-05, + "loss": 0.0967, + "step": 20070 + }, + { + "epoch": 1.313706247955512, + "grad_norm": 0.7795881032943726, + "learning_rate": 9.896783587541186e-05, + "loss": 0.105, + "step": 20080 + }, + { + "epoch": 1.3143604841347727, + "grad_norm": 0.8552742004394531, + "learning_rate": 9.896597820604992e-05, + "loss": 0.0931, + "step": 20090 + }, + { + "epoch": 1.3150147203140334, + "grad_norm": 0.7586219310760498, + "learning_rate": 9.89641188839578e-05, + "loss": 0.1137, + "step": 20100 + }, + { + "epoch": 1.315668956493294, + "grad_norm": 0.9405565857887268, + "learning_rate": 9.896225790919826e-05, + "loss": 0.1114, + "step": 20110 + }, + { + "epoch": 1.3163231926725547, + "grad_norm": 0.929673969745636, + "learning_rate": 9.89603952818341e-05, + "loss": 0.0999, + "step": 20120 + }, + { + "epoch": 1.3169774288518155, + "grad_norm": 0.8156664371490479, + "learning_rate": 9.89585310019282e-05, + "loss": 0.1036, + "step": 20130 + }, + { + "epoch": 1.3176316650310762, + "grad_norm": 0.9651734232902527, + "learning_rate": 9.895666506954347e-05, + "loss": 0.104, + "step": 20140 + }, + { + "epoch": 1.318285901210337, + "grad_norm": 1.05132257938385, + "learning_rate": 9.895479748474292e-05, + "loss": 0.1016, + "step": 20150 + }, + { + "epoch": 1.3189401373895977, + "grad_norm": 0.8636537790298462, + "learning_rate": 9.895292824758955e-05, + "loss": 0.1022, + "step": 20160 + }, + { + "epoch": 1.3195943735688584, + "grad_norm": 0.7908622026443481, + "learning_rate": 9.895105735814647e-05, + "loss": 0.1048, + "step": 20170 + }, + { + "epoch": 1.320248609748119, + "grad_norm": 0.8223440051078796, + "learning_rate": 9.894918481647684e-05, + "loss": 0.0958, + "step": 20180 + }, + { + "epoch": 1.3209028459273797, + "grad_norm": 0.9097592234611511, + "learning_rate": 9.894731062264383e-05, + "loss": 0.1036, + "step": 20190 + }, + { + "epoch": 1.3215570821066405, + "grad_norm": 0.8000107407569885, + "learning_rate": 9.894543477671072e-05, + "loss": 0.0993, + "step": 20200 + }, + { + "epoch": 1.3222113182859012, + "grad_norm": 0.8129693865776062, + "learning_rate": 9.894355727874083e-05, + "loss": 0.113, + "step": 20210 + }, + { + "epoch": 1.322865554465162, + "grad_norm": 0.7431675791740417, + "learning_rate": 9.894167812879751e-05, + "loss": 0.1008, + "step": 20220 + }, + { + "epoch": 1.3235197906444227, + "grad_norm": 1.0079487562179565, + "learning_rate": 9.893979732694421e-05, + "loss": 0.1075, + "step": 20230 + }, + { + "epoch": 1.3241740268236835, + "grad_norm": 0.7943064570426941, + "learning_rate": 9.89379148732444e-05, + "loss": 0.1154, + "step": 20240 + }, + { + "epoch": 1.324828263002944, + "grad_norm": 0.7072292566299438, + "learning_rate": 9.893603076776162e-05, + "loss": 0.113, + "step": 20250 + }, + { + "epoch": 1.3254824991822047, + "grad_norm": 0.8471561074256897, + "learning_rate": 9.893414501055947e-05, + "loss": 0.1068, + "step": 20260 + }, + { + "epoch": 1.3261367353614655, + "grad_norm": 0.842134416103363, + "learning_rate": 9.893225760170157e-05, + "loss": 0.1066, + "step": 20270 + }, + { + "epoch": 1.3267909715407262, + "grad_norm": 0.836654543876648, + "learning_rate": 9.893036854125166e-05, + "loss": 0.099, + "step": 20280 + }, + { + "epoch": 1.327445207719987, + "grad_norm": 0.8249868750572205, + "learning_rate": 9.892847782927348e-05, + "loss": 0.0977, + "step": 20290 + }, + { + "epoch": 1.3280994438992475, + "grad_norm": 0.7410039305686951, + "learning_rate": 9.892658546583087e-05, + "loss": 0.114, + "step": 20300 + }, + { + "epoch": 1.3287536800785085, + "grad_norm": 0.7557688355445862, + "learning_rate": 9.892469145098766e-05, + "loss": 0.0889, + "step": 20310 + }, + { + "epoch": 1.329407916257769, + "grad_norm": 1.1154882907867432, + "learning_rate": 9.892279578480783e-05, + "loss": 0.0977, + "step": 20320 + }, + { + "epoch": 1.3300621524370297, + "grad_norm": 1.0034886598587036, + "learning_rate": 9.892089846735533e-05, + "loss": 0.0942, + "step": 20330 + }, + { + "epoch": 1.3307163886162905, + "grad_norm": 0.837894082069397, + "learning_rate": 9.891899949869418e-05, + "loss": 0.1187, + "step": 20340 + }, + { + "epoch": 1.3313706247955512, + "grad_norm": 0.8454033732414246, + "learning_rate": 9.891709887888853e-05, + "loss": 0.1037, + "step": 20350 + }, + { + "epoch": 1.332024860974812, + "grad_norm": 0.8291163444519043, + "learning_rate": 9.89151966080025e-05, + "loss": 0.1154, + "step": 20360 + }, + { + "epoch": 1.3326790971540725, + "grad_norm": 0.9798288941383362, + "learning_rate": 9.89132926861003e-05, + "loss": 0.1007, + "step": 20370 + }, + { + "epoch": 1.3333333333333333, + "grad_norm": 1.067610740661621, + "learning_rate": 9.891138711324619e-05, + "loss": 0.0952, + "step": 20380 + }, + { + "epoch": 1.333987569512594, + "grad_norm": 0.8580688834190369, + "learning_rate": 9.89094798895045e-05, + "loss": 0.0965, + "step": 20390 + }, + { + "epoch": 1.3346418056918548, + "grad_norm": 0.9280464053153992, + "learning_rate": 9.890757101493958e-05, + "loss": 0.1037, + "step": 20400 + }, + { + "epoch": 1.3352960418711155, + "grad_norm": 0.8157536387443542, + "learning_rate": 9.890566048961587e-05, + "loss": 0.097, + "step": 20410 + }, + { + "epoch": 1.3359502780503762, + "grad_norm": 0.8738055229187012, + "learning_rate": 9.890374831359787e-05, + "loss": 0.0991, + "step": 20420 + }, + { + "epoch": 1.336604514229637, + "grad_norm": 0.9500564932823181, + "learning_rate": 9.89018344869501e-05, + "loss": 0.1093, + "step": 20430 + }, + { + "epoch": 1.3372587504088975, + "grad_norm": 0.8507956266403198, + "learning_rate": 9.889991900973717e-05, + "loss": 0.1148, + "step": 20440 + }, + { + "epoch": 1.3379129865881583, + "grad_norm": 0.8578815460205078, + "learning_rate": 9.889800188202374e-05, + "loss": 0.1032, + "step": 20450 + }, + { + "epoch": 1.338567222767419, + "grad_norm": 0.8666653633117676, + "learning_rate": 9.889608310387449e-05, + "loss": 0.1048, + "step": 20460 + }, + { + "epoch": 1.3392214589466798, + "grad_norm": 0.7998878359794617, + "learning_rate": 9.88941626753542e-05, + "loss": 0.103, + "step": 20470 + }, + { + "epoch": 1.3398756951259405, + "grad_norm": 0.8280777931213379, + "learning_rate": 9.889224059652771e-05, + "loss": 0.107, + "step": 20480 + }, + { + "epoch": 1.340529931305201, + "grad_norm": 0.9405080676078796, + "learning_rate": 9.889031686745987e-05, + "loss": 0.1129, + "step": 20490 + }, + { + "epoch": 1.341184167484462, + "grad_norm": 1.0205219984054565, + "learning_rate": 9.88883914882156e-05, + "loss": 0.101, + "step": 20500 + }, + { + "epoch": 1.3418384036637225, + "grad_norm": 0.8715194463729858, + "learning_rate": 9.888646445885991e-05, + "loss": 0.1148, + "step": 20510 + }, + { + "epoch": 1.3424926398429833, + "grad_norm": 0.9229245185852051, + "learning_rate": 9.888453577945784e-05, + "loss": 0.0971, + "step": 20520 + }, + { + "epoch": 1.343146876022244, + "grad_norm": 0.9182971119880676, + "learning_rate": 9.888260545007448e-05, + "loss": 0.0908, + "step": 20530 + }, + { + "epoch": 1.3438011122015048, + "grad_norm": 0.9435850977897644, + "learning_rate": 9.888067347077499e-05, + "loss": 0.1113, + "step": 20540 + }, + { + "epoch": 1.3444553483807655, + "grad_norm": 0.9361865520477295, + "learning_rate": 9.887873984162457e-05, + "loss": 0.1119, + "step": 20550 + }, + { + "epoch": 1.345109584560026, + "grad_norm": 0.9154676198959351, + "learning_rate": 9.887680456268848e-05, + "loss": 0.1064, + "step": 20560 + }, + { + "epoch": 1.345763820739287, + "grad_norm": 1.0098940134048462, + "learning_rate": 9.887486763403207e-05, + "loss": 0.0951, + "step": 20570 + }, + { + "epoch": 1.3464180569185475, + "grad_norm": 0.7353399991989136, + "learning_rate": 9.88729290557207e-05, + "loss": 0.0975, + "step": 20580 + }, + { + "epoch": 1.3470722930978083, + "grad_norm": 0.7690742611885071, + "learning_rate": 9.88709888278198e-05, + "loss": 0.0931, + "step": 20590 + }, + { + "epoch": 1.347726529277069, + "grad_norm": 0.780418872833252, + "learning_rate": 9.886904695039484e-05, + "loss": 0.0957, + "step": 20600 + }, + { + "epoch": 1.3483807654563298, + "grad_norm": 0.817748486995697, + "learning_rate": 9.88671034235114e-05, + "loss": 0.1011, + "step": 20610 + }, + { + "epoch": 1.3490350016355905, + "grad_norm": 0.9198586940765381, + "learning_rate": 9.886515824723505e-05, + "loss": 0.0976, + "step": 20620 + }, + { + "epoch": 1.349689237814851, + "grad_norm": 0.8738722205162048, + "learning_rate": 9.886321142163147e-05, + "loss": 0.1132, + "step": 20630 + }, + { + "epoch": 1.3503434739941118, + "grad_norm": 0.7624223828315735, + "learning_rate": 9.886126294676634e-05, + "loss": 0.1022, + "step": 20640 + }, + { + "epoch": 1.3509977101733726, + "grad_norm": 0.9689080715179443, + "learning_rate": 9.885931282270545e-05, + "loss": 0.1014, + "step": 20650 + }, + { + "epoch": 1.3516519463526333, + "grad_norm": 0.7586194276809692, + "learning_rate": 9.885736104951462e-05, + "loss": 0.0962, + "step": 20660 + }, + { + "epoch": 1.352306182531894, + "grad_norm": 0.8439247012138367, + "learning_rate": 9.88554076272597e-05, + "loss": 0.0948, + "step": 20670 + }, + { + "epoch": 1.3529604187111548, + "grad_norm": 1.0176705121994019, + "learning_rate": 9.885345255600666e-05, + "loss": 0.1043, + "step": 20680 + }, + { + "epoch": 1.3536146548904155, + "grad_norm": 0.753044843673706, + "learning_rate": 9.885149583582148e-05, + "loss": 0.0954, + "step": 20690 + }, + { + "epoch": 1.354268891069676, + "grad_norm": 0.9865116477012634, + "learning_rate": 9.884953746677019e-05, + "loss": 0.1019, + "step": 20700 + }, + { + "epoch": 1.3549231272489368, + "grad_norm": 0.8925600051879883, + "learning_rate": 9.88475774489189e-05, + "loss": 0.1057, + "step": 20710 + }, + { + "epoch": 1.3555773634281976, + "grad_norm": 0.8571544289588928, + "learning_rate": 9.884561578233375e-05, + "loss": 0.1052, + "step": 20720 + }, + { + "epoch": 1.3562315996074583, + "grad_norm": 0.7800583839416504, + "learning_rate": 9.884365246708098e-05, + "loss": 0.0942, + "step": 20730 + }, + { + "epoch": 1.356885835786719, + "grad_norm": 0.9630329608917236, + "learning_rate": 9.884168750322684e-05, + "loss": 0.1151, + "step": 20740 + }, + { + "epoch": 1.3575400719659796, + "grad_norm": 1.2140957117080688, + "learning_rate": 9.883972089083766e-05, + "loss": 0.1038, + "step": 20750 + }, + { + "epoch": 1.3581943081452406, + "grad_norm": 0.7877089977264404, + "learning_rate": 9.883775262997981e-05, + "loss": 0.0968, + "step": 20760 + }, + { + "epoch": 1.358848544324501, + "grad_norm": 0.9409202933311462, + "learning_rate": 9.883578272071971e-05, + "loss": 0.1003, + "step": 20770 + }, + { + "epoch": 1.3595027805037618, + "grad_norm": 1.0525962114334106, + "learning_rate": 9.883381116312389e-05, + "loss": 0.0987, + "step": 20780 + }, + { + "epoch": 1.3601570166830226, + "grad_norm": 0.8905682563781738, + "learning_rate": 9.883183795725885e-05, + "loss": 0.0976, + "step": 20790 + }, + { + "epoch": 1.3608112528622833, + "grad_norm": 0.860859215259552, + "learning_rate": 9.882986310319124e-05, + "loss": 0.1083, + "step": 20800 + }, + { + "epoch": 1.361465489041544, + "grad_norm": 1.0243405103683472, + "learning_rate": 9.882788660098768e-05, + "loss": 0.0915, + "step": 20810 + }, + { + "epoch": 1.3621197252208046, + "grad_norm": 1.0279419422149658, + "learning_rate": 9.882590845071487e-05, + "loss": 0.1119, + "step": 20820 + }, + { + "epoch": 1.3627739614000653, + "grad_norm": 1.0260818004608154, + "learning_rate": 9.882392865243961e-05, + "loss": 0.105, + "step": 20830 + }, + { + "epoch": 1.363428197579326, + "grad_norm": 0.9079784750938416, + "learning_rate": 9.882194720622873e-05, + "loss": 0.0935, + "step": 20840 + }, + { + "epoch": 1.3640824337585868, + "grad_norm": 0.8685601949691772, + "learning_rate": 9.881996411214906e-05, + "loss": 0.0976, + "step": 20850 + }, + { + "epoch": 1.3647366699378476, + "grad_norm": 0.7832821011543274, + "learning_rate": 9.88179793702676e-05, + "loss": 0.1072, + "step": 20860 + }, + { + "epoch": 1.3653909061171083, + "grad_norm": 0.6875680685043335, + "learning_rate": 9.88159929806513e-05, + "loss": 0.0963, + "step": 20870 + }, + { + "epoch": 1.366045142296369, + "grad_norm": 0.8509207367897034, + "learning_rate": 9.881400494336719e-05, + "loss": 0.1061, + "step": 20880 + }, + { + "epoch": 1.3666993784756296, + "grad_norm": 0.7752100229263306, + "learning_rate": 9.88120152584824e-05, + "loss": 0.1041, + "step": 20890 + }, + { + "epoch": 1.3673536146548904, + "grad_norm": 0.7963639497756958, + "learning_rate": 9.88100239260641e-05, + "loss": 0.1042, + "step": 20900 + }, + { + "epoch": 1.368007850834151, + "grad_norm": 0.8341352939605713, + "learning_rate": 9.880803094617948e-05, + "loss": 0.0957, + "step": 20910 + }, + { + "epoch": 1.3686620870134119, + "grad_norm": 0.7604305744171143, + "learning_rate": 9.88060363188958e-05, + "loss": 0.1033, + "step": 20920 + }, + { + "epoch": 1.3693163231926726, + "grad_norm": 0.8400859236717224, + "learning_rate": 9.880404004428039e-05, + "loss": 0.0955, + "step": 20930 + }, + { + "epoch": 1.3699705593719333, + "grad_norm": 0.8343890905380249, + "learning_rate": 9.880204212240065e-05, + "loss": 0.0925, + "step": 20940 + }, + { + "epoch": 1.370624795551194, + "grad_norm": 0.9283037781715393, + "learning_rate": 9.880004255332399e-05, + "loss": 0.1094, + "step": 20950 + }, + { + "epoch": 1.3712790317304546, + "grad_norm": 1.0284000635147095, + "learning_rate": 9.879804133711792e-05, + "loss": 0.0951, + "step": 20960 + }, + { + "epoch": 1.3719332679097154, + "grad_norm": 1.0388516187667847, + "learning_rate": 9.879603847384997e-05, + "loss": 0.106, + "step": 20970 + }, + { + "epoch": 1.3725875040889761, + "grad_norm": 0.9273998141288757, + "learning_rate": 9.879403396358775e-05, + "loss": 0.0964, + "step": 20980 + }, + { + "epoch": 1.3732417402682369, + "grad_norm": 0.8812925815582275, + "learning_rate": 9.879202780639892e-05, + "loss": 0.1022, + "step": 20990 + }, + { + "epoch": 1.3738959764474976, + "grad_norm": 0.8167784214019775, + "learning_rate": 9.87900200023512e-05, + "loss": 0.113, + "step": 21000 + }, + { + "epoch": 1.3745502126267581, + "grad_norm": 0.961060106754303, + "learning_rate": 9.878801055151232e-05, + "loss": 0.0984, + "step": 21010 + }, + { + "epoch": 1.375204448806019, + "grad_norm": 1.0047707557678223, + "learning_rate": 9.878599945395015e-05, + "loss": 0.0912, + "step": 21020 + }, + { + "epoch": 1.3758586849852796, + "grad_norm": 1.0364177227020264, + "learning_rate": 9.878398670973256e-05, + "loss": 0.1161, + "step": 21030 + }, + { + "epoch": 1.3765129211645404, + "grad_norm": 0.7439346313476562, + "learning_rate": 9.878197231892747e-05, + "loss": 0.1047, + "step": 21040 + }, + { + "epoch": 1.3771671573438011, + "grad_norm": 0.9560453295707703, + "learning_rate": 9.877995628160288e-05, + "loss": 0.0911, + "step": 21050 + }, + { + "epoch": 1.3778213935230619, + "grad_norm": 0.8289201855659485, + "learning_rate": 9.877793859782683e-05, + "loss": 0.092, + "step": 21060 + }, + { + "epoch": 1.3784756297023226, + "grad_norm": 0.9151121377944946, + "learning_rate": 9.877591926766743e-05, + "loss": 0.0996, + "step": 21070 + }, + { + "epoch": 1.3791298658815832, + "grad_norm": 0.8370741605758667, + "learning_rate": 9.877389829119284e-05, + "loss": 0.0936, + "step": 21080 + }, + { + "epoch": 1.379784102060844, + "grad_norm": 0.9695648550987244, + "learning_rate": 9.877187566847125e-05, + "loss": 0.0963, + "step": 21090 + }, + { + "epoch": 1.3804383382401046, + "grad_norm": 0.9655565619468689, + "learning_rate": 9.876985139957098e-05, + "loss": 0.1032, + "step": 21100 + }, + { + "epoch": 1.3810925744193654, + "grad_norm": 0.9897701144218445, + "learning_rate": 9.876782548456029e-05, + "loss": 0.1066, + "step": 21110 + }, + { + "epoch": 1.3817468105986261, + "grad_norm": 1.0255415439605713, + "learning_rate": 9.876579792350762e-05, + "loss": 0.1065, + "step": 21120 + }, + { + "epoch": 1.3824010467778869, + "grad_norm": 0.9819886684417725, + "learning_rate": 9.876376871648137e-05, + "loss": 0.0994, + "step": 21130 + }, + { + "epoch": 1.3830552829571476, + "grad_norm": 0.9188646078109741, + "learning_rate": 9.876173786355003e-05, + "loss": 0.107, + "step": 21140 + }, + { + "epoch": 1.3837095191364082, + "grad_norm": 0.8057947158813477, + "learning_rate": 9.875970536478213e-05, + "loss": 0.0964, + "step": 21150 + }, + { + "epoch": 1.384363755315669, + "grad_norm": 0.8052737712860107, + "learning_rate": 9.875767122024634e-05, + "loss": 0.1005, + "step": 21160 + }, + { + "epoch": 1.3850179914949297, + "grad_norm": 0.8417797684669495, + "learning_rate": 9.875563543001125e-05, + "loss": 0.0942, + "step": 21170 + }, + { + "epoch": 1.3856722276741904, + "grad_norm": 1.070704698562622, + "learning_rate": 9.875359799414561e-05, + "loss": 0.099, + "step": 21180 + }, + { + "epoch": 1.3863264638534512, + "grad_norm": 0.871180534362793, + "learning_rate": 9.875155891271817e-05, + "loss": 0.0993, + "step": 21190 + }, + { + "epoch": 1.3869807000327117, + "grad_norm": 0.9643855690956116, + "learning_rate": 9.874951818579776e-05, + "loss": 0.1023, + "step": 21200 + }, + { + "epoch": 1.3876349362119726, + "grad_norm": 0.8685986399650574, + "learning_rate": 9.874747581345328e-05, + "loss": 0.0985, + "step": 21210 + }, + { + "epoch": 1.3882891723912332, + "grad_norm": 0.7610760927200317, + "learning_rate": 9.874543179575362e-05, + "loss": 0.0978, + "step": 21220 + }, + { + "epoch": 1.388943408570494, + "grad_norm": 0.7402287721633911, + "learning_rate": 9.874338613276781e-05, + "loss": 0.1047, + "step": 21230 + }, + { + "epoch": 1.3895976447497547, + "grad_norm": 0.8294671773910522, + "learning_rate": 9.874133882456489e-05, + "loss": 0.0976, + "step": 21240 + }, + { + "epoch": 1.3902518809290154, + "grad_norm": 1.0377382040023804, + "learning_rate": 9.873928987121394e-05, + "loss": 0.1077, + "step": 21250 + }, + { + "epoch": 1.3909061171082762, + "grad_norm": 0.8378622531890869, + "learning_rate": 9.873723927278414e-05, + "loss": 0.0988, + "step": 21260 + }, + { + "epoch": 1.3915603532875367, + "grad_norm": 0.7670599222183228, + "learning_rate": 9.87351870293447e-05, + "loss": 0.1022, + "step": 21270 + }, + { + "epoch": 1.3922145894667974, + "grad_norm": 0.7315048575401306, + "learning_rate": 9.873313314096488e-05, + "loss": 0.0977, + "step": 21280 + }, + { + "epoch": 1.3928688256460582, + "grad_norm": 0.960702121257782, + "learning_rate": 9.873107760771401e-05, + "loss": 0.0966, + "step": 21290 + }, + { + "epoch": 1.393523061825319, + "grad_norm": 0.7483635544776917, + "learning_rate": 9.872902042966147e-05, + "loss": 0.0883, + "step": 21300 + }, + { + "epoch": 1.3941772980045797, + "grad_norm": 0.8449085354804993, + "learning_rate": 9.872696160687669e-05, + "loss": 0.1002, + "step": 21310 + }, + { + "epoch": 1.3948315341838404, + "grad_norm": 0.8629240393638611, + "learning_rate": 9.872490113942918e-05, + "loss": 0.0977, + "step": 21320 + }, + { + "epoch": 1.3954857703631012, + "grad_norm": 0.8286533951759338, + "learning_rate": 9.872283902738845e-05, + "loss": 0.0968, + "step": 21330 + }, + { + "epoch": 1.3961400065423617, + "grad_norm": 0.8153654932975769, + "learning_rate": 9.872077527082413e-05, + "loss": 0.1113, + "step": 21340 + }, + { + "epoch": 1.3967942427216224, + "grad_norm": 0.7683613896369934, + "learning_rate": 9.871870986980587e-05, + "loss": 0.0887, + "step": 21350 + }, + { + "epoch": 1.3974484789008832, + "grad_norm": 1.0355619192123413, + "learning_rate": 9.871664282440339e-05, + "loss": 0.1074, + "step": 21360 + }, + { + "epoch": 1.398102715080144, + "grad_norm": 0.8770645260810852, + "learning_rate": 9.871457413468644e-05, + "loss": 0.0993, + "step": 21370 + }, + { + "epoch": 1.3987569512594047, + "grad_norm": 0.709586501121521, + "learning_rate": 9.871250380072487e-05, + "loss": 0.094, + "step": 21380 + }, + { + "epoch": 1.3994111874386654, + "grad_norm": 0.7333911061286926, + "learning_rate": 9.871043182258852e-05, + "loss": 0.1037, + "step": 21390 + }, + { + "epoch": 1.4000654236179262, + "grad_norm": 0.7974872589111328, + "learning_rate": 9.870835820034736e-05, + "loss": 0.0953, + "step": 21400 + }, + { + "epoch": 1.4007196597971867, + "grad_norm": 0.8069256544113159, + "learning_rate": 9.870628293407138e-05, + "loss": 0.0922, + "step": 21410 + }, + { + "epoch": 1.4013738959764475, + "grad_norm": 0.8490813374519348, + "learning_rate": 9.870420602383059e-05, + "loss": 0.1018, + "step": 21420 + }, + { + "epoch": 1.4020281321557082, + "grad_norm": 0.9025737643241882, + "learning_rate": 9.870212746969514e-05, + "loss": 0.0992, + "step": 21430 + }, + { + "epoch": 1.402682368334969, + "grad_norm": 0.7554858922958374, + "learning_rate": 9.870004727173514e-05, + "loss": 0.1055, + "step": 21440 + }, + { + "epoch": 1.4033366045142297, + "grad_norm": 0.9373718500137329, + "learning_rate": 9.869796543002083e-05, + "loss": 0.1052, + "step": 21450 + }, + { + "epoch": 1.4039908406934902, + "grad_norm": 0.916695237159729, + "learning_rate": 9.869588194462249e-05, + "loss": 0.103, + "step": 21460 + }, + { + "epoch": 1.4046450768727512, + "grad_norm": 1.047790288925171, + "learning_rate": 9.869379681561041e-05, + "loss": 0.0934, + "step": 21470 + }, + { + "epoch": 1.4052993130520117, + "grad_norm": 0.8690340518951416, + "learning_rate": 9.869171004305497e-05, + "loss": 0.0933, + "step": 21480 + }, + { + "epoch": 1.4059535492312725, + "grad_norm": 0.8632329106330872, + "learning_rate": 9.868962162702664e-05, + "loss": 0.1004, + "step": 21490 + }, + { + "epoch": 1.4066077854105332, + "grad_norm": 0.8703600168228149, + "learning_rate": 9.868753156759587e-05, + "loss": 0.0961, + "step": 21500 + }, + { + "epoch": 1.407262021589794, + "grad_norm": 0.9060193300247192, + "learning_rate": 9.868543986483325e-05, + "loss": 0.1021, + "step": 21510 + }, + { + "epoch": 1.4079162577690547, + "grad_norm": 1.0863100290298462, + "learning_rate": 9.868334651880932e-05, + "loss": 0.1095, + "step": 21520 + }, + { + "epoch": 1.4085704939483152, + "grad_norm": 0.837095320224762, + "learning_rate": 9.868125152959477e-05, + "loss": 0.0981, + "step": 21530 + }, + { + "epoch": 1.409224730127576, + "grad_norm": 1.0157208442687988, + "learning_rate": 9.867915489726034e-05, + "loss": 0.102, + "step": 21540 + }, + { + "epoch": 1.4098789663068367, + "grad_norm": 0.9771285057067871, + "learning_rate": 9.867705662187673e-05, + "loss": 0.1063, + "step": 21550 + }, + { + "epoch": 1.4105332024860975, + "grad_norm": 0.9457305669784546, + "learning_rate": 9.867495670351483e-05, + "loss": 0.094, + "step": 21560 + }, + { + "epoch": 1.4111874386653582, + "grad_norm": 0.90764981508255, + "learning_rate": 9.867285514224547e-05, + "loss": 0.0918, + "step": 21570 + }, + { + "epoch": 1.411841674844619, + "grad_norm": 0.8110638856887817, + "learning_rate": 9.867075193813959e-05, + "loss": 0.1003, + "step": 21580 + }, + { + "epoch": 1.4124959110238797, + "grad_norm": 1.1416128873825073, + "learning_rate": 9.866864709126821e-05, + "loss": 0.1112, + "step": 21590 + }, + { + "epoch": 1.4131501472031402, + "grad_norm": 1.088036060333252, + "learning_rate": 9.866654060170234e-05, + "loss": 0.1099, + "step": 21600 + }, + { + "epoch": 1.413804383382401, + "grad_norm": 0.9464057087898254, + "learning_rate": 9.866443246951308e-05, + "loss": 0.0975, + "step": 21610 + }, + { + "epoch": 1.4144586195616617, + "grad_norm": 0.8685123324394226, + "learning_rate": 9.866232269477162e-05, + "loss": 0.0942, + "step": 21620 + }, + { + "epoch": 1.4151128557409225, + "grad_norm": 0.7753238677978516, + "learning_rate": 9.866021127754915e-05, + "loss": 0.0973, + "step": 21630 + }, + { + "epoch": 1.4157670919201832, + "grad_norm": 0.8483602404594421, + "learning_rate": 9.865809821791692e-05, + "loss": 0.1016, + "step": 21640 + }, + { + "epoch": 1.4164213280994438, + "grad_norm": 0.712035596370697, + "learning_rate": 9.865598351594627e-05, + "loss": 0.0985, + "step": 21650 + }, + { + "epoch": 1.4170755642787047, + "grad_norm": 0.87371426820755, + "learning_rate": 9.865386717170856e-05, + "loss": 0.0952, + "step": 21660 + }, + { + "epoch": 1.4177298004579653, + "grad_norm": 1.058411717414856, + "learning_rate": 9.865174918527525e-05, + "loss": 0.0956, + "step": 21670 + }, + { + "epoch": 1.418384036637226, + "grad_norm": 0.7664424777030945, + "learning_rate": 9.864962955671779e-05, + "loss": 0.0889, + "step": 21680 + }, + { + "epoch": 1.4190382728164868, + "grad_norm": 0.9568914771080017, + "learning_rate": 9.864750828610776e-05, + "loss": 0.102, + "step": 21690 + }, + { + "epoch": 1.4196925089957475, + "grad_norm": 0.8429945707321167, + "learning_rate": 9.864538537351675e-05, + "loss": 0.1018, + "step": 21700 + }, + { + "epoch": 1.4203467451750083, + "grad_norm": 1.3158056735992432, + "learning_rate": 9.864326081901639e-05, + "loss": 0.0934, + "step": 21710 + }, + { + "epoch": 1.4210009813542688, + "grad_norm": 0.8673258423805237, + "learning_rate": 9.864113462267841e-05, + "loss": 0.0917, + "step": 21720 + }, + { + "epoch": 1.4216552175335295, + "grad_norm": 1.0829505920410156, + "learning_rate": 9.863900678457457e-05, + "loss": 0.1176, + "step": 21730 + }, + { + "epoch": 1.4223094537127903, + "grad_norm": 0.9344543218612671, + "learning_rate": 9.86368773047767e-05, + "loss": 0.0962, + "step": 21740 + }, + { + "epoch": 1.422963689892051, + "grad_norm": 0.9904654026031494, + "learning_rate": 9.863474618335666e-05, + "loss": 0.1118, + "step": 21750 + }, + { + "epoch": 1.4236179260713118, + "grad_norm": 0.864806592464447, + "learning_rate": 9.863261342038639e-05, + "loss": 0.0914, + "step": 21760 + }, + { + "epoch": 1.4242721622505725, + "grad_norm": 0.8543987274169922, + "learning_rate": 9.863047901593786e-05, + "loss": 0.1041, + "step": 21770 + }, + { + "epoch": 1.4249263984298333, + "grad_norm": 0.8456289172172546, + "learning_rate": 9.862834297008314e-05, + "loss": 0.1148, + "step": 21780 + }, + { + "epoch": 1.4255806346090938, + "grad_norm": 0.9336904287338257, + "learning_rate": 9.862620528289431e-05, + "loss": 0.1032, + "step": 21790 + }, + { + "epoch": 1.4262348707883545, + "grad_norm": 0.9453654885292053, + "learning_rate": 9.862406595444351e-05, + "loss": 0.1059, + "step": 21800 + }, + { + "epoch": 1.4268891069676153, + "grad_norm": 1.0138676166534424, + "learning_rate": 9.862192498480299e-05, + "loss": 0.0979, + "step": 21810 + }, + { + "epoch": 1.427543343146876, + "grad_norm": 1.177456259727478, + "learning_rate": 9.861978237404496e-05, + "loss": 0.0932, + "step": 21820 + }, + { + "epoch": 1.4281975793261368, + "grad_norm": 0.792131781578064, + "learning_rate": 9.861763812224177e-05, + "loss": 0.0958, + "step": 21830 + }, + { + "epoch": 1.4288518155053975, + "grad_norm": 0.7728269696235657, + "learning_rate": 9.86154922294658e-05, + "loss": 0.0975, + "step": 21840 + }, + { + "epoch": 1.4295060516846583, + "grad_norm": 0.7983576059341431, + "learning_rate": 9.861334469578946e-05, + "loss": 0.0987, + "step": 21850 + }, + { + "epoch": 1.4301602878639188, + "grad_norm": 0.8768244385719299, + "learning_rate": 9.861119552128523e-05, + "loss": 0.0961, + "step": 21860 + }, + { + "epoch": 1.4308145240431795, + "grad_norm": 1.0018515586853027, + "learning_rate": 9.86090447060257e-05, + "loss": 0.0906, + "step": 21870 + }, + { + "epoch": 1.4314687602224403, + "grad_norm": 0.9329746961593628, + "learning_rate": 9.86068922500834e-05, + "loss": 0.0925, + "step": 21880 + }, + { + "epoch": 1.432122996401701, + "grad_norm": 0.8282088041305542, + "learning_rate": 9.860473815353102e-05, + "loss": 0.1014, + "step": 21890 + }, + { + "epoch": 1.4327772325809618, + "grad_norm": 0.7623408436775208, + "learning_rate": 9.860258241644126e-05, + "loss": 0.1107, + "step": 21900 + }, + { + "epoch": 1.4334314687602223, + "grad_norm": 0.8091941475868225, + "learning_rate": 9.860042503888687e-05, + "loss": 0.0936, + "step": 21910 + }, + { + "epoch": 1.4340857049394833, + "grad_norm": 0.7656387686729431, + "learning_rate": 9.859826602094068e-05, + "loss": 0.091, + "step": 21920 + }, + { + "epoch": 1.4347399411187438, + "grad_norm": 1.1164404153823853, + "learning_rate": 9.859610536267556e-05, + "loss": 0.1016, + "step": 21930 + }, + { + "epoch": 1.4353941772980046, + "grad_norm": 0.9509071707725525, + "learning_rate": 9.859394306416444e-05, + "loss": 0.0934, + "step": 21940 + }, + { + "epoch": 1.4360484134772653, + "grad_norm": 0.9692185521125793, + "learning_rate": 9.859177912548028e-05, + "loss": 0.1056, + "step": 21950 + }, + { + "epoch": 1.436702649656526, + "grad_norm": 0.9519806504249573, + "learning_rate": 9.858961354669616e-05, + "loss": 0.1038, + "step": 21960 + }, + { + "epoch": 1.4373568858357868, + "grad_norm": 1.0304690599441528, + "learning_rate": 9.858744632788514e-05, + "loss": 0.1002, + "step": 21970 + }, + { + "epoch": 1.4380111220150473, + "grad_norm": 0.7147766947746277, + "learning_rate": 9.858527746912039e-05, + "loss": 0.1132, + "step": 21980 + }, + { + "epoch": 1.438665358194308, + "grad_norm": 0.9351019263267517, + "learning_rate": 9.85831069704751e-05, + "loss": 0.0982, + "step": 21990 + }, + { + "epoch": 1.4393195943735688, + "grad_norm": 1.303455114364624, + "learning_rate": 9.858093483202254e-05, + "loss": 0.112, + "step": 22000 + }, + { + "epoch": 1.4399738305528296, + "grad_norm": 0.778627872467041, + "learning_rate": 9.857876105383602e-05, + "loss": 0.1028, + "step": 22010 + }, + { + "epoch": 1.4406280667320903, + "grad_norm": 1.0590648651123047, + "learning_rate": 9.85765856359889e-05, + "loss": 0.1024, + "step": 22020 + }, + { + "epoch": 1.441282302911351, + "grad_norm": 0.9106878042221069, + "learning_rate": 9.857440857855462e-05, + "loss": 0.0975, + "step": 22030 + }, + { + "epoch": 1.4419365390906118, + "grad_norm": 0.8523755669593811, + "learning_rate": 9.857222988160667e-05, + "loss": 0.105, + "step": 22040 + }, + { + "epoch": 1.4425907752698723, + "grad_norm": 0.8732689619064331, + "learning_rate": 9.857004954521858e-05, + "loss": 0.0981, + "step": 22050 + }, + { + "epoch": 1.443245011449133, + "grad_norm": 0.9251449704170227, + "learning_rate": 9.856786756946392e-05, + "loss": 0.0962, + "step": 22060 + }, + { + "epoch": 1.4438992476283938, + "grad_norm": 0.7228383421897888, + "learning_rate": 9.856568395441637e-05, + "loss": 0.0949, + "step": 22070 + }, + { + "epoch": 1.4445534838076546, + "grad_norm": 0.7796227931976318, + "learning_rate": 9.856349870014961e-05, + "loss": 0.0964, + "step": 22080 + }, + { + "epoch": 1.4452077199869153, + "grad_norm": 0.8933011889457703, + "learning_rate": 9.856131180673742e-05, + "loss": 0.0917, + "step": 22090 + }, + { + "epoch": 1.4458619561661759, + "grad_norm": 0.6613444685935974, + "learning_rate": 9.855912327425359e-05, + "loss": 0.0944, + "step": 22100 + }, + { + "epoch": 1.4465161923454368, + "grad_norm": 0.8646618723869324, + "learning_rate": 9.8556933102772e-05, + "loss": 0.0989, + "step": 22110 + }, + { + "epoch": 1.4471704285246973, + "grad_norm": 1.0366922616958618, + "learning_rate": 9.855474129236657e-05, + "loss": 0.0964, + "step": 22120 + }, + { + "epoch": 1.447824664703958, + "grad_norm": 0.78702712059021, + "learning_rate": 9.855254784311129e-05, + "loss": 0.1023, + "step": 22130 + }, + { + "epoch": 1.4484789008832188, + "grad_norm": 0.685092568397522, + "learning_rate": 9.855035275508017e-05, + "loss": 0.0978, + "step": 22140 + }, + { + "epoch": 1.4491331370624796, + "grad_norm": 0.9662759900093079, + "learning_rate": 9.854815602834733e-05, + "loss": 0.101, + "step": 22150 + }, + { + "epoch": 1.4497873732417403, + "grad_norm": 0.8414245843887329, + "learning_rate": 9.854595766298692e-05, + "loss": 0.0887, + "step": 22160 + }, + { + "epoch": 1.4504416094210009, + "grad_norm": 0.9167496562004089, + "learning_rate": 9.854375765907309e-05, + "loss": 0.097, + "step": 22170 + }, + { + "epoch": 1.4510958456002616, + "grad_norm": 0.8719131946563721, + "learning_rate": 9.854155601668013e-05, + "loss": 0.0972, + "step": 22180 + }, + { + "epoch": 1.4517500817795224, + "grad_norm": 1.0566436052322388, + "learning_rate": 9.853935273588236e-05, + "loss": 0.1119, + "step": 22190 + }, + { + "epoch": 1.452404317958783, + "grad_norm": 0.8643470406532288, + "learning_rate": 9.853714781675414e-05, + "loss": 0.1097, + "step": 22200 + }, + { + "epoch": 1.4530585541380439, + "grad_norm": 0.8405054211616516, + "learning_rate": 9.853494125936989e-05, + "loss": 0.1023, + "step": 22210 + }, + { + "epoch": 1.4537127903173046, + "grad_norm": 0.8226882219314575, + "learning_rate": 9.853273306380407e-05, + "loss": 0.1037, + "step": 22220 + }, + { + "epoch": 1.4543670264965654, + "grad_norm": 0.9032091498374939, + "learning_rate": 9.853052323013124e-05, + "loss": 0.09, + "step": 22230 + }, + { + "epoch": 1.4550212626758259, + "grad_norm": 0.7893581986427307, + "learning_rate": 9.852831175842596e-05, + "loss": 0.1065, + "step": 22240 + }, + { + "epoch": 1.4556754988550866, + "grad_norm": 0.8432521820068359, + "learning_rate": 9.85260986487629e-05, + "loss": 0.0988, + "step": 22250 + }, + { + "epoch": 1.4563297350343474, + "grad_norm": 0.8194817304611206, + "learning_rate": 9.852388390121675e-05, + "loss": 0.1014, + "step": 22260 + }, + { + "epoch": 1.4569839712136081, + "grad_norm": 0.7900116443634033, + "learning_rate": 9.852166751586225e-05, + "loss": 0.1056, + "step": 22270 + }, + { + "epoch": 1.4576382073928689, + "grad_norm": 0.7171338200569153, + "learning_rate": 9.851944949277423e-05, + "loss": 0.0906, + "step": 22280 + }, + { + "epoch": 1.4582924435721296, + "grad_norm": 0.7563906311988831, + "learning_rate": 9.851722983202753e-05, + "loss": 0.0959, + "step": 22290 + }, + { + "epoch": 1.4589466797513904, + "grad_norm": 0.7728096842765808, + "learning_rate": 9.851500853369709e-05, + "loss": 0.1029, + "step": 22300 + }, + { + "epoch": 1.4596009159306509, + "grad_norm": 0.9391055107116699, + "learning_rate": 9.851278559785788e-05, + "loss": 0.0979, + "step": 22310 + }, + { + "epoch": 1.4602551521099116, + "grad_norm": 0.8876420855522156, + "learning_rate": 9.851056102458492e-05, + "loss": 0.0963, + "step": 22320 + }, + { + "epoch": 1.4609093882891724, + "grad_norm": 1.0009467601776123, + "learning_rate": 9.85083348139533e-05, + "loss": 0.098, + "step": 22330 + }, + { + "epoch": 1.4615636244684331, + "grad_norm": 0.9100044369697571, + "learning_rate": 9.850610696603817e-05, + "loss": 0.0935, + "step": 22340 + }, + { + "epoch": 1.4622178606476939, + "grad_norm": 1.0415489673614502, + "learning_rate": 9.850387748091471e-05, + "loss": 0.104, + "step": 22350 + }, + { + "epoch": 1.4628720968269544, + "grad_norm": 0.9339014887809753, + "learning_rate": 9.850164635865819e-05, + "loss": 0.1056, + "step": 22360 + }, + { + "epoch": 1.4635263330062154, + "grad_norm": 0.9061875343322754, + "learning_rate": 9.84994135993439e-05, + "loss": 0.1065, + "step": 22370 + }, + { + "epoch": 1.464180569185476, + "grad_norm": 0.7294244766235352, + "learning_rate": 9.849717920304719e-05, + "loss": 0.1058, + "step": 22380 + }, + { + "epoch": 1.4648348053647366, + "grad_norm": 0.9181326031684875, + "learning_rate": 9.849494316984352e-05, + "loss": 0.0996, + "step": 22390 + }, + { + "epoch": 1.4654890415439974, + "grad_norm": 0.7590892910957336, + "learning_rate": 9.849270549980832e-05, + "loss": 0.1013, + "step": 22400 + }, + { + "epoch": 1.4661432777232581, + "grad_norm": 0.7892376780509949, + "learning_rate": 9.849046619301713e-05, + "loss": 0.0953, + "step": 22410 + }, + { + "epoch": 1.466797513902519, + "grad_norm": 0.7668907642364502, + "learning_rate": 9.848822524954553e-05, + "loss": 0.1001, + "step": 22420 + }, + { + "epoch": 1.4674517500817794, + "grad_norm": 0.6292420625686646, + "learning_rate": 9.848598266946918e-05, + "loss": 0.0882, + "step": 22430 + }, + { + "epoch": 1.4681059862610402, + "grad_norm": 0.8127971887588501, + "learning_rate": 9.848373845286376e-05, + "loss": 0.1029, + "step": 22440 + }, + { + "epoch": 1.468760222440301, + "grad_norm": 0.9663169980049133, + "learning_rate": 9.848149259980499e-05, + "loss": 0.1066, + "step": 22450 + }, + { + "epoch": 1.4694144586195617, + "grad_norm": 0.8952876925468445, + "learning_rate": 9.847924511036872e-05, + "loss": 0.1101, + "step": 22460 + }, + { + "epoch": 1.4700686947988224, + "grad_norm": 0.9786347150802612, + "learning_rate": 9.847699598463079e-05, + "loss": 0.0865, + "step": 22470 + }, + { + "epoch": 1.4707229309780832, + "grad_norm": 1.0925096273422241, + "learning_rate": 9.847474522266708e-05, + "loss": 0.1013, + "step": 22480 + }, + { + "epoch": 1.471377167157344, + "grad_norm": 0.7084100842475891, + "learning_rate": 9.84724928245536e-05, + "loss": 0.0975, + "step": 22490 + }, + { + "epoch": 1.4720314033366044, + "grad_norm": 0.8973484039306641, + "learning_rate": 9.847023879036637e-05, + "loss": 0.0956, + "step": 22500 + }, + { + "epoch": 1.4726856395158652, + "grad_norm": 1.1839032173156738, + "learning_rate": 9.846798312018146e-05, + "loss": 0.0958, + "step": 22510 + }, + { + "epoch": 1.473339875695126, + "grad_norm": 1.004971981048584, + "learning_rate": 9.846572581407502e-05, + "loss": 0.1028, + "step": 22520 + }, + { + "epoch": 1.4739941118743867, + "grad_norm": 0.854030966758728, + "learning_rate": 9.846346687212322e-05, + "loss": 0.1093, + "step": 22530 + }, + { + "epoch": 1.4746483480536474, + "grad_norm": 0.9426572918891907, + "learning_rate": 9.846120629440231e-05, + "loss": 0.1084, + "step": 22540 + }, + { + "epoch": 1.475302584232908, + "grad_norm": 0.7341740131378174, + "learning_rate": 9.84589440809886e-05, + "loss": 0.0945, + "step": 22550 + }, + { + "epoch": 1.475956820412169, + "grad_norm": 0.7816892862319946, + "learning_rate": 9.845668023195841e-05, + "loss": 0.0945, + "step": 22560 + }, + { + "epoch": 1.4766110565914294, + "grad_norm": 1.015047311782837, + "learning_rate": 9.845441474738821e-05, + "loss": 0.0912, + "step": 22570 + }, + { + "epoch": 1.4772652927706902, + "grad_norm": 1.0794068574905396, + "learning_rate": 9.845214762735444e-05, + "loss": 0.092, + "step": 22580 + }, + { + "epoch": 1.477919528949951, + "grad_norm": 0.8350719809532166, + "learning_rate": 9.84498788719336e-05, + "loss": 0.1005, + "step": 22590 + }, + { + "epoch": 1.4785737651292117, + "grad_norm": 0.7004404664039612, + "learning_rate": 9.844760848120229e-05, + "loss": 0.0958, + "step": 22600 + }, + { + "epoch": 1.4792280013084724, + "grad_norm": 0.9362242221832275, + "learning_rate": 9.844533645523714e-05, + "loss": 0.0953, + "step": 22610 + }, + { + "epoch": 1.479882237487733, + "grad_norm": 1.0264415740966797, + "learning_rate": 9.844306279411482e-05, + "loss": 0.0954, + "step": 22620 + }, + { + "epoch": 1.4805364736669937, + "grad_norm": 0.9986675977706909, + "learning_rate": 9.84407874979121e-05, + "loss": 0.1014, + "step": 22630 + }, + { + "epoch": 1.4811907098462544, + "grad_norm": 0.8744516372680664, + "learning_rate": 9.843851056670574e-05, + "loss": 0.0955, + "step": 22640 + }, + { + "epoch": 1.4818449460255152, + "grad_norm": 0.8320896029472351, + "learning_rate": 9.843623200057263e-05, + "loss": 0.0927, + "step": 22650 + }, + { + "epoch": 1.482499182204776, + "grad_norm": 0.9211897850036621, + "learning_rate": 9.843395179958965e-05, + "loss": 0.1046, + "step": 22660 + }, + { + "epoch": 1.4831534183840367, + "grad_norm": 0.9335985779762268, + "learning_rate": 9.84316699638338e-05, + "loss": 0.0968, + "step": 22670 + }, + { + "epoch": 1.4838076545632974, + "grad_norm": 0.9013245105743408, + "learning_rate": 9.842938649338205e-05, + "loss": 0.0968, + "step": 22680 + }, + { + "epoch": 1.484461890742558, + "grad_norm": 1.0684399604797363, + "learning_rate": 9.842710138831148e-05, + "loss": 0.0904, + "step": 22690 + }, + { + "epoch": 1.4851161269218187, + "grad_norm": 1.023200273513794, + "learning_rate": 9.842481464869927e-05, + "loss": 0.099, + "step": 22700 + }, + { + "epoch": 1.4857703631010795, + "grad_norm": 0.9011964797973633, + "learning_rate": 9.842252627462254e-05, + "loss": 0.0982, + "step": 22710 + }, + { + "epoch": 1.4864245992803402, + "grad_norm": 0.9003528356552124, + "learning_rate": 9.842023626615857e-05, + "loss": 0.0916, + "step": 22720 + }, + { + "epoch": 1.487078835459601, + "grad_norm": 0.8671954274177551, + "learning_rate": 9.841794462338463e-05, + "loss": 0.0844, + "step": 22730 + }, + { + "epoch": 1.4877330716388617, + "grad_norm": 1.0183930397033691, + "learning_rate": 9.841565134637808e-05, + "loss": 0.1009, + "step": 22740 + }, + { + "epoch": 1.4883873078181225, + "grad_norm": 0.77434903383255, + "learning_rate": 9.841335643521632e-05, + "loss": 0.0979, + "step": 22750 + }, + { + "epoch": 1.489041543997383, + "grad_norm": 0.9142956733703613, + "learning_rate": 9.841105988997682e-05, + "loss": 0.0911, + "step": 22760 + }, + { + "epoch": 1.4896957801766437, + "grad_norm": 0.9437901973724365, + "learning_rate": 9.840876171073707e-05, + "loss": 0.101, + "step": 22770 + }, + { + "epoch": 1.4903500163559045, + "grad_norm": 0.8872728943824768, + "learning_rate": 9.840646189757468e-05, + "loss": 0.0982, + "step": 22780 + }, + { + "epoch": 1.4910042525351652, + "grad_norm": 0.7070728540420532, + "learning_rate": 9.840416045056724e-05, + "loss": 0.0918, + "step": 22790 + }, + { + "epoch": 1.491658488714426, + "grad_norm": 0.9747138619422913, + "learning_rate": 9.840185736979244e-05, + "loss": 0.0953, + "step": 22800 + }, + { + "epoch": 1.4923127248936865, + "grad_norm": 0.9739376902580261, + "learning_rate": 9.839955265532801e-05, + "loss": 0.1069, + "step": 22810 + }, + { + "epoch": 1.4929669610729475, + "grad_norm": 0.8057883977890015, + "learning_rate": 9.839724630725175e-05, + "loss": 0.0882, + "step": 22820 + }, + { + "epoch": 1.493621197252208, + "grad_norm": 0.9417016506195068, + "learning_rate": 9.839493832564149e-05, + "loss": 0.1076, + "step": 22830 + }, + { + "epoch": 1.4942754334314687, + "grad_norm": 0.7495242953300476, + "learning_rate": 9.839262871057515e-05, + "loss": 0.091, + "step": 22840 + }, + { + "epoch": 1.4949296696107295, + "grad_norm": 0.818233847618103, + "learning_rate": 9.839031746213068e-05, + "loss": 0.1056, + "step": 22850 + }, + { + "epoch": 1.4955839057899902, + "grad_norm": 0.8192006945610046, + "learning_rate": 9.838800458038609e-05, + "loss": 0.0928, + "step": 22860 + }, + { + "epoch": 1.496238141969251, + "grad_norm": 0.9962643980979919, + "learning_rate": 9.838569006541944e-05, + "loss": 0.097, + "step": 22870 + }, + { + "epoch": 1.4968923781485115, + "grad_norm": 0.8884673714637756, + "learning_rate": 9.838337391730886e-05, + "loss": 0.0924, + "step": 22880 + }, + { + "epoch": 1.4975466143277723, + "grad_norm": 0.7692938446998596, + "learning_rate": 9.83810561361325e-05, + "loss": 0.1011, + "step": 22890 + }, + { + "epoch": 1.498200850507033, + "grad_norm": 1.0276055335998535, + "learning_rate": 9.837873672196863e-05, + "loss": 0.097, + "step": 22900 + }, + { + "epoch": 1.4988550866862937, + "grad_norm": 0.9219088554382324, + "learning_rate": 9.83764156748955e-05, + "loss": 0.0843, + "step": 22910 + }, + { + "epoch": 1.4995093228655545, + "grad_norm": 0.7710665464401245, + "learning_rate": 9.837409299499149e-05, + "loss": 0.0997, + "step": 22920 + }, + { + "epoch": 1.500163559044815, + "grad_norm": 0.7890987396240234, + "learning_rate": 9.837176868233496e-05, + "loss": 0.1039, + "step": 22930 + }, + { + "epoch": 1.500817795224076, + "grad_norm": 0.7325422167778015, + "learning_rate": 9.836944273700439e-05, + "loss": 0.0917, + "step": 22940 + }, + { + "epoch": 1.5014720314033365, + "grad_norm": 0.8291808366775513, + "learning_rate": 9.836711515907827e-05, + "loss": 0.1017, + "step": 22950 + }, + { + "epoch": 1.5021262675825973, + "grad_norm": 0.7755422592163086, + "learning_rate": 9.836478594863516e-05, + "loss": 0.0887, + "step": 22960 + }, + { + "epoch": 1.502780503761858, + "grad_norm": 0.8843225836753845, + "learning_rate": 9.836245510575368e-05, + "loss": 0.1055, + "step": 22970 + }, + { + "epoch": 1.5034347399411188, + "grad_norm": 1.0079516172409058, + "learning_rate": 9.836012263051252e-05, + "loss": 0.1042, + "step": 22980 + }, + { + "epoch": 1.5040889761203795, + "grad_norm": 0.6518259048461914, + "learning_rate": 9.835778852299039e-05, + "loss": 0.1012, + "step": 22990 + }, + { + "epoch": 1.50474321229964, + "grad_norm": 0.7768208980560303, + "learning_rate": 9.835545278326606e-05, + "loss": 0.0852, + "step": 23000 + }, + { + "epoch": 1.505397448478901, + "grad_norm": 0.6538995504379272, + "learning_rate": 9.835311541141839e-05, + "loss": 0.0882, + "step": 23010 + }, + { + "epoch": 1.5060516846581615, + "grad_norm": 0.8296500444412231, + "learning_rate": 9.835077640752626e-05, + "loss": 0.0938, + "step": 23020 + }, + { + "epoch": 1.5067059208374223, + "grad_norm": 0.9206341505050659, + "learning_rate": 9.834843577166863e-05, + "loss": 0.0974, + "step": 23030 + }, + { + "epoch": 1.507360157016683, + "grad_norm": 0.779178261756897, + "learning_rate": 9.83460935039245e-05, + "loss": 0.1044, + "step": 23040 + }, + { + "epoch": 1.5080143931959438, + "grad_norm": 0.9146134853363037, + "learning_rate": 9.834374960437291e-05, + "loss": 0.0909, + "step": 23050 + }, + { + "epoch": 1.5086686293752045, + "grad_norm": 0.9325788617134094, + "learning_rate": 9.834140407309298e-05, + "loss": 0.0969, + "step": 23060 + }, + { + "epoch": 1.509322865554465, + "grad_norm": 0.9949310421943665, + "learning_rate": 9.833905691016389e-05, + "loss": 0.0969, + "step": 23070 + }, + { + "epoch": 1.509977101733726, + "grad_norm": 0.8266165256500244, + "learning_rate": 9.833670811566485e-05, + "loss": 0.0924, + "step": 23080 + }, + { + "epoch": 1.5106313379129865, + "grad_norm": 0.8005866408348083, + "learning_rate": 9.833435768967514e-05, + "loss": 0.0931, + "step": 23090 + }, + { + "epoch": 1.5112855740922473, + "grad_norm": 0.9937982559204102, + "learning_rate": 9.833200563227411e-05, + "loss": 0.0897, + "step": 23100 + }, + { + "epoch": 1.511939810271508, + "grad_norm": 0.8967451453208923, + "learning_rate": 9.832965194354113e-05, + "loss": 0.1015, + "step": 23110 + }, + { + "epoch": 1.5125940464507688, + "grad_norm": 0.7568728923797607, + "learning_rate": 9.832729662355566e-05, + "loss": 0.0945, + "step": 23120 + }, + { + "epoch": 1.5132482826300295, + "grad_norm": 0.8389810919761658, + "learning_rate": 9.832493967239716e-05, + "loss": 0.0958, + "step": 23130 + }, + { + "epoch": 1.51390251880929, + "grad_norm": 0.7844974398612976, + "learning_rate": 9.832258109014522e-05, + "loss": 0.0963, + "step": 23140 + }, + { + "epoch": 1.514556754988551, + "grad_norm": 0.7761784791946411, + "learning_rate": 9.832022087687944e-05, + "loss": 0.0913, + "step": 23150 + }, + { + "epoch": 1.5152109911678115, + "grad_norm": 1.0485121011734009, + "learning_rate": 9.831785903267949e-05, + "loss": 0.1062, + "step": 23160 + }, + { + "epoch": 1.5158652273470723, + "grad_norm": 0.8118041157722473, + "learning_rate": 9.831549555762507e-05, + "loss": 0.0926, + "step": 23170 + }, + { + "epoch": 1.516519463526333, + "grad_norm": 1.069348931312561, + "learning_rate": 9.831313045179595e-05, + "loss": 0.0977, + "step": 23180 + }, + { + "epoch": 1.5171736997055936, + "grad_norm": 0.8095777034759521, + "learning_rate": 9.8310763715272e-05, + "loss": 0.0975, + "step": 23190 + }, + { + "epoch": 1.5178279358848545, + "grad_norm": 0.6841381788253784, + "learning_rate": 9.830839534813305e-05, + "loss": 0.0924, + "step": 23200 + }, + { + "epoch": 1.518482172064115, + "grad_norm": 0.7451679706573486, + "learning_rate": 9.830602535045908e-05, + "loss": 0.0989, + "step": 23210 + }, + { + "epoch": 1.5191364082433758, + "grad_norm": 0.7688360214233398, + "learning_rate": 9.830365372233006e-05, + "loss": 0.1033, + "step": 23220 + }, + { + "epoch": 1.5197906444226366, + "grad_norm": 0.7664195895195007, + "learning_rate": 9.830128046382605e-05, + "loss": 0.0857, + "step": 23230 + }, + { + "epoch": 1.5204448806018973, + "grad_norm": 0.7574120163917542, + "learning_rate": 9.829890557502714e-05, + "loss": 0.0912, + "step": 23240 + }, + { + "epoch": 1.521099116781158, + "grad_norm": 0.8044725656509399, + "learning_rate": 9.82965290560135e-05, + "loss": 0.093, + "step": 23250 + }, + { + "epoch": 1.5217533529604186, + "grad_norm": 0.9325090050697327, + "learning_rate": 9.829415090686535e-05, + "loss": 0.0893, + "step": 23260 + }, + { + "epoch": 1.5224075891396796, + "grad_norm": 0.9916201829910278, + "learning_rate": 9.829177112766294e-05, + "loss": 0.0998, + "step": 23270 + }, + { + "epoch": 1.52306182531894, + "grad_norm": 0.8349946141242981, + "learning_rate": 9.828938971848663e-05, + "loss": 0.0991, + "step": 23280 + }, + { + "epoch": 1.5237160614982008, + "grad_norm": 0.8166770339012146, + "learning_rate": 9.828700667941675e-05, + "loss": 0.0983, + "step": 23290 + }, + { + "epoch": 1.5243702976774616, + "grad_norm": 0.8615389466285706, + "learning_rate": 9.828462201053376e-05, + "loss": 0.0957, + "step": 23300 + }, + { + "epoch": 1.5250245338567223, + "grad_norm": 0.8796236515045166, + "learning_rate": 9.828223571191814e-05, + "loss": 0.0974, + "step": 23310 + }, + { + "epoch": 1.525678770035983, + "grad_norm": 0.8417108654975891, + "learning_rate": 9.827984778365045e-05, + "loss": 0.1001, + "step": 23320 + }, + { + "epoch": 1.5263330062152436, + "grad_norm": 0.8357135653495789, + "learning_rate": 9.827745822581128e-05, + "loss": 0.1045, + "step": 23330 + }, + { + "epoch": 1.5269872423945046, + "grad_norm": 0.7510766386985779, + "learning_rate": 9.827506703848128e-05, + "loss": 0.1021, + "step": 23340 + }, + { + "epoch": 1.527641478573765, + "grad_norm": 1.0681124925613403, + "learning_rate": 9.827267422174115e-05, + "loss": 0.0953, + "step": 23350 + }, + { + "epoch": 1.5282957147530258, + "grad_norm": 0.8562158942222595, + "learning_rate": 9.82702797756717e-05, + "loss": 0.0992, + "step": 23360 + }, + { + "epoch": 1.5289499509322866, + "grad_norm": 0.9547275900840759, + "learning_rate": 9.826788370035368e-05, + "loss": 0.1004, + "step": 23370 + }, + { + "epoch": 1.529604187111547, + "grad_norm": 0.8317331075668335, + "learning_rate": 9.826548599586802e-05, + "loss": 0.1016, + "step": 23380 + }, + { + "epoch": 1.530258423290808, + "grad_norm": 0.9036648273468018, + "learning_rate": 9.82630866622956e-05, + "loss": 0.1068, + "step": 23390 + }, + { + "epoch": 1.5309126594700686, + "grad_norm": 0.9889810085296631, + "learning_rate": 9.826068569971745e-05, + "loss": 0.0998, + "step": 23400 + }, + { + "epoch": 1.5315668956493296, + "grad_norm": 0.8592228293418884, + "learning_rate": 9.825828310821459e-05, + "loss": 0.0989, + "step": 23410 + }, + { + "epoch": 1.53222113182859, + "grad_norm": 0.8532178401947021, + "learning_rate": 9.82558788878681e-05, + "loss": 0.097, + "step": 23420 + }, + { + "epoch": 1.5328753680078508, + "grad_norm": 0.9547865390777588, + "learning_rate": 9.825347303875916e-05, + "loss": 0.0851, + "step": 23430 + }, + { + "epoch": 1.5335296041871116, + "grad_norm": 0.833165168762207, + "learning_rate": 9.825106556096894e-05, + "loss": 0.1017, + "step": 23440 + }, + { + "epoch": 1.5341838403663721, + "grad_norm": 1.1163369417190552, + "learning_rate": 9.824865645457872e-05, + "loss": 0.0899, + "step": 23450 + }, + { + "epoch": 1.534838076545633, + "grad_norm": 0.8560509085655212, + "learning_rate": 9.824624571966981e-05, + "loss": 0.0965, + "step": 23460 + }, + { + "epoch": 1.5354923127248936, + "grad_norm": 0.8199770450592041, + "learning_rate": 9.824383335632357e-05, + "loss": 0.0981, + "step": 23470 + }, + { + "epoch": 1.5361465489041544, + "grad_norm": 0.8420577645301819, + "learning_rate": 9.824141936462144e-05, + "loss": 0.0892, + "step": 23480 + }, + { + "epoch": 1.536800785083415, + "grad_norm": 0.8969340324401855, + "learning_rate": 9.823900374464487e-05, + "loss": 0.1015, + "step": 23490 + }, + { + "epoch": 1.5374550212626759, + "grad_norm": 1.0403921604156494, + "learning_rate": 9.823658649647544e-05, + "loss": 0.0966, + "step": 23500 + }, + { + "epoch": 1.5381092574419366, + "grad_norm": 0.866157591342926, + "learning_rate": 9.82341676201947e-05, + "loss": 0.098, + "step": 23510 + }, + { + "epoch": 1.5387634936211971, + "grad_norm": 0.8696637153625488, + "learning_rate": 9.82317471158843e-05, + "loss": 0.0963, + "step": 23520 + }, + { + "epoch": 1.539417729800458, + "grad_norm": 0.7463034987449646, + "learning_rate": 9.822932498362593e-05, + "loss": 0.1031, + "step": 23530 + }, + { + "epoch": 1.5400719659797186, + "grad_norm": 0.8361960053443909, + "learning_rate": 9.822690122350138e-05, + "loss": 0.0929, + "step": 23540 + }, + { + "epoch": 1.5407262021589794, + "grad_norm": 0.8186551928520203, + "learning_rate": 9.822447583559242e-05, + "loss": 0.1008, + "step": 23550 + }, + { + "epoch": 1.5413804383382401, + "grad_norm": 0.8494019508361816, + "learning_rate": 9.822204881998093e-05, + "loss": 0.0876, + "step": 23560 + }, + { + "epoch": 1.5420346745175009, + "grad_norm": 0.9531555771827698, + "learning_rate": 9.821962017674881e-05, + "loss": 0.094, + "step": 23570 + }, + { + "epoch": 1.5426889106967616, + "grad_norm": 0.6968259215354919, + "learning_rate": 9.821718990597808e-05, + "loss": 0.0877, + "step": 23580 + }, + { + "epoch": 1.5433431468760221, + "grad_norm": 0.864067018032074, + "learning_rate": 9.821475800775072e-05, + "loss": 0.0921, + "step": 23590 + }, + { + "epoch": 1.5439973830552831, + "grad_norm": 0.6976091861724854, + "learning_rate": 9.821232448214883e-05, + "loss": 0.0921, + "step": 23600 + }, + { + "epoch": 1.5446516192345436, + "grad_norm": 0.83482426404953, + "learning_rate": 9.820988932925455e-05, + "loss": 0.1032, + "step": 23610 + }, + { + "epoch": 1.5453058554138044, + "grad_norm": 0.8034452199935913, + "learning_rate": 9.820745254915005e-05, + "loss": 0.0894, + "step": 23620 + }, + { + "epoch": 1.5459600915930651, + "grad_norm": 0.7795824408531189, + "learning_rate": 9.820501414191763e-05, + "loss": 0.0911, + "step": 23630 + }, + { + "epoch": 1.5466143277723257, + "grad_norm": 0.7786241769790649, + "learning_rate": 9.820257410763953e-05, + "loss": 0.0885, + "step": 23640 + }, + { + "epoch": 1.5472685639515866, + "grad_norm": 0.8625045418739319, + "learning_rate": 9.820013244639816e-05, + "loss": 0.0974, + "step": 23650 + }, + { + "epoch": 1.5479228001308472, + "grad_norm": 0.7645002603530884, + "learning_rate": 9.81976891582759e-05, + "loss": 0.097, + "step": 23660 + }, + { + "epoch": 1.548577036310108, + "grad_norm": 0.978847861289978, + "learning_rate": 9.819524424335524e-05, + "loss": 0.0958, + "step": 23670 + }, + { + "epoch": 1.5492312724893686, + "grad_norm": 0.9067314267158508, + "learning_rate": 9.819279770171867e-05, + "loss": 0.0973, + "step": 23680 + }, + { + "epoch": 1.5498855086686294, + "grad_norm": 0.8743839263916016, + "learning_rate": 9.819034953344881e-05, + "loss": 0.1002, + "step": 23690 + }, + { + "epoch": 1.5505397448478901, + "grad_norm": 0.7451719641685486, + "learning_rate": 9.818789973862825e-05, + "loss": 0.0889, + "step": 23700 + }, + { + "epoch": 1.5511939810271507, + "grad_norm": 0.833907961845398, + "learning_rate": 9.818544831733971e-05, + "loss": 0.092, + "step": 23710 + }, + { + "epoch": 1.5518482172064116, + "grad_norm": 0.8655661344528198, + "learning_rate": 9.81829952696659e-05, + "loss": 0.0924, + "step": 23720 + }, + { + "epoch": 1.5525024533856722, + "grad_norm": 0.8889705538749695, + "learning_rate": 9.818054059568966e-05, + "loss": 0.0926, + "step": 23730 + }, + { + "epoch": 1.553156689564933, + "grad_norm": 0.9665441513061523, + "learning_rate": 9.81780842954938e-05, + "loss": 0.105, + "step": 23740 + }, + { + "epoch": 1.5538109257441937, + "grad_norm": 0.8035663962364197, + "learning_rate": 9.817562636916125e-05, + "loss": 0.1007, + "step": 23750 + }, + { + "epoch": 1.5544651619234544, + "grad_norm": 0.8797777891159058, + "learning_rate": 9.817316681677496e-05, + "loss": 0.1106, + "step": 23760 + }, + { + "epoch": 1.5551193981027152, + "grad_norm": 0.799263060092926, + "learning_rate": 9.817070563841795e-05, + "loss": 0.0978, + "step": 23770 + }, + { + "epoch": 1.5557736342819757, + "grad_norm": 0.8599784970283508, + "learning_rate": 9.81682428341733e-05, + "loss": 0.1008, + "step": 23780 + }, + { + "epoch": 1.5564278704612367, + "grad_norm": 0.8856527805328369, + "learning_rate": 9.816577840412414e-05, + "loss": 0.0869, + "step": 23790 + }, + { + "epoch": 1.5570821066404972, + "grad_norm": 0.6909691691398621, + "learning_rate": 9.81633123483536e-05, + "loss": 0.0982, + "step": 23800 + }, + { + "epoch": 1.557736342819758, + "grad_norm": 0.8058643937110901, + "learning_rate": 9.8160844666945e-05, + "loss": 0.0944, + "step": 23810 + }, + { + "epoch": 1.5583905789990187, + "grad_norm": 1.1985560655593872, + "learning_rate": 9.815837535998156e-05, + "loss": 0.1016, + "step": 23820 + }, + { + "epoch": 1.5590448151782792, + "grad_norm": 0.8352839946746826, + "learning_rate": 9.815590442754666e-05, + "loss": 0.1072, + "step": 23830 + }, + { + "epoch": 1.5596990513575402, + "grad_norm": 0.9324560761451721, + "learning_rate": 9.815343186972369e-05, + "loss": 0.1034, + "step": 23840 + }, + { + "epoch": 1.5603532875368007, + "grad_norm": 0.8299842476844788, + "learning_rate": 9.81509576865961e-05, + "loss": 0.0935, + "step": 23850 + }, + { + "epoch": 1.5610075237160617, + "grad_norm": 0.9215813875198364, + "learning_rate": 9.814848187824742e-05, + "loss": 0.0986, + "step": 23860 + }, + { + "epoch": 1.5616617598953222, + "grad_norm": 0.8809934854507446, + "learning_rate": 9.81460044447612e-05, + "loss": 0.0994, + "step": 23870 + }, + { + "epoch": 1.562315996074583, + "grad_norm": 0.8337189555168152, + "learning_rate": 9.814352538622106e-05, + "loss": 0.09, + "step": 23880 + }, + { + "epoch": 1.5629702322538437, + "grad_norm": 0.8376109004020691, + "learning_rate": 9.814104470271068e-05, + "loss": 0.103, + "step": 23890 + }, + { + "epoch": 1.5636244684331042, + "grad_norm": 0.7608706951141357, + "learning_rate": 9.813856239431378e-05, + "loss": 0.1002, + "step": 23900 + }, + { + "epoch": 1.5642787046123652, + "grad_norm": 0.7375048995018005, + "learning_rate": 9.813607846111416e-05, + "loss": 0.0901, + "step": 23910 + }, + { + "epoch": 1.5649329407916257, + "grad_norm": 0.7545016407966614, + "learning_rate": 9.813359290319563e-05, + "loss": 0.1055, + "step": 23920 + }, + { + "epoch": 1.5655871769708865, + "grad_norm": 0.7844988703727722, + "learning_rate": 9.813110572064212e-05, + "loss": 0.0992, + "step": 23930 + }, + { + "epoch": 1.5662414131501472, + "grad_norm": 0.7850669622421265, + "learning_rate": 9.812861691353757e-05, + "loss": 0.0912, + "step": 23940 + }, + { + "epoch": 1.566895649329408, + "grad_norm": 1.1129896640777588, + "learning_rate": 9.812612648196598e-05, + "loss": 0.0932, + "step": 23950 + }, + { + "epoch": 1.5675498855086687, + "grad_norm": 0.9184969663619995, + "learning_rate": 9.81236344260114e-05, + "loss": 0.1044, + "step": 23960 + }, + { + "epoch": 1.5682041216879292, + "grad_norm": 0.790431559085846, + "learning_rate": 9.812114074575793e-05, + "loss": 0.1014, + "step": 23970 + }, + { + "epoch": 1.5688583578671902, + "grad_norm": 0.7852354645729065, + "learning_rate": 9.811864544128978e-05, + "loss": 0.0822, + "step": 23980 + }, + { + "epoch": 1.5695125940464507, + "grad_norm": 0.7458804249763489, + "learning_rate": 9.811614851269114e-05, + "loss": 0.0996, + "step": 23990 + }, + { + "epoch": 1.5701668302257115, + "grad_norm": 0.7905547022819519, + "learning_rate": 9.81136499600463e-05, + "loss": 0.0905, + "step": 24000 + }, + { + "epoch": 1.5708210664049722, + "grad_norm": 0.8950849771499634, + "learning_rate": 9.811114978343961e-05, + "loss": 0.0945, + "step": 24010 + }, + { + "epoch": 1.571475302584233, + "grad_norm": 0.9239217638969421, + "learning_rate": 9.810864798295541e-05, + "loss": 0.0896, + "step": 24020 + }, + { + "epoch": 1.5721295387634937, + "grad_norm": 0.8260421752929688, + "learning_rate": 9.810614455867818e-05, + "loss": 0.0826, + "step": 24030 + }, + { + "epoch": 1.5727837749427542, + "grad_norm": 0.8214983940124512, + "learning_rate": 9.810363951069241e-05, + "loss": 0.0918, + "step": 24040 + }, + { + "epoch": 1.5734380111220152, + "grad_norm": 0.7926087975502014, + "learning_rate": 9.810113283908266e-05, + "loss": 0.0858, + "step": 24050 + }, + { + "epoch": 1.5740922473012757, + "grad_norm": 0.8653861284255981, + "learning_rate": 9.809862454393352e-05, + "loss": 0.1005, + "step": 24060 + }, + { + "epoch": 1.5747464834805365, + "grad_norm": 0.846734881401062, + "learning_rate": 9.809611462532964e-05, + "loss": 0.0908, + "step": 24070 + }, + { + "epoch": 1.5754007196597972, + "grad_norm": 1.0754064321517944, + "learning_rate": 9.809360308335578e-05, + "loss": 0.1067, + "step": 24080 + }, + { + "epoch": 1.5760549558390577, + "grad_norm": 0.7325643301010132, + "learning_rate": 9.809108991809668e-05, + "loss": 0.0927, + "step": 24090 + }, + { + "epoch": 1.5767091920183187, + "grad_norm": 0.8486270904541016, + "learning_rate": 9.808857512963717e-05, + "loss": 0.1034, + "step": 24100 + }, + { + "epoch": 1.5773634281975792, + "grad_norm": 0.8908068537712097, + "learning_rate": 9.808605871806213e-05, + "loss": 0.1011, + "step": 24110 + }, + { + "epoch": 1.57801766437684, + "grad_norm": 0.9737763404846191, + "learning_rate": 9.808354068345649e-05, + "loss": 0.1027, + "step": 24120 + }, + { + "epoch": 1.5786719005561007, + "grad_norm": 0.9432488083839417, + "learning_rate": 9.808102102590526e-05, + "loss": 0.1, + "step": 24130 + }, + { + "epoch": 1.5793261367353615, + "grad_norm": 0.8621255159378052, + "learning_rate": 9.807849974549347e-05, + "loss": 0.0851, + "step": 24140 + }, + { + "epoch": 1.5799803729146222, + "grad_norm": 0.8411361575126648, + "learning_rate": 9.807597684230623e-05, + "loss": 0.106, + "step": 24150 + }, + { + "epoch": 1.5806346090938828, + "grad_norm": 0.8948150873184204, + "learning_rate": 9.807345231642868e-05, + "loss": 0.1006, + "step": 24160 + }, + { + "epoch": 1.5812888452731437, + "grad_norm": 1.042441725730896, + "learning_rate": 9.807092616794605e-05, + "loss": 0.1131, + "step": 24170 + }, + { + "epoch": 1.5819430814524043, + "grad_norm": 1.0746121406555176, + "learning_rate": 9.806839839694358e-05, + "loss": 0.0919, + "step": 24180 + }, + { + "epoch": 1.582597317631665, + "grad_norm": 0.8730924725532532, + "learning_rate": 9.806586900350658e-05, + "loss": 0.0961, + "step": 24190 + }, + { + "epoch": 1.5832515538109257, + "grad_norm": 0.7895834445953369, + "learning_rate": 9.806333798772047e-05, + "loss": 0.0892, + "step": 24200 + }, + { + "epoch": 1.5839057899901865, + "grad_norm": 0.786361038684845, + "learning_rate": 9.806080534967065e-05, + "loss": 0.0953, + "step": 24210 + }, + { + "epoch": 1.5845600261694472, + "grad_norm": 0.8377049565315247, + "learning_rate": 9.80582710894426e-05, + "loss": 0.1049, + "step": 24220 + }, + { + "epoch": 1.5852142623487078, + "grad_norm": 0.7862370014190674, + "learning_rate": 9.805573520712186e-05, + "loss": 0.1034, + "step": 24230 + }, + { + "epoch": 1.5858684985279687, + "grad_norm": 0.8017224073410034, + "learning_rate": 9.805319770279404e-05, + "loss": 0.0996, + "step": 24240 + }, + { + "epoch": 1.5865227347072293, + "grad_norm": 0.8779584765434265, + "learning_rate": 9.805065857654476e-05, + "loss": 0.1031, + "step": 24250 + }, + { + "epoch": 1.58717697088649, + "grad_norm": 0.8980815410614014, + "learning_rate": 9.804811782845974e-05, + "loss": 0.0956, + "step": 24260 + }, + { + "epoch": 1.5878312070657508, + "grad_norm": 1.0139700174331665, + "learning_rate": 9.804557545862474e-05, + "loss": 0.0996, + "step": 24270 + }, + { + "epoch": 1.5884854432450113, + "grad_norm": 0.8047105073928833, + "learning_rate": 9.804303146712555e-05, + "loss": 0.1042, + "step": 24280 + }, + { + "epoch": 1.5891396794242723, + "grad_norm": 0.7328765988349915, + "learning_rate": 9.804048585404806e-05, + "loss": 0.0914, + "step": 24290 + }, + { + "epoch": 1.5897939156035328, + "grad_norm": 0.866847813129425, + "learning_rate": 9.803793861947816e-05, + "loss": 0.0952, + "step": 24300 + }, + { + "epoch": 1.5904481517827938, + "grad_norm": 0.8152327537536621, + "learning_rate": 9.803538976350189e-05, + "loss": 0.0915, + "step": 24310 + }, + { + "epoch": 1.5911023879620543, + "grad_norm": 0.7884990572929382, + "learning_rate": 9.80328392862052e-05, + "loss": 0.0914, + "step": 24320 + }, + { + "epoch": 1.591756624141315, + "grad_norm": 1.0479592084884644, + "learning_rate": 9.803028718767423e-05, + "loss": 0.1004, + "step": 24330 + }, + { + "epoch": 1.5924108603205758, + "grad_norm": 0.8851988911628723, + "learning_rate": 9.80277334679951e-05, + "loss": 0.0979, + "step": 24340 + }, + { + "epoch": 1.5930650964998363, + "grad_norm": 0.8954311609268188, + "learning_rate": 9.8025178127254e-05, + "loss": 0.0914, + "step": 24350 + }, + { + "epoch": 1.5937193326790973, + "grad_norm": 0.8873519897460938, + "learning_rate": 9.80226211655372e-05, + "loss": 0.0978, + "step": 24360 + }, + { + "epoch": 1.5943735688583578, + "grad_norm": 0.8193051815032959, + "learning_rate": 9.8020062582931e-05, + "loss": 0.1028, + "step": 24370 + }, + { + "epoch": 1.5950278050376185, + "grad_norm": 0.8333913683891296, + "learning_rate": 9.801750237952172e-05, + "loss": 0.0951, + "step": 24380 + }, + { + "epoch": 1.5956820412168793, + "grad_norm": 0.8090634942054749, + "learning_rate": 9.801494055539584e-05, + "loss": 0.1018, + "step": 24390 + }, + { + "epoch": 1.59633627739614, + "grad_norm": 0.8224121332168579, + "learning_rate": 9.801237711063978e-05, + "loss": 0.099, + "step": 24400 + }, + { + "epoch": 1.5969905135754008, + "grad_norm": 1.034574031829834, + "learning_rate": 9.800981204534006e-05, + "loss": 0.1089, + "step": 24410 + }, + { + "epoch": 1.5976447497546613, + "grad_norm": 0.8645883202552795, + "learning_rate": 9.800724535958328e-05, + "loss": 0.094, + "step": 24420 + }, + { + "epoch": 1.5982989859339223, + "grad_norm": 0.8233097195625305, + "learning_rate": 9.800467705345607e-05, + "loss": 0.097, + "step": 24430 + }, + { + "epoch": 1.5989532221131828, + "grad_norm": 0.9189668893814087, + "learning_rate": 9.800210712704512e-05, + "loss": 0.1041, + "step": 24440 + }, + { + "epoch": 1.5996074582924436, + "grad_norm": 0.8542932271957397, + "learning_rate": 9.799953558043715e-05, + "loss": 0.0997, + "step": 24450 + }, + { + "epoch": 1.6002616944717043, + "grad_norm": 0.7710226774215698, + "learning_rate": 9.799696241371898e-05, + "loss": 0.1006, + "step": 24460 + }, + { + "epoch": 1.600915930650965, + "grad_norm": 0.937626302242279, + "learning_rate": 9.799438762697744e-05, + "loss": 0.1013, + "step": 24470 + }, + { + "epoch": 1.6015701668302258, + "grad_norm": 0.8287016153335571, + "learning_rate": 9.799181122029946e-05, + "loss": 0.0906, + "step": 24480 + }, + { + "epoch": 1.6022244030094863, + "grad_norm": 0.8471201062202454, + "learning_rate": 9.798923319377199e-05, + "loss": 0.0886, + "step": 24490 + }, + { + "epoch": 1.6028786391887473, + "grad_norm": 0.8005667924880981, + "learning_rate": 9.798665354748205e-05, + "loss": 0.0949, + "step": 24500 + }, + { + "epoch": 1.6035328753680078, + "grad_norm": 0.7234973311424255, + "learning_rate": 9.798407228151667e-05, + "loss": 0.0955, + "step": 24510 + }, + { + "epoch": 1.6041871115472686, + "grad_norm": 0.8595584630966187, + "learning_rate": 9.798148939596303e-05, + "loss": 0.0885, + "step": 24520 + }, + { + "epoch": 1.6048413477265293, + "grad_norm": 0.8628395199775696, + "learning_rate": 9.797890489090829e-05, + "loss": 0.0917, + "step": 24530 + }, + { + "epoch": 1.6054955839057898, + "grad_norm": 0.698773980140686, + "learning_rate": 9.797631876643967e-05, + "loss": 0.0888, + "step": 24540 + }, + { + "epoch": 1.6061498200850508, + "grad_norm": 0.7893082499504089, + "learning_rate": 9.797373102264448e-05, + "loss": 0.108, + "step": 24550 + }, + { + "epoch": 1.6068040562643113, + "grad_norm": 0.9154890775680542, + "learning_rate": 9.797114165961006e-05, + "loss": 0.1043, + "step": 24560 + }, + { + "epoch": 1.607458292443572, + "grad_norm": 0.9815983176231384, + "learning_rate": 9.796855067742378e-05, + "loss": 0.1072, + "step": 24570 + }, + { + "epoch": 1.6081125286228328, + "grad_norm": 0.8936287760734558, + "learning_rate": 9.796595807617313e-05, + "loss": 0.1025, + "step": 24580 + }, + { + "epoch": 1.6087667648020936, + "grad_norm": 0.953900933265686, + "learning_rate": 9.796336385594557e-05, + "loss": 0.0932, + "step": 24590 + }, + { + "epoch": 1.6094210009813543, + "grad_norm": 0.8990422487258911, + "learning_rate": 9.796076801682871e-05, + "loss": 0.0958, + "step": 24600 + }, + { + "epoch": 1.6100752371606148, + "grad_norm": 0.7732602953910828, + "learning_rate": 9.795817055891016e-05, + "loss": 0.0885, + "step": 24610 + }, + { + "epoch": 1.6107294733398758, + "grad_norm": 0.913487434387207, + "learning_rate": 9.795557148227756e-05, + "loss": 0.1008, + "step": 24620 + }, + { + "epoch": 1.6113837095191363, + "grad_norm": 0.9833926558494568, + "learning_rate": 9.795297078701867e-05, + "loss": 0.0969, + "step": 24630 + }, + { + "epoch": 1.612037945698397, + "grad_norm": 0.9888587594032288, + "learning_rate": 9.795036847322124e-05, + "loss": 0.1004, + "step": 24640 + }, + { + "epoch": 1.6126921818776578, + "grad_norm": 0.7544565796852112, + "learning_rate": 9.794776454097314e-05, + "loss": 0.0883, + "step": 24650 + }, + { + "epoch": 1.6133464180569186, + "grad_norm": 0.8678017258644104, + "learning_rate": 9.794515899036222e-05, + "loss": 0.0974, + "step": 24660 + }, + { + "epoch": 1.6140006542361793, + "grad_norm": 1.1111491918563843, + "learning_rate": 9.794255182147644e-05, + "loss": 0.0926, + "step": 24670 + }, + { + "epoch": 1.6146548904154399, + "grad_norm": 0.8813068270683289, + "learning_rate": 9.793994303440382e-05, + "loss": 0.0952, + "step": 24680 + }, + { + "epoch": 1.6153091265947008, + "grad_norm": 0.8787200450897217, + "learning_rate": 9.793733262923238e-05, + "loss": 0.1085, + "step": 24690 + }, + { + "epoch": 1.6159633627739614, + "grad_norm": 0.7039420008659363, + "learning_rate": 9.793472060605024e-05, + "loss": 0.099, + "step": 24700 + }, + { + "epoch": 1.616617598953222, + "grad_norm": 0.7518815994262695, + "learning_rate": 9.793210696494559e-05, + "loss": 0.0981, + "step": 24710 + }, + { + "epoch": 1.6172718351324828, + "grad_norm": 0.8269907832145691, + "learning_rate": 9.79294917060066e-05, + "loss": 0.089, + "step": 24720 + }, + { + "epoch": 1.6179260713117434, + "grad_norm": 0.8977356553077698, + "learning_rate": 9.792687482932158e-05, + "loss": 0.0868, + "step": 24730 + }, + { + "epoch": 1.6185803074910043, + "grad_norm": 0.7739750742912292, + "learning_rate": 9.792425633497883e-05, + "loss": 0.0873, + "step": 24740 + }, + { + "epoch": 1.6192345436702649, + "grad_norm": 0.9570137858390808, + "learning_rate": 9.792163622306676e-05, + "loss": 0.091, + "step": 24750 + }, + { + "epoch": 1.6198887798495258, + "grad_norm": 0.817414402961731, + "learning_rate": 9.791901449367378e-05, + "loss": 0.0953, + "step": 24760 + }, + { + "epoch": 1.6205430160287864, + "grad_norm": 0.7275998592376709, + "learning_rate": 9.791639114688837e-05, + "loss": 0.1121, + "step": 24770 + }, + { + "epoch": 1.6211972522080471, + "grad_norm": 0.828654944896698, + "learning_rate": 9.791376618279913e-05, + "loss": 0.0982, + "step": 24780 + }, + { + "epoch": 1.6218514883873079, + "grad_norm": 1.096555471420288, + "learning_rate": 9.791113960149458e-05, + "loss": 0.0948, + "step": 24790 + }, + { + "epoch": 1.6225057245665684, + "grad_norm": 0.7662184834480286, + "learning_rate": 9.790851140306345e-05, + "loss": 0.0966, + "step": 24800 + }, + { + "epoch": 1.6231599607458294, + "grad_norm": 0.7542035579681396, + "learning_rate": 9.790588158759441e-05, + "loss": 0.1002, + "step": 24810 + }, + { + "epoch": 1.6238141969250899, + "grad_norm": 0.8306244015693665, + "learning_rate": 9.790325015517622e-05, + "loss": 0.0923, + "step": 24820 + }, + { + "epoch": 1.6244684331043506, + "grad_norm": 0.8431245684623718, + "learning_rate": 9.790061710589771e-05, + "loss": 0.092, + "step": 24830 + }, + { + "epoch": 1.6251226692836114, + "grad_norm": 1.4137808084487915, + "learning_rate": 9.789798243984775e-05, + "loss": 0.0905, + "step": 24840 + }, + { + "epoch": 1.6257769054628721, + "grad_norm": 0.9978521466255188, + "learning_rate": 9.789534615711527e-05, + "loss": 0.0955, + "step": 24850 + }, + { + "epoch": 1.6264311416421329, + "grad_norm": 0.766831636428833, + "learning_rate": 9.789270825778923e-05, + "loss": 0.0896, + "step": 24860 + }, + { + "epoch": 1.6270853778213934, + "grad_norm": 0.8945890665054321, + "learning_rate": 9.78900687419587e-05, + "loss": 0.0953, + "step": 24870 + }, + { + "epoch": 1.6277396140006544, + "grad_norm": 0.9509278535842896, + "learning_rate": 9.788742760971274e-05, + "loss": 0.0944, + "step": 24880 + }, + { + "epoch": 1.628393850179915, + "grad_norm": 0.752797544002533, + "learning_rate": 9.788478486114052e-05, + "loss": 0.0847, + "step": 24890 + }, + { + "epoch": 1.6290480863591756, + "grad_norm": 0.8730521202087402, + "learning_rate": 9.788214049633123e-05, + "loss": 0.0901, + "step": 24900 + }, + { + "epoch": 1.6297023225384364, + "grad_norm": 0.9180911183357239, + "learning_rate": 9.78794945153741e-05, + "loss": 0.0958, + "step": 24910 + }, + { + "epoch": 1.6303565587176971, + "grad_norm": 0.9941650629043579, + "learning_rate": 9.787684691835849e-05, + "loss": 0.0961, + "step": 24920 + }, + { + "epoch": 1.6310107948969579, + "grad_norm": 0.9088178873062134, + "learning_rate": 9.787419770537371e-05, + "loss": 0.1026, + "step": 24930 + }, + { + "epoch": 1.6316650310762184, + "grad_norm": 0.8117355108261108, + "learning_rate": 9.787154687650923e-05, + "loss": 0.0913, + "step": 24940 + }, + { + "epoch": 1.6323192672554794, + "grad_norm": 0.7052477598190308, + "learning_rate": 9.786889443185449e-05, + "loss": 0.0911, + "step": 24950 + }, + { + "epoch": 1.63297350343474, + "grad_norm": 0.8779917359352112, + "learning_rate": 9.7866240371499e-05, + "loss": 0.0997, + "step": 24960 + }, + { + "epoch": 1.6336277396140007, + "grad_norm": 0.7817448377609253, + "learning_rate": 9.786358469553238e-05, + "loss": 0.0967, + "step": 24970 + }, + { + "epoch": 1.6342819757932614, + "grad_norm": 0.8375545144081116, + "learning_rate": 9.786092740404424e-05, + "loss": 0.1043, + "step": 24980 + }, + { + "epoch": 1.634936211972522, + "grad_norm": 1.144739031791687, + "learning_rate": 9.78582684971243e-05, + "loss": 0.1022, + "step": 24990 + }, + { + "epoch": 1.635590448151783, + "grad_norm": 1.2085695266723633, + "learning_rate": 9.785560797486227e-05, + "loss": 0.101, + "step": 25000 + }, + { + "epoch": 1.6362446843310434, + "grad_norm": 0.6930639147758484, + "learning_rate": 9.785294583734796e-05, + "loss": 0.0947, + "step": 25010 + }, + { + "epoch": 1.6368989205103042, + "grad_norm": 0.921447217464447, + "learning_rate": 9.785028208467123e-05, + "loss": 0.0978, + "step": 25020 + }, + { + "epoch": 1.637553156689565, + "grad_norm": 1.0130419731140137, + "learning_rate": 9.784761671692202e-05, + "loss": 0.0903, + "step": 25030 + }, + { + "epoch": 1.6382073928688257, + "grad_norm": 0.7007921934127808, + "learning_rate": 9.784494973419022e-05, + "loss": 0.0962, + "step": 25040 + }, + { + "epoch": 1.6388616290480864, + "grad_norm": 0.7728266716003418, + "learning_rate": 9.784228113656591e-05, + "loss": 0.0994, + "step": 25050 + }, + { + "epoch": 1.639515865227347, + "grad_norm": 0.8407819867134094, + "learning_rate": 9.783961092413914e-05, + "loss": 0.089, + "step": 25060 + }, + { + "epoch": 1.640170101406608, + "grad_norm": 0.9866244792938232, + "learning_rate": 9.783693909700002e-05, + "loss": 0.0996, + "step": 25070 + }, + { + "epoch": 1.6408243375858684, + "grad_norm": 0.7183135151863098, + "learning_rate": 9.783426565523877e-05, + "loss": 0.0873, + "step": 25080 + }, + { + "epoch": 1.6414785737651292, + "grad_norm": 0.886570394039154, + "learning_rate": 9.78315905989456e-05, + "loss": 0.1016, + "step": 25090 + }, + { + "epoch": 1.64213280994439, + "grad_norm": 0.887607753276825, + "learning_rate": 9.782891392821078e-05, + "loss": 0.0904, + "step": 25100 + }, + { + "epoch": 1.6427870461236507, + "grad_norm": 0.9125217795372009, + "learning_rate": 9.78262356431247e-05, + "loss": 0.0913, + "step": 25110 + }, + { + "epoch": 1.6434412823029114, + "grad_norm": 0.9861580729484558, + "learning_rate": 9.782355574377775e-05, + "loss": 0.0996, + "step": 25120 + }, + { + "epoch": 1.644095518482172, + "grad_norm": 1.0295411348342896, + "learning_rate": 9.782087423026036e-05, + "loss": 0.0995, + "step": 25130 + }, + { + "epoch": 1.644749754661433, + "grad_norm": 0.7599952816963196, + "learning_rate": 9.781819110266304e-05, + "loss": 0.0966, + "step": 25140 + }, + { + "epoch": 1.6454039908406934, + "grad_norm": 0.7582678198814392, + "learning_rate": 9.781550636107637e-05, + "loss": 0.0931, + "step": 25150 + }, + { + "epoch": 1.6460582270199542, + "grad_norm": 0.9556160569190979, + "learning_rate": 9.781282000559095e-05, + "loss": 0.1033, + "step": 25160 + }, + { + "epoch": 1.646712463199215, + "grad_norm": 0.8655904531478882, + "learning_rate": 9.781013203629748e-05, + "loss": 0.1, + "step": 25170 + }, + { + "epoch": 1.6473666993784757, + "grad_norm": 0.7407214045524597, + "learning_rate": 9.780744245328666e-05, + "loss": 0.0947, + "step": 25180 + }, + { + "epoch": 1.6480209355577364, + "grad_norm": 0.6862826943397522, + "learning_rate": 9.780475125664927e-05, + "loss": 0.0812, + "step": 25190 + }, + { + "epoch": 1.648675171736997, + "grad_norm": 0.862034022808075, + "learning_rate": 9.780205844647616e-05, + "loss": 0.0928, + "step": 25200 + }, + { + "epoch": 1.649329407916258, + "grad_norm": 0.8481500744819641, + "learning_rate": 9.77993640228582e-05, + "loss": 0.0957, + "step": 25210 + }, + { + "epoch": 1.6499836440955185, + "grad_norm": 1.0111854076385498, + "learning_rate": 9.779666798588637e-05, + "loss": 0.0879, + "step": 25220 + }, + { + "epoch": 1.6506378802747792, + "grad_norm": 0.6928382515907288, + "learning_rate": 9.779397033565164e-05, + "loss": 0.0917, + "step": 25230 + }, + { + "epoch": 1.65129211645404, + "grad_norm": 0.8151880502700806, + "learning_rate": 9.779127107224505e-05, + "loss": 0.1028, + "step": 25240 + }, + { + "epoch": 1.6519463526333005, + "grad_norm": 0.8269695043563843, + "learning_rate": 9.778857019575774e-05, + "loss": 0.1064, + "step": 25250 + }, + { + "epoch": 1.6526005888125614, + "grad_norm": 0.8382137417793274, + "learning_rate": 9.778586770628084e-05, + "loss": 0.0939, + "step": 25260 + }, + { + "epoch": 1.653254824991822, + "grad_norm": 0.8290400505065918, + "learning_rate": 9.778316360390558e-05, + "loss": 0.0899, + "step": 25270 + }, + { + "epoch": 1.6539090611710827, + "grad_norm": 0.8005163073539734, + "learning_rate": 9.778045788872324e-05, + "loss": 0.0892, + "step": 25280 + }, + { + "epoch": 1.6545632973503435, + "grad_norm": 0.9442570209503174, + "learning_rate": 9.777775056082514e-05, + "loss": 0.1007, + "step": 25290 + }, + { + "epoch": 1.6552175335296042, + "grad_norm": 0.7411683797836304, + "learning_rate": 9.777504162030267e-05, + "loss": 0.1008, + "step": 25300 + }, + { + "epoch": 1.655871769708865, + "grad_norm": 0.8269376158714294, + "learning_rate": 9.777233106724722e-05, + "loss": 0.0943, + "step": 25310 + }, + { + "epoch": 1.6565260058881255, + "grad_norm": 0.6999377608299255, + "learning_rate": 9.776961890175034e-05, + "loss": 0.106, + "step": 25320 + }, + { + "epoch": 1.6571802420673865, + "grad_norm": 0.8918529152870178, + "learning_rate": 9.776690512390352e-05, + "loss": 0.0957, + "step": 25330 + }, + { + "epoch": 1.657834478246647, + "grad_norm": 0.7899738550186157, + "learning_rate": 9.776418973379838e-05, + "loss": 0.0943, + "step": 25340 + }, + { + "epoch": 1.6584887144259077, + "grad_norm": 0.7438578009605408, + "learning_rate": 9.776147273152659e-05, + "loss": 0.0977, + "step": 25350 + }, + { + "epoch": 1.6591429506051685, + "grad_norm": 0.7523559331893921, + "learning_rate": 9.77587541171798e-05, + "loss": 0.0979, + "step": 25360 + }, + { + "epoch": 1.6597971867844292, + "grad_norm": 0.8759346008300781, + "learning_rate": 9.775603389084985e-05, + "loss": 0.1003, + "step": 25370 + }, + { + "epoch": 1.66045142296369, + "grad_norm": 0.7944019436836243, + "learning_rate": 9.775331205262847e-05, + "loss": 0.0903, + "step": 25380 + }, + { + "epoch": 1.6611056591429505, + "grad_norm": 0.809095561504364, + "learning_rate": 9.775058860260759e-05, + "loss": 0.1002, + "step": 25390 + }, + { + "epoch": 1.6617598953222115, + "grad_norm": 0.8237670660018921, + "learning_rate": 9.774786354087913e-05, + "loss": 0.0944, + "step": 25400 + }, + { + "epoch": 1.662414131501472, + "grad_norm": 0.8633749485015869, + "learning_rate": 9.774513686753504e-05, + "loss": 0.1029, + "step": 25410 + }, + { + "epoch": 1.6630683676807327, + "grad_norm": 0.7542330622673035, + "learning_rate": 9.774240858266735e-05, + "loss": 0.0886, + "step": 25420 + }, + { + "epoch": 1.6637226038599935, + "grad_norm": 0.8325693607330322, + "learning_rate": 9.773967868636818e-05, + "loss": 0.0929, + "step": 25430 + }, + { + "epoch": 1.664376840039254, + "grad_norm": 0.9114111065864563, + "learning_rate": 9.773694717872963e-05, + "loss": 0.0922, + "step": 25440 + }, + { + "epoch": 1.665031076218515, + "grad_norm": 0.732631266117096, + "learning_rate": 9.773421405984394e-05, + "loss": 0.0999, + "step": 25450 + }, + { + "epoch": 1.6656853123977755, + "grad_norm": 0.7207270860671997, + "learning_rate": 9.773147932980334e-05, + "loss": 0.0991, + "step": 25460 + }, + { + "epoch": 1.6663395485770363, + "grad_norm": 0.7336596250534058, + "learning_rate": 9.772874298870012e-05, + "loss": 0.096, + "step": 25470 + }, + { + "epoch": 1.666993784756297, + "grad_norm": 0.9117507338523865, + "learning_rate": 9.772600503662665e-05, + "loss": 0.0961, + "step": 25480 + }, + { + "epoch": 1.6676480209355578, + "grad_norm": 0.7073248028755188, + "learning_rate": 9.772326547367534e-05, + "loss": 0.0996, + "step": 25490 + }, + { + "epoch": 1.6683022571148185, + "grad_norm": 1.0052919387817383, + "learning_rate": 9.772052429993868e-05, + "loss": 0.1011, + "step": 25500 + }, + { + "epoch": 1.668956493294079, + "grad_norm": 0.8882834911346436, + "learning_rate": 9.771778151550917e-05, + "loss": 0.0876, + "step": 25510 + }, + { + "epoch": 1.66961072947334, + "grad_norm": 0.8214176893234253, + "learning_rate": 9.771503712047937e-05, + "loss": 0.0875, + "step": 25520 + }, + { + "epoch": 1.6702649656526005, + "grad_norm": 0.9225919842720032, + "learning_rate": 9.771229111494194e-05, + "loss": 0.0981, + "step": 25530 + }, + { + "epoch": 1.6709192018318613, + "grad_norm": 0.8596377968788147, + "learning_rate": 9.770954349898956e-05, + "loss": 0.0932, + "step": 25540 + }, + { + "epoch": 1.671573438011122, + "grad_norm": 0.7050160765647888, + "learning_rate": 9.770679427271496e-05, + "loss": 0.1051, + "step": 25550 + }, + { + "epoch": 1.6722276741903828, + "grad_norm": 1.0160956382751465, + "learning_rate": 9.770404343621094e-05, + "loss": 0.1, + "step": 25560 + }, + { + "epoch": 1.6728819103696435, + "grad_norm": 0.7766900062561035, + "learning_rate": 9.770129098957035e-05, + "loss": 0.0909, + "step": 25570 + }, + { + "epoch": 1.673536146548904, + "grad_norm": 0.9387828707695007, + "learning_rate": 9.769853693288608e-05, + "loss": 0.1045, + "step": 25580 + }, + { + "epoch": 1.674190382728165, + "grad_norm": 1.0153820514678955, + "learning_rate": 9.76957812662511e-05, + "loss": 0.0992, + "step": 25590 + }, + { + "epoch": 1.6748446189074255, + "grad_norm": 0.7925774455070496, + "learning_rate": 9.769302398975841e-05, + "loss": 0.0978, + "step": 25600 + }, + { + "epoch": 1.6754988550866863, + "grad_norm": 1.1735292673110962, + "learning_rate": 9.769026510350108e-05, + "loss": 0.1079, + "step": 25610 + }, + { + "epoch": 1.676153091265947, + "grad_norm": 0.7813097834587097, + "learning_rate": 9.768750460757223e-05, + "loss": 0.0903, + "step": 25620 + }, + { + "epoch": 1.6768073274452078, + "grad_norm": 0.8915982842445374, + "learning_rate": 9.768474250206504e-05, + "loss": 0.0922, + "step": 25630 + }, + { + "epoch": 1.6774615636244685, + "grad_norm": 0.8092440366744995, + "learning_rate": 9.768197878707273e-05, + "loss": 0.0983, + "step": 25640 + }, + { + "epoch": 1.678115799803729, + "grad_norm": 0.9614975452423096, + "learning_rate": 9.767921346268858e-05, + "loss": 0.103, + "step": 25650 + }, + { + "epoch": 1.67877003598299, + "grad_norm": 0.6998571753501892, + "learning_rate": 9.767644652900594e-05, + "loss": 0.0918, + "step": 25660 + }, + { + "epoch": 1.6794242721622505, + "grad_norm": 0.7789784073829651, + "learning_rate": 9.76736779861182e-05, + "loss": 0.0903, + "step": 25670 + }, + { + "epoch": 1.6800785083415113, + "grad_norm": 0.7573550939559937, + "learning_rate": 9.767090783411878e-05, + "loss": 0.0869, + "step": 25680 + }, + { + "epoch": 1.680732744520772, + "grad_norm": 0.8103246688842773, + "learning_rate": 9.766813607310122e-05, + "loss": 0.0959, + "step": 25690 + }, + { + "epoch": 1.6813869807000326, + "grad_norm": 0.7668115496635437, + "learning_rate": 9.766536270315903e-05, + "loss": 0.1045, + "step": 25700 + }, + { + "epoch": 1.6820412168792935, + "grad_norm": 0.7344299554824829, + "learning_rate": 9.766258772438586e-05, + "loss": 0.098, + "step": 25710 + }, + { + "epoch": 1.682695453058554, + "grad_norm": 0.9814144968986511, + "learning_rate": 9.765981113687534e-05, + "loss": 0.1032, + "step": 25720 + }, + { + "epoch": 1.6833496892378148, + "grad_norm": 0.9255467653274536, + "learning_rate": 9.765703294072121e-05, + "loss": 0.0991, + "step": 25730 + }, + { + "epoch": 1.6840039254170756, + "grad_norm": 0.8468577861785889, + "learning_rate": 9.765425313601724e-05, + "loss": 0.0978, + "step": 25740 + }, + { + "epoch": 1.6846581615963363, + "grad_norm": 0.8852717876434326, + "learning_rate": 9.765147172285725e-05, + "loss": 0.1103, + "step": 25750 + }, + { + "epoch": 1.685312397775597, + "grad_norm": 0.6971980333328247, + "learning_rate": 9.764868870133511e-05, + "loss": 0.0939, + "step": 25760 + }, + { + "epoch": 1.6859666339548576, + "grad_norm": 0.9403474926948547, + "learning_rate": 9.764590407154476e-05, + "loss": 0.1042, + "step": 25770 + }, + { + "epoch": 1.6866208701341185, + "grad_norm": 0.8653038740158081, + "learning_rate": 9.76431178335802e-05, + "loss": 0.097, + "step": 25780 + }, + { + "epoch": 1.687275106313379, + "grad_norm": 0.8561285138130188, + "learning_rate": 9.764032998753547e-05, + "loss": 0.0911, + "step": 25790 + }, + { + "epoch": 1.6879293424926398, + "grad_norm": 0.7791575789451599, + "learning_rate": 9.763754053350465e-05, + "loss": 0.0927, + "step": 25800 + }, + { + "epoch": 1.6885835786719006, + "grad_norm": 0.7109609842300415, + "learning_rate": 9.76347494715819e-05, + "loss": 0.0879, + "step": 25810 + }, + { + "epoch": 1.6892378148511613, + "grad_norm": 0.7509888410568237, + "learning_rate": 9.763195680186143e-05, + "loss": 0.0846, + "step": 25820 + }, + { + "epoch": 1.689892051030422, + "grad_norm": 0.727888286113739, + "learning_rate": 9.762916252443751e-05, + "loss": 0.0913, + "step": 25830 + }, + { + "epoch": 1.6905462872096826, + "grad_norm": 0.8703082203865051, + "learning_rate": 9.762636663940443e-05, + "loss": 0.0902, + "step": 25840 + }, + { + "epoch": 1.6912005233889436, + "grad_norm": 0.8051765561103821, + "learning_rate": 9.762356914685658e-05, + "loss": 0.093, + "step": 25850 + }, + { + "epoch": 1.691854759568204, + "grad_norm": 0.8503502607345581, + "learning_rate": 9.762077004688836e-05, + "loss": 0.0913, + "step": 25860 + }, + { + "epoch": 1.6925089957474648, + "grad_norm": 0.7812398672103882, + "learning_rate": 9.761796933959428e-05, + "loss": 0.0876, + "step": 25870 + }, + { + "epoch": 1.6931632319267256, + "grad_norm": 0.8080005049705505, + "learning_rate": 9.761516702506886e-05, + "loss": 0.0872, + "step": 25880 + }, + { + "epoch": 1.693817468105986, + "grad_norm": 0.7798691391944885, + "learning_rate": 9.761236310340665e-05, + "loss": 0.0895, + "step": 25890 + }, + { + "epoch": 1.694471704285247, + "grad_norm": 0.7314333915710449, + "learning_rate": 9.760955757470233e-05, + "loss": 0.0925, + "step": 25900 + }, + { + "epoch": 1.6951259404645076, + "grad_norm": 0.8456811904907227, + "learning_rate": 9.760675043905058e-05, + "loss": 0.1005, + "step": 25910 + }, + { + "epoch": 1.6957801766437686, + "grad_norm": 0.6989087462425232, + "learning_rate": 9.760394169654615e-05, + "loss": 0.106, + "step": 25920 + }, + { + "epoch": 1.696434412823029, + "grad_norm": 1.0372560024261475, + "learning_rate": 9.760113134728384e-05, + "loss": 0.0865, + "step": 25930 + }, + { + "epoch": 1.6970886490022898, + "grad_norm": 1.2391959428787231, + "learning_rate": 9.75983193913585e-05, + "loss": 0.1008, + "step": 25940 + }, + { + "epoch": 1.6977428851815506, + "grad_norm": 0.8071995973587036, + "learning_rate": 9.759550582886506e-05, + "loss": 0.094, + "step": 25950 + }, + { + "epoch": 1.6983971213608111, + "grad_norm": 0.8598625063896179, + "learning_rate": 9.759269065989848e-05, + "loss": 0.0938, + "step": 25960 + }, + { + "epoch": 1.699051357540072, + "grad_norm": 0.853850245475769, + "learning_rate": 9.758987388455377e-05, + "loss": 0.087, + "step": 25970 + }, + { + "epoch": 1.6997055937193326, + "grad_norm": 0.8352051973342896, + "learning_rate": 9.7587055502926e-05, + "loss": 0.0959, + "step": 25980 + }, + { + "epoch": 1.7003598298985934, + "grad_norm": 0.7932831048965454, + "learning_rate": 9.758423551511031e-05, + "loss": 0.0852, + "step": 25990 + }, + { + "epoch": 1.701014066077854, + "grad_norm": 0.8113059997558594, + "learning_rate": 9.758141392120188e-05, + "loss": 0.099, + "step": 26000 + }, + { + "epoch": 1.7016683022571149, + "grad_norm": 0.9581121802330017, + "learning_rate": 9.757859072129594e-05, + "loss": 0.0961, + "step": 26010 + }, + { + "epoch": 1.7023225384363756, + "grad_norm": 0.897538423538208, + "learning_rate": 9.757576591548778e-05, + "loss": 0.0892, + "step": 26020 + }, + { + "epoch": 1.7029767746156361, + "grad_norm": 1.0049359798431396, + "learning_rate": 9.757293950387275e-05, + "loss": 0.1075, + "step": 26030 + }, + { + "epoch": 1.703631010794897, + "grad_norm": 1.0087964534759521, + "learning_rate": 9.757011148654625e-05, + "loss": 0.1066, + "step": 26040 + }, + { + "epoch": 1.7042852469741576, + "grad_norm": 0.9875262379646301, + "learning_rate": 9.756728186360373e-05, + "loss": 0.0973, + "step": 26050 + }, + { + "epoch": 1.7049394831534184, + "grad_norm": 0.837603747844696, + "learning_rate": 9.75644506351407e-05, + "loss": 0.1014, + "step": 26060 + }, + { + "epoch": 1.7055937193326791, + "grad_norm": 0.7701033353805542, + "learning_rate": 9.756161780125271e-05, + "loss": 0.0923, + "step": 26070 + }, + { + "epoch": 1.7062479555119399, + "grad_norm": 0.8201101422309875, + "learning_rate": 9.755878336203539e-05, + "loss": 0.1013, + "step": 26080 + }, + { + "epoch": 1.7069021916912006, + "grad_norm": 0.8958601951599121, + "learning_rate": 9.755594731758441e-05, + "loss": 0.0874, + "step": 26090 + }, + { + "epoch": 1.7075564278704611, + "grad_norm": 0.8118382692337036, + "learning_rate": 9.755310966799546e-05, + "loss": 0.0847, + "step": 26100 + }, + { + "epoch": 1.708210664049722, + "grad_norm": 0.8663161993026733, + "learning_rate": 9.755027041336439e-05, + "loss": 0.0967, + "step": 26110 + }, + { + "epoch": 1.7088649002289826, + "grad_norm": 0.7478710412979126, + "learning_rate": 9.754742955378696e-05, + "loss": 0.0861, + "step": 26120 + }, + { + "epoch": 1.7095191364082434, + "grad_norm": 0.9378357529640198, + "learning_rate": 9.75445870893591e-05, + "loss": 0.0999, + "step": 26130 + }, + { + "epoch": 1.7101733725875041, + "grad_norm": 0.8898491263389587, + "learning_rate": 9.754174302017671e-05, + "loss": 0.096, + "step": 26140 + }, + { + "epoch": 1.7108276087667647, + "grad_norm": 0.9357932806015015, + "learning_rate": 9.753889734633583e-05, + "loss": 0.0884, + "step": 26150 + }, + { + "epoch": 1.7114818449460256, + "grad_norm": 0.7866037487983704, + "learning_rate": 9.753605006793249e-05, + "loss": 0.0864, + "step": 26160 + }, + { + "epoch": 1.7121360811252861, + "grad_norm": 0.938848614692688, + "learning_rate": 9.75332011850628e-05, + "loss": 0.0993, + "step": 26170 + }, + { + "epoch": 1.712790317304547, + "grad_norm": 0.7685279846191406, + "learning_rate": 9.753035069782288e-05, + "loss": 0.0905, + "step": 26180 + }, + { + "epoch": 1.7134445534838076, + "grad_norm": 0.7881633043289185, + "learning_rate": 9.7527498606309e-05, + "loss": 0.0956, + "step": 26190 + }, + { + "epoch": 1.7140987896630684, + "grad_norm": 0.8515254259109497, + "learning_rate": 9.752464491061738e-05, + "loss": 0.0971, + "step": 26200 + }, + { + "epoch": 1.7147530258423291, + "grad_norm": 1.0867912769317627, + "learning_rate": 9.752178961084438e-05, + "loss": 0.1, + "step": 26210 + }, + { + "epoch": 1.7154072620215897, + "grad_norm": 0.8622691631317139, + "learning_rate": 9.751893270708631e-05, + "loss": 0.0981, + "step": 26220 + }, + { + "epoch": 1.7160614982008506, + "grad_norm": 0.8179900050163269, + "learning_rate": 9.751607419943966e-05, + "loss": 0.0891, + "step": 26230 + }, + { + "epoch": 1.7167157343801112, + "grad_norm": 0.9989860653877258, + "learning_rate": 9.75132140880009e-05, + "loss": 0.0906, + "step": 26240 + }, + { + "epoch": 1.717369970559372, + "grad_norm": 0.8796525597572327, + "learning_rate": 9.751035237286654e-05, + "loss": 0.0915, + "step": 26250 + }, + { + "epoch": 1.7180242067386327, + "grad_norm": 0.7103739380836487, + "learning_rate": 9.750748905413321e-05, + "loss": 0.0898, + "step": 26260 + }, + { + "epoch": 1.7186784429178934, + "grad_norm": 0.8540217876434326, + "learning_rate": 9.75046241318975e-05, + "loss": 0.0971, + "step": 26270 + }, + { + "epoch": 1.7193326790971541, + "grad_norm": 0.8470193147659302, + "learning_rate": 9.750175760625616e-05, + "loss": 0.0885, + "step": 26280 + }, + { + "epoch": 1.7199869152764147, + "grad_norm": 0.8234091401100159, + "learning_rate": 9.749888947730592e-05, + "loss": 0.0924, + "step": 26290 + }, + { + "epoch": 1.7206411514556756, + "grad_norm": 0.7980528473854065, + "learning_rate": 9.749601974514358e-05, + "loss": 0.0907, + "step": 26300 + }, + { + "epoch": 1.7212953876349362, + "grad_norm": 0.8894267082214355, + "learning_rate": 9.749314840986603e-05, + "loss": 0.0888, + "step": 26310 + }, + { + "epoch": 1.721949623814197, + "grad_norm": 0.9575929641723633, + "learning_rate": 9.749027547157015e-05, + "loss": 0.098, + "step": 26320 + }, + { + "epoch": 1.7226038599934577, + "grad_norm": 0.9381182193756104, + "learning_rate": 9.748740093035293e-05, + "loss": 0.0851, + "step": 26330 + }, + { + "epoch": 1.7232580961727182, + "grad_norm": 0.8991712927818298, + "learning_rate": 9.748452478631139e-05, + "loss": 0.1015, + "step": 26340 + }, + { + "epoch": 1.7239123323519792, + "grad_norm": 0.9150116443634033, + "learning_rate": 9.74816470395426e-05, + "loss": 0.095, + "step": 26350 + }, + { + "epoch": 1.7245665685312397, + "grad_norm": 0.9800270199775696, + "learning_rate": 9.74787676901437e-05, + "loss": 0.0929, + "step": 26360 + }, + { + "epoch": 1.7252208047105007, + "grad_norm": 0.8880261182785034, + "learning_rate": 9.747588673821187e-05, + "loss": 0.0904, + "step": 26370 + }, + { + "epoch": 1.7258750408897612, + "grad_norm": 0.8214205503463745, + "learning_rate": 9.747300418384436e-05, + "loss": 0.0933, + "step": 26380 + }, + { + "epoch": 1.726529277069022, + "grad_norm": 0.9119507074356079, + "learning_rate": 9.747012002713846e-05, + "loss": 0.0881, + "step": 26390 + }, + { + "epoch": 1.7271835132482827, + "grad_norm": 0.8520674109458923, + "learning_rate": 9.746723426819151e-05, + "loss": 0.0915, + "step": 26400 + }, + { + "epoch": 1.7278377494275432, + "grad_norm": 0.836610734462738, + "learning_rate": 9.74643469071009e-05, + "loss": 0.0977, + "step": 26410 + }, + { + "epoch": 1.7284919856068042, + "grad_norm": 1.0091043710708618, + "learning_rate": 9.746145794396412e-05, + "loss": 0.0978, + "step": 26420 + }, + { + "epoch": 1.7291462217860647, + "grad_norm": 0.77135169506073, + "learning_rate": 9.745856737887866e-05, + "loss": 0.099, + "step": 26430 + }, + { + "epoch": 1.7298004579653254, + "grad_norm": 0.9269153475761414, + "learning_rate": 9.745567521194207e-05, + "loss": 0.0883, + "step": 26440 + }, + { + "epoch": 1.7304546941445862, + "grad_norm": 0.8424230813980103, + "learning_rate": 9.7452781443252e-05, + "loss": 0.0935, + "step": 26450 + }, + { + "epoch": 1.731108930323847, + "grad_norm": 0.7595318555831909, + "learning_rate": 9.744988607290611e-05, + "loss": 0.0949, + "step": 26460 + }, + { + "epoch": 1.7317631665031077, + "grad_norm": 0.7194068431854248, + "learning_rate": 9.744698910100211e-05, + "loss": 0.099, + "step": 26470 + }, + { + "epoch": 1.7324174026823682, + "grad_norm": 0.8426297307014465, + "learning_rate": 9.74440905276378e-05, + "loss": 0.0878, + "step": 26480 + }, + { + "epoch": 1.7330716388616292, + "grad_norm": 0.8402136564254761, + "learning_rate": 9.744119035291101e-05, + "loss": 0.094, + "step": 26490 + }, + { + "epoch": 1.7337258750408897, + "grad_norm": 0.9715792536735535, + "learning_rate": 9.743828857691963e-05, + "loss": 0.094, + "step": 26500 + }, + { + "epoch": 1.7343801112201505, + "grad_norm": 0.8805036544799805, + "learning_rate": 9.74353851997616e-05, + "loss": 0.1009, + "step": 26510 + }, + { + "epoch": 1.7350343473994112, + "grad_norm": 0.9216516017913818, + "learning_rate": 9.743248022153491e-05, + "loss": 0.0903, + "step": 26520 + }, + { + "epoch": 1.735688583578672, + "grad_norm": 0.7972573041915894, + "learning_rate": 9.742957364233763e-05, + "loss": 0.089, + "step": 26530 + }, + { + "epoch": 1.7363428197579327, + "grad_norm": 0.8343783617019653, + "learning_rate": 9.742666546226784e-05, + "loss": 0.082, + "step": 26540 + }, + { + "epoch": 1.7369970559371932, + "grad_norm": 1.03281831741333, + "learning_rate": 9.74237556814237e-05, + "loss": 0.0882, + "step": 26550 + }, + { + "epoch": 1.7376512921164542, + "grad_norm": 0.9120453596115112, + "learning_rate": 9.742084429990344e-05, + "loss": 0.0942, + "step": 26560 + }, + { + "epoch": 1.7383055282957147, + "grad_norm": 0.752260684967041, + "learning_rate": 9.741793131780532e-05, + "loss": 0.105, + "step": 26570 + }, + { + "epoch": 1.7389597644749755, + "grad_norm": 0.8960766792297363, + "learning_rate": 9.741501673522767e-05, + "loss": 0.1, + "step": 26580 + }, + { + "epoch": 1.7396140006542362, + "grad_norm": 0.9580662250518799, + "learning_rate": 9.741210055226883e-05, + "loss": 0.0972, + "step": 26590 + }, + { + "epoch": 1.7402682368334967, + "grad_norm": 0.8709340691566467, + "learning_rate": 9.740918276902726e-05, + "loss": 0.0925, + "step": 26600 + }, + { + "epoch": 1.7409224730127577, + "grad_norm": 0.8603937029838562, + "learning_rate": 9.740626338560146e-05, + "loss": 0.099, + "step": 26610 + }, + { + "epoch": 1.7415767091920182, + "grad_norm": 0.8637518286705017, + "learning_rate": 9.740334240208992e-05, + "loss": 0.1037, + "step": 26620 + }, + { + "epoch": 1.742230945371279, + "grad_norm": 0.954017698764801, + "learning_rate": 9.740041981859126e-05, + "loss": 0.0986, + "step": 26630 + }, + { + "epoch": 1.7428851815505397, + "grad_norm": 0.9584331512451172, + "learning_rate": 9.739749563520413e-05, + "loss": 0.0905, + "step": 26640 + }, + { + "epoch": 1.7435394177298005, + "grad_norm": 0.8306182026863098, + "learning_rate": 9.73945698520272e-05, + "loss": 0.0921, + "step": 26650 + }, + { + "epoch": 1.7441936539090612, + "grad_norm": 0.9190852642059326, + "learning_rate": 9.739164246915926e-05, + "loss": 0.0971, + "step": 26660 + }, + { + "epoch": 1.7448478900883218, + "grad_norm": 0.8570190072059631, + "learning_rate": 9.738871348669907e-05, + "loss": 0.0914, + "step": 26670 + }, + { + "epoch": 1.7455021262675827, + "grad_norm": 0.7864238023757935, + "learning_rate": 9.738578290474554e-05, + "loss": 0.0974, + "step": 26680 + }, + { + "epoch": 1.7461563624468432, + "grad_norm": 0.9038723707199097, + "learning_rate": 9.738285072339755e-05, + "loss": 0.0914, + "step": 26690 + }, + { + "epoch": 1.746810598626104, + "grad_norm": 1.1634936332702637, + "learning_rate": 9.73799169427541e-05, + "loss": 0.0965, + "step": 26700 + }, + { + "epoch": 1.7474648348053647, + "grad_norm": 0.830987811088562, + "learning_rate": 9.737698156291418e-05, + "loss": 0.1034, + "step": 26710 + }, + { + "epoch": 1.7481190709846255, + "grad_norm": 0.7684716582298279, + "learning_rate": 9.737404458397688e-05, + "loss": 0.0894, + "step": 26720 + }, + { + "epoch": 1.7487733071638862, + "grad_norm": 1.0796003341674805, + "learning_rate": 9.737110600604135e-05, + "loss": 0.1027, + "step": 26730 + }, + { + "epoch": 1.7494275433431468, + "grad_norm": 0.8072198629379272, + "learning_rate": 9.736816582920674e-05, + "loss": 0.0956, + "step": 26740 + }, + { + "epoch": 1.7500817795224077, + "grad_norm": 0.9745908379554749, + "learning_rate": 9.736522405357231e-05, + "loss": 0.0923, + "step": 26750 + }, + { + "epoch": 1.7507360157016683, + "grad_norm": 0.9096083641052246, + "learning_rate": 9.736228067923735e-05, + "loss": 0.0863, + "step": 26760 + }, + { + "epoch": 1.751390251880929, + "grad_norm": 0.7566020488739014, + "learning_rate": 9.73593357063012e-05, + "loss": 0.0921, + "step": 26770 + }, + { + "epoch": 1.7520444880601898, + "grad_norm": 0.9019480347633362, + "learning_rate": 9.735638913486327e-05, + "loss": 0.0888, + "step": 26780 + }, + { + "epoch": 1.7526987242394503, + "grad_norm": 0.8336036205291748, + "learning_rate": 9.735344096502302e-05, + "loss": 0.0952, + "step": 26790 + }, + { + "epoch": 1.7533529604187112, + "grad_norm": 0.8614187240600586, + "learning_rate": 9.735049119687993e-05, + "loss": 0.0946, + "step": 26800 + }, + { + "epoch": 1.7540071965979718, + "grad_norm": 0.9070919752120972, + "learning_rate": 9.73475398305336e-05, + "loss": 0.0853, + "step": 26810 + }, + { + "epoch": 1.7546614327772327, + "grad_norm": 0.8516311049461365, + "learning_rate": 9.734458686608361e-05, + "loss": 0.0972, + "step": 26820 + }, + { + "epoch": 1.7553156689564933, + "grad_norm": 1.2225861549377441, + "learning_rate": 9.734163230362965e-05, + "loss": 0.1026, + "step": 26830 + }, + { + "epoch": 1.755969905135754, + "grad_norm": 0.8379483222961426, + "learning_rate": 9.733867614327145e-05, + "loss": 0.0914, + "step": 26840 + }, + { + "epoch": 1.7566241413150148, + "grad_norm": 1.0506983995437622, + "learning_rate": 9.733571838510878e-05, + "loss": 0.089, + "step": 26850 + }, + { + "epoch": 1.7572783774942753, + "grad_norm": 0.8693654537200928, + "learning_rate": 9.733275902924146e-05, + "loss": 0.1042, + "step": 26860 + }, + { + "epoch": 1.7579326136735363, + "grad_norm": 0.8979597091674805, + "learning_rate": 9.732979807576941e-05, + "loss": 0.0866, + "step": 26870 + }, + { + "epoch": 1.7585868498527968, + "grad_norm": 1.0074806213378906, + "learning_rate": 9.732683552479252e-05, + "loss": 0.1029, + "step": 26880 + }, + { + "epoch": 1.7592410860320575, + "grad_norm": 0.7297909259796143, + "learning_rate": 9.732387137641084e-05, + "loss": 0.1038, + "step": 26890 + }, + { + "epoch": 1.7598953222113183, + "grad_norm": 0.7585257887840271, + "learning_rate": 9.732090563072437e-05, + "loss": 0.1036, + "step": 26900 + }, + { + "epoch": 1.760549558390579, + "grad_norm": 0.7857770919799805, + "learning_rate": 9.731793828783323e-05, + "loss": 0.0912, + "step": 26910 + }, + { + "epoch": 1.7612037945698398, + "grad_norm": 0.7956094741821289, + "learning_rate": 9.731496934783759e-05, + "loss": 0.0953, + "step": 26920 + }, + { + "epoch": 1.7618580307491003, + "grad_norm": 0.9437527656555176, + "learning_rate": 9.731199881083763e-05, + "loss": 0.0901, + "step": 26930 + }, + { + "epoch": 1.7625122669283613, + "grad_norm": 0.733113169670105, + "learning_rate": 9.730902667693365e-05, + "loss": 0.0945, + "step": 26940 + }, + { + "epoch": 1.7631665031076218, + "grad_norm": 0.9366680383682251, + "learning_rate": 9.730605294622593e-05, + "loss": 0.1043, + "step": 26950 + }, + { + "epoch": 1.7638207392868825, + "grad_norm": 0.963205873966217, + "learning_rate": 9.730307761881487e-05, + "loss": 0.1127, + "step": 26960 + }, + { + "epoch": 1.7644749754661433, + "grad_norm": 0.8156423568725586, + "learning_rate": 9.730010069480088e-05, + "loss": 0.088, + "step": 26970 + }, + { + "epoch": 1.765129211645404, + "grad_norm": 1.1396329402923584, + "learning_rate": 9.729712217428444e-05, + "loss": 0.0834, + "step": 26980 + }, + { + "epoch": 1.7657834478246648, + "grad_norm": 0.7981417775154114, + "learning_rate": 9.72941420573661e-05, + "loss": 0.0932, + "step": 26990 + }, + { + "epoch": 1.7664376840039253, + "grad_norm": 0.8251798748970032, + "learning_rate": 9.729116034414641e-05, + "loss": 0.085, + "step": 27000 + }, + { + "epoch": 1.7670919201831863, + "grad_norm": 0.8225140571594238, + "learning_rate": 9.728817703472604e-05, + "loss": 0.0906, + "step": 27010 + }, + { + "epoch": 1.7677461563624468, + "grad_norm": 0.8406652808189392, + "learning_rate": 9.728519212920568e-05, + "loss": 0.0907, + "step": 27020 + }, + { + "epoch": 1.7684003925417076, + "grad_norm": 0.7829009294509888, + "learning_rate": 9.728220562768607e-05, + "loss": 0.0933, + "step": 27030 + }, + { + "epoch": 1.7690546287209683, + "grad_norm": 0.6848630309104919, + "learning_rate": 9.727921753026802e-05, + "loss": 0.0981, + "step": 27040 + }, + { + "epoch": 1.7697088649002288, + "grad_norm": 0.7441691756248474, + "learning_rate": 9.727622783705239e-05, + "loss": 0.0891, + "step": 27050 + }, + { + "epoch": 1.7703631010794898, + "grad_norm": 0.8913453221321106, + "learning_rate": 9.727323654814009e-05, + "loss": 0.0987, + "step": 27060 + }, + { + "epoch": 1.7710173372587503, + "grad_norm": 0.7993099093437195, + "learning_rate": 9.727024366363206e-05, + "loss": 0.0904, + "step": 27070 + }, + { + "epoch": 1.771671573438011, + "grad_norm": 0.821160078048706, + "learning_rate": 9.726724918362935e-05, + "loss": 0.0987, + "step": 27080 + }, + { + "epoch": 1.7723258096172718, + "grad_norm": 0.7259571552276611, + "learning_rate": 9.7264253108233e-05, + "loss": 0.0973, + "step": 27090 + }, + { + "epoch": 1.7729800457965326, + "grad_norm": 0.8779374361038208, + "learning_rate": 9.726125543754417e-05, + "loss": 0.0934, + "step": 27100 + }, + { + "epoch": 1.7736342819757933, + "grad_norm": 0.7990883588790894, + "learning_rate": 9.725825617166402e-05, + "loss": 0.0946, + "step": 27110 + }, + { + "epoch": 1.7742885181550538, + "grad_norm": 0.7716463804244995, + "learning_rate": 9.725525531069377e-05, + "loss": 0.0956, + "step": 27120 + }, + { + "epoch": 1.7749427543343148, + "grad_norm": 1.0061755180358887, + "learning_rate": 9.725225285473473e-05, + "loss": 0.0804, + "step": 27130 + }, + { + "epoch": 1.7755969905135753, + "grad_norm": 0.9990642070770264, + "learning_rate": 9.724924880388824e-05, + "loss": 0.1089, + "step": 27140 + }, + { + "epoch": 1.776251226692836, + "grad_norm": 0.8501982688903809, + "learning_rate": 9.72462431582557e-05, + "loss": 0.0938, + "step": 27150 + }, + { + "epoch": 1.7769054628720968, + "grad_norm": 0.8450808525085449, + "learning_rate": 9.724323591793851e-05, + "loss": 0.0846, + "step": 27160 + }, + { + "epoch": 1.7775596990513576, + "grad_norm": 0.6056027412414551, + "learning_rate": 9.724022708303824e-05, + "loss": 0.0928, + "step": 27170 + }, + { + "epoch": 1.7782139352306183, + "grad_norm": 0.7293991446495056, + "learning_rate": 9.723721665365639e-05, + "loss": 0.0927, + "step": 27180 + }, + { + "epoch": 1.7788681714098789, + "grad_norm": 0.8087413311004639, + "learning_rate": 9.723420462989461e-05, + "loss": 0.0866, + "step": 27190 + }, + { + "epoch": 1.7795224075891398, + "grad_norm": 0.9952290654182434, + "learning_rate": 9.723119101185455e-05, + "loss": 0.0868, + "step": 27200 + }, + { + "epoch": 1.7801766437684003, + "grad_norm": 0.6939850449562073, + "learning_rate": 9.722817579963789e-05, + "loss": 0.0953, + "step": 27210 + }, + { + "epoch": 1.780830879947661, + "grad_norm": 0.7775471806526184, + "learning_rate": 9.722515899334647e-05, + "loss": 0.0985, + "step": 27220 + }, + { + "epoch": 1.7814851161269218, + "grad_norm": 0.824198842048645, + "learning_rate": 9.722214059308208e-05, + "loss": 0.0992, + "step": 27230 + }, + { + "epoch": 1.7821393523061824, + "grad_norm": 0.8829124569892883, + "learning_rate": 9.72191205989466e-05, + "loss": 0.095, + "step": 27240 + }, + { + "epoch": 1.7827935884854433, + "grad_norm": 0.8177944421768188, + "learning_rate": 9.721609901104194e-05, + "loss": 0.094, + "step": 27250 + }, + { + "epoch": 1.7834478246647039, + "grad_norm": 1.004304051399231, + "learning_rate": 9.721307582947014e-05, + "loss": 0.091, + "step": 27260 + }, + { + "epoch": 1.7841020608439648, + "grad_norm": 0.8551913499832153, + "learning_rate": 9.721005105433319e-05, + "loss": 0.0965, + "step": 27270 + }, + { + "epoch": 1.7847562970232254, + "grad_norm": 0.7581825256347656, + "learning_rate": 9.720702468573321e-05, + "loss": 0.0934, + "step": 27280 + }, + { + "epoch": 1.785410533202486, + "grad_norm": 0.761549711227417, + "learning_rate": 9.720399672377234e-05, + "loss": 0.0978, + "step": 27290 + }, + { + "epoch": 1.7860647693817469, + "grad_norm": 0.8848084211349487, + "learning_rate": 9.72009671685528e-05, + "loss": 0.0811, + "step": 27300 + }, + { + "epoch": 1.7867190055610074, + "grad_norm": 0.8144980669021606, + "learning_rate": 9.719793602017681e-05, + "loss": 0.0844, + "step": 27310 + }, + { + "epoch": 1.7873732417402683, + "grad_norm": 0.8796505928039551, + "learning_rate": 9.71949032787467e-05, + "loss": 0.0904, + "step": 27320 + }, + { + "epoch": 1.7880274779195289, + "grad_norm": 0.8658892512321472, + "learning_rate": 9.719186894436484e-05, + "loss": 0.0914, + "step": 27330 + }, + { + "epoch": 1.7886817140987896, + "grad_norm": 0.9171237349510193, + "learning_rate": 9.718883301713363e-05, + "loss": 0.0923, + "step": 27340 + }, + { + "epoch": 1.7893359502780504, + "grad_norm": 0.7952425479888916, + "learning_rate": 9.718579549715555e-05, + "loss": 0.1069, + "step": 27350 + }, + { + "epoch": 1.7899901864573111, + "grad_norm": 0.8469095230102539, + "learning_rate": 9.718275638453312e-05, + "loss": 0.0942, + "step": 27360 + }, + { + "epoch": 1.7906444226365719, + "grad_norm": 0.8001088500022888, + "learning_rate": 9.717971567936892e-05, + "loss": 0.0931, + "step": 27370 + }, + { + "epoch": 1.7912986588158324, + "grad_norm": 0.9924861192703247, + "learning_rate": 9.71766733817656e-05, + "loss": 0.0924, + "step": 27380 + }, + { + "epoch": 1.7919528949950934, + "grad_norm": 0.79388028383255, + "learning_rate": 9.71736294918258e-05, + "loss": 0.0949, + "step": 27390 + }, + { + "epoch": 1.7926071311743539, + "grad_norm": 0.8403041362762451, + "learning_rate": 9.71705840096523e-05, + "loss": 0.084, + "step": 27400 + }, + { + "epoch": 1.7932613673536146, + "grad_norm": 0.9250532984733582, + "learning_rate": 9.716753693534791e-05, + "loss": 0.092, + "step": 27410 + }, + { + "epoch": 1.7939156035328754, + "grad_norm": 0.8198657035827637, + "learning_rate": 9.716448826901541e-05, + "loss": 0.096, + "step": 27420 + }, + { + "epoch": 1.7945698397121361, + "grad_norm": 0.7525361180305481, + "learning_rate": 9.716143801075775e-05, + "loss": 0.0925, + "step": 27430 + }, + { + "epoch": 1.7952240758913969, + "grad_norm": 0.7295049428939819, + "learning_rate": 9.715838616067786e-05, + "loss": 0.0912, + "step": 27440 + }, + { + "epoch": 1.7958783120706574, + "grad_norm": 0.9026070833206177, + "learning_rate": 9.715533271887876e-05, + "loss": 0.0941, + "step": 27450 + }, + { + "epoch": 1.7965325482499184, + "grad_norm": 0.9286572933197021, + "learning_rate": 9.715227768546354e-05, + "loss": 0.0895, + "step": 27460 + }, + { + "epoch": 1.797186784429179, + "grad_norm": 0.6571982502937317, + "learning_rate": 9.714922106053526e-05, + "loss": 0.0912, + "step": 27470 + }, + { + "epoch": 1.7978410206084396, + "grad_norm": 0.7618511915206909, + "learning_rate": 9.71461628441971e-05, + "loss": 0.0883, + "step": 27480 + }, + { + "epoch": 1.7984952567877004, + "grad_norm": 0.8936156630516052, + "learning_rate": 9.714310303655234e-05, + "loss": 0.0926, + "step": 27490 + }, + { + "epoch": 1.799149492966961, + "grad_norm": 0.7239774465560913, + "learning_rate": 9.71400416377042e-05, + "loss": 0.0846, + "step": 27500 + }, + { + "epoch": 1.7998037291462219, + "grad_norm": 0.7614884376525879, + "learning_rate": 9.713697864775601e-05, + "loss": 0.1061, + "step": 27510 + }, + { + "epoch": 1.8004579653254824, + "grad_norm": 0.8885960578918457, + "learning_rate": 9.713391406681118e-05, + "loss": 0.0838, + "step": 27520 + }, + { + "epoch": 1.8011122015047432, + "grad_norm": 0.7284700274467468, + "learning_rate": 9.713084789497315e-05, + "loss": 0.0917, + "step": 27530 + }, + { + "epoch": 1.801766437684004, + "grad_norm": 0.7352108955383301, + "learning_rate": 9.712778013234538e-05, + "loss": 0.0833, + "step": 27540 + }, + { + "epoch": 1.8024206738632647, + "grad_norm": 0.7528089880943298, + "learning_rate": 9.712471077903144e-05, + "loss": 0.0889, + "step": 27550 + }, + { + "epoch": 1.8030749100425254, + "grad_norm": 0.895668625831604, + "learning_rate": 9.712163983513491e-05, + "loss": 0.0957, + "step": 27560 + }, + { + "epoch": 1.803729146221786, + "grad_norm": 0.7745876908302307, + "learning_rate": 9.711856730075948e-05, + "loss": 0.1034, + "step": 27570 + }, + { + "epoch": 1.804383382401047, + "grad_norm": 0.9422993659973145, + "learning_rate": 9.711549317600881e-05, + "loss": 0.0915, + "step": 27580 + }, + { + "epoch": 1.8050376185803074, + "grad_norm": 0.8201385140419006, + "learning_rate": 9.711241746098669e-05, + "loss": 0.092, + "step": 27590 + }, + { + "epoch": 1.8056918547595682, + "grad_norm": 0.8114073872566223, + "learning_rate": 9.710934015579693e-05, + "loss": 0.0939, + "step": 27600 + }, + { + "epoch": 1.806346090938829, + "grad_norm": 0.742363452911377, + "learning_rate": 9.710626126054338e-05, + "loss": 0.0994, + "step": 27610 + }, + { + "epoch": 1.8070003271180897, + "grad_norm": 0.9187309145927429, + "learning_rate": 9.710318077532998e-05, + "loss": 0.0937, + "step": 27620 + }, + { + "epoch": 1.8076545632973504, + "grad_norm": 0.9056969881057739, + "learning_rate": 9.71000987002607e-05, + "loss": 0.0975, + "step": 27630 + }, + { + "epoch": 1.808308799476611, + "grad_norm": 0.7385011911392212, + "learning_rate": 9.709701503543954e-05, + "loss": 0.1049, + "step": 27640 + }, + { + "epoch": 1.808963035655872, + "grad_norm": 0.8550733327865601, + "learning_rate": 9.709392978097061e-05, + "loss": 0.0927, + "step": 27650 + }, + { + "epoch": 1.8096172718351324, + "grad_norm": 0.8086677193641663, + "learning_rate": 9.709084293695806e-05, + "loss": 0.0882, + "step": 27660 + }, + { + "epoch": 1.8102715080143932, + "grad_norm": 0.8741077184677124, + "learning_rate": 9.708775450350605e-05, + "loss": 0.0907, + "step": 27670 + }, + { + "epoch": 1.810925744193654, + "grad_norm": 0.7944101095199585, + "learning_rate": 9.708466448071884e-05, + "loss": 0.084, + "step": 27680 + }, + { + "epoch": 1.8115799803729145, + "grad_norm": 0.9104844331741333, + "learning_rate": 9.708157286870072e-05, + "loss": 0.1128, + "step": 27690 + }, + { + "epoch": 1.8122342165521754, + "grad_norm": 0.8403039574623108, + "learning_rate": 9.707847966755604e-05, + "loss": 0.0835, + "step": 27700 + }, + { + "epoch": 1.812888452731436, + "grad_norm": 0.8909661769866943, + "learning_rate": 9.707538487738918e-05, + "loss": 0.0925, + "step": 27710 + }, + { + "epoch": 1.813542688910697, + "grad_norm": 0.8909949660301208, + "learning_rate": 9.707228849830465e-05, + "loss": 0.0944, + "step": 27720 + }, + { + "epoch": 1.8141969250899574, + "grad_norm": 1.029722809791565, + "learning_rate": 9.706919053040692e-05, + "loss": 0.1, + "step": 27730 + }, + { + "epoch": 1.8148511612692182, + "grad_norm": 0.7977635264396667, + "learning_rate": 9.706609097380058e-05, + "loss": 0.0894, + "step": 27740 + }, + { + "epoch": 1.815505397448479, + "grad_norm": 0.9570349454879761, + "learning_rate": 9.706298982859021e-05, + "loss": 0.0977, + "step": 27750 + }, + { + "epoch": 1.8161596336277395, + "grad_norm": 0.8739515542984009, + "learning_rate": 9.705988709488052e-05, + "loss": 0.1012, + "step": 27760 + }, + { + "epoch": 1.8168138698070004, + "grad_norm": 0.8953158259391785, + "learning_rate": 9.705678277277622e-05, + "loss": 0.1025, + "step": 27770 + }, + { + "epoch": 1.817468105986261, + "grad_norm": 0.8194689154624939, + "learning_rate": 9.70536768623821e-05, + "loss": 0.0959, + "step": 27780 + }, + { + "epoch": 1.8181223421655217, + "grad_norm": 1.037415623664856, + "learning_rate": 9.705056936380296e-05, + "loss": 0.0908, + "step": 27790 + }, + { + "epoch": 1.8187765783447825, + "grad_norm": 1.1118489503860474, + "learning_rate": 9.704746027714372e-05, + "loss": 0.0972, + "step": 27800 + }, + { + "epoch": 1.8194308145240432, + "grad_norm": 0.847061276435852, + "learning_rate": 9.704434960250931e-05, + "loss": 0.0857, + "step": 27810 + }, + { + "epoch": 1.820085050703304, + "grad_norm": 1.020371437072754, + "learning_rate": 9.704123734000473e-05, + "loss": 0.1029, + "step": 27820 + }, + { + "epoch": 1.8207392868825645, + "grad_norm": 1.1115399599075317, + "learning_rate": 9.703812348973501e-05, + "loss": 0.0988, + "step": 27830 + }, + { + "epoch": 1.8213935230618254, + "grad_norm": 0.8873423337936401, + "learning_rate": 9.703500805180527e-05, + "loss": 0.0971, + "step": 27840 + }, + { + "epoch": 1.822047759241086, + "grad_norm": 0.7394191026687622, + "learning_rate": 9.703189102632064e-05, + "loss": 0.0968, + "step": 27850 + }, + { + "epoch": 1.8227019954203467, + "grad_norm": 0.965347945690155, + "learning_rate": 9.702877241338635e-05, + "loss": 0.0818, + "step": 27860 + }, + { + "epoch": 1.8233562315996075, + "grad_norm": 0.8741346597671509, + "learning_rate": 9.702565221310766e-05, + "loss": 0.1045, + "step": 27870 + }, + { + "epoch": 1.8240104677788682, + "grad_norm": 0.7647256255149841, + "learning_rate": 9.702253042558986e-05, + "loss": 0.0899, + "step": 27880 + }, + { + "epoch": 1.824664703958129, + "grad_norm": 0.8359330296516418, + "learning_rate": 9.701940705093835e-05, + "loss": 0.095, + "step": 27890 + }, + { + "epoch": 1.8253189401373895, + "grad_norm": 0.9208669066429138, + "learning_rate": 9.701628208925855e-05, + "loss": 0.101, + "step": 27900 + }, + { + "epoch": 1.8259731763166505, + "grad_norm": 0.9185352325439453, + "learning_rate": 9.70131555406559e-05, + "loss": 0.0914, + "step": 27910 + }, + { + "epoch": 1.826627412495911, + "grad_norm": 0.8828587532043457, + "learning_rate": 9.701002740523597e-05, + "loss": 0.0881, + "step": 27920 + }, + { + "epoch": 1.8272816486751717, + "grad_norm": 0.8667250275611877, + "learning_rate": 9.700689768310434e-05, + "loss": 0.0945, + "step": 27930 + }, + { + "epoch": 1.8279358848544325, + "grad_norm": 0.9793092012405396, + "learning_rate": 9.700376637436662e-05, + "loss": 0.1026, + "step": 27940 + }, + { + "epoch": 1.828590121033693, + "grad_norm": 0.8025292158126831, + "learning_rate": 9.70006334791285e-05, + "loss": 0.0846, + "step": 27950 + }, + { + "epoch": 1.829244357212954, + "grad_norm": 0.9942945241928101, + "learning_rate": 9.699749899749576e-05, + "loss": 0.09, + "step": 27960 + }, + { + "epoch": 1.8298985933922145, + "grad_norm": 0.8808379769325256, + "learning_rate": 9.699436292957414e-05, + "loss": 0.0887, + "step": 27970 + }, + { + "epoch": 1.8305528295714752, + "grad_norm": 0.8378937840461731, + "learning_rate": 9.699122527546955e-05, + "loss": 0.0885, + "step": 27980 + }, + { + "epoch": 1.831207065750736, + "grad_norm": 0.7837172150611877, + "learning_rate": 9.698808603528786e-05, + "loss": 0.0812, + "step": 27990 + }, + { + "epoch": 1.8318613019299967, + "grad_norm": 0.7706719636917114, + "learning_rate": 9.698494520913503e-05, + "loss": 0.0887, + "step": 28000 + }, + { + "epoch": 1.8325155381092575, + "grad_norm": 1.1025103330612183, + "learning_rate": 9.69818027971171e-05, + "loss": 0.0896, + "step": 28010 + }, + { + "epoch": 1.833169774288518, + "grad_norm": 0.9141650795936584, + "learning_rate": 9.697865879934009e-05, + "loss": 0.1002, + "step": 28020 + }, + { + "epoch": 1.833824010467779, + "grad_norm": 0.7713013887405396, + "learning_rate": 9.697551321591014e-05, + "loss": 0.1056, + "step": 28030 + }, + { + "epoch": 1.8344782466470395, + "grad_norm": 0.8746879696846008, + "learning_rate": 9.697236604693343e-05, + "loss": 0.1024, + "step": 28040 + }, + { + "epoch": 1.8351324828263003, + "grad_norm": 0.8916308283805847, + "learning_rate": 9.696921729251617e-05, + "loss": 0.0951, + "step": 28050 + }, + { + "epoch": 1.835786719005561, + "grad_norm": 0.7855492830276489, + "learning_rate": 9.696606695276464e-05, + "loss": 0.0857, + "step": 28060 + }, + { + "epoch": 1.8364409551848218, + "grad_norm": 0.8872267603874207, + "learning_rate": 9.696291502778519e-05, + "loss": 0.0942, + "step": 28070 + }, + { + "epoch": 1.8370951913640825, + "grad_norm": 0.8778836727142334, + "learning_rate": 9.695976151768419e-05, + "loss": 0.0933, + "step": 28080 + }, + { + "epoch": 1.837749427543343, + "grad_norm": 0.8608387112617493, + "learning_rate": 9.695660642256807e-05, + "loss": 0.0811, + "step": 28090 + }, + { + "epoch": 1.838403663722604, + "grad_norm": 0.9035623073577881, + "learning_rate": 9.695344974254336e-05, + "loss": 0.1077, + "step": 28100 + }, + { + "epoch": 1.8390578999018645, + "grad_norm": 1.024272084236145, + "learning_rate": 9.695029147771655e-05, + "loss": 0.0941, + "step": 28110 + }, + { + "epoch": 1.8397121360811253, + "grad_norm": 0.9248595833778381, + "learning_rate": 9.69471316281943e-05, + "loss": 0.1043, + "step": 28120 + }, + { + "epoch": 1.840366372260386, + "grad_norm": 0.7749372720718384, + "learning_rate": 9.694397019408322e-05, + "loss": 0.0868, + "step": 28130 + }, + { + "epoch": 1.8410206084396468, + "grad_norm": 0.88482666015625, + "learning_rate": 9.694080717549004e-05, + "loss": 0.101, + "step": 28140 + }, + { + "epoch": 1.8416748446189075, + "grad_norm": 0.8212443590164185, + "learning_rate": 9.693764257252149e-05, + "loss": 0.0838, + "step": 28150 + }, + { + "epoch": 1.842329080798168, + "grad_norm": 0.7402942180633545, + "learning_rate": 9.693447638528443e-05, + "loss": 0.0853, + "step": 28160 + }, + { + "epoch": 1.842983316977429, + "grad_norm": 0.8029096126556396, + "learning_rate": 9.693130861388569e-05, + "loss": 0.081, + "step": 28170 + }, + { + "epoch": 1.8436375531566895, + "grad_norm": 0.9067233204841614, + "learning_rate": 9.69281392584322e-05, + "loss": 0.0928, + "step": 28180 + }, + { + "epoch": 1.8442917893359503, + "grad_norm": 0.959036111831665, + "learning_rate": 9.692496831903092e-05, + "loss": 0.0982, + "step": 28190 + }, + { + "epoch": 1.844946025515211, + "grad_norm": 1.1850526332855225, + "learning_rate": 9.692179579578893e-05, + "loss": 0.0981, + "step": 28200 + }, + { + "epoch": 1.8456002616944716, + "grad_norm": 0.9170256853103638, + "learning_rate": 9.691862168881325e-05, + "loss": 0.0931, + "step": 28210 + }, + { + "epoch": 1.8462544978737325, + "grad_norm": 0.894639790058136, + "learning_rate": 9.691544599821105e-05, + "loss": 0.0802, + "step": 28220 + }, + { + "epoch": 1.846908734052993, + "grad_norm": 0.8013020157814026, + "learning_rate": 9.69122687240895e-05, + "loss": 0.0937, + "step": 28230 + }, + { + "epoch": 1.8475629702322538, + "grad_norm": 0.93445885181427, + "learning_rate": 9.690908986655586e-05, + "loss": 0.0911, + "step": 28240 + }, + { + "epoch": 1.8482172064115145, + "grad_norm": 0.7632222771644592, + "learning_rate": 9.69059094257174e-05, + "loss": 0.0953, + "step": 28250 + }, + { + "epoch": 1.8488714425907753, + "grad_norm": 0.9063422083854675, + "learning_rate": 9.690272740168149e-05, + "loss": 0.0885, + "step": 28260 + }, + { + "epoch": 1.849525678770036, + "grad_norm": 0.652265191078186, + "learning_rate": 9.689954379455552e-05, + "loss": 0.0847, + "step": 28270 + }, + { + "epoch": 1.8501799149492966, + "grad_norm": 0.8725261688232422, + "learning_rate": 9.689635860444696e-05, + "loss": 0.092, + "step": 28280 + }, + { + "epoch": 1.8508341511285575, + "grad_norm": 0.7449600696563721, + "learning_rate": 9.689317183146329e-05, + "loss": 0.0878, + "step": 28290 + }, + { + "epoch": 1.851488387307818, + "grad_norm": 0.8442542552947998, + "learning_rate": 9.68899834757121e-05, + "loss": 0.0938, + "step": 28300 + }, + { + "epoch": 1.8521426234870788, + "grad_norm": 0.9524549841880798, + "learning_rate": 9.6886793537301e-05, + "loss": 0.1059, + "step": 28310 + }, + { + "epoch": 1.8527968596663396, + "grad_norm": 0.9726237654685974, + "learning_rate": 9.688360201633763e-05, + "loss": 0.0942, + "step": 28320 + }, + { + "epoch": 1.8534510958456003, + "grad_norm": 0.991133987903595, + "learning_rate": 9.688040891292976e-05, + "loss": 0.1008, + "step": 28330 + }, + { + "epoch": 1.854105332024861, + "grad_norm": 0.9573308229446411, + "learning_rate": 9.687721422718512e-05, + "loss": 0.0898, + "step": 28340 + }, + { + "epoch": 1.8547595682041216, + "grad_norm": 1.1355527639389038, + "learning_rate": 9.687401795921156e-05, + "loss": 0.095, + "step": 28350 + }, + { + "epoch": 1.8554138043833825, + "grad_norm": 0.7798793315887451, + "learning_rate": 9.687082010911698e-05, + "loss": 0.0883, + "step": 28360 + }, + { + "epoch": 1.856068040562643, + "grad_norm": 0.5980040431022644, + "learning_rate": 9.686762067700928e-05, + "loss": 0.0922, + "step": 28370 + }, + { + "epoch": 1.8567222767419038, + "grad_norm": 0.9269242286682129, + "learning_rate": 9.686441966299649e-05, + "loss": 0.0893, + "step": 28380 + }, + { + "epoch": 1.8573765129211646, + "grad_norm": 0.6925560235977173, + "learning_rate": 9.68612170671866e-05, + "loss": 0.0957, + "step": 28390 + }, + { + "epoch": 1.858030749100425, + "grad_norm": 0.8151103258132935, + "learning_rate": 9.685801288968777e-05, + "loss": 0.0925, + "step": 28400 + }, + { + "epoch": 1.858684985279686, + "grad_norm": 0.9811034798622131, + "learning_rate": 9.685480713060808e-05, + "loss": 0.1071, + "step": 28410 + }, + { + "epoch": 1.8593392214589466, + "grad_norm": 0.9472489953041077, + "learning_rate": 9.68515997900558e-05, + "loss": 0.089, + "step": 28420 + }, + { + "epoch": 1.8599934576382076, + "grad_norm": 0.8960433602333069, + "learning_rate": 9.684839086813913e-05, + "loss": 0.1004, + "step": 28430 + }, + { + "epoch": 1.860647693817468, + "grad_norm": 0.9435994625091553, + "learning_rate": 9.684518036496641e-05, + "loss": 0.0928, + "step": 28440 + }, + { + "epoch": 1.8613019299967288, + "grad_norm": 0.8257802128791809, + "learning_rate": 9.6841968280646e-05, + "loss": 0.0994, + "step": 28450 + }, + { + "epoch": 1.8619561661759896, + "grad_norm": 0.8301753997802734, + "learning_rate": 9.683875461528632e-05, + "loss": 0.086, + "step": 28460 + }, + { + "epoch": 1.86261040235525, + "grad_norm": 0.8830812573432922, + "learning_rate": 9.683553936899583e-05, + "loss": 0.0972, + "step": 28470 + }, + { + "epoch": 1.863264638534511, + "grad_norm": 0.7733142971992493, + "learning_rate": 9.683232254188305e-05, + "loss": 0.1017, + "step": 28480 + }, + { + "epoch": 1.8639188747137716, + "grad_norm": 0.7784489393234253, + "learning_rate": 9.682910413405657e-05, + "loss": 0.0959, + "step": 28490 + }, + { + "epoch": 1.8645731108930323, + "grad_norm": 0.9781941771507263, + "learning_rate": 9.6825884145625e-05, + "loss": 0.0977, + "step": 28500 + }, + { + "epoch": 1.865227347072293, + "grad_norm": 0.9205982685089111, + "learning_rate": 9.682266257669703e-05, + "loss": 0.1062, + "step": 28510 + }, + { + "epoch": 1.8658815832515538, + "grad_norm": 1.1955499649047852, + "learning_rate": 9.681943942738141e-05, + "loss": 0.1022, + "step": 28520 + }, + { + "epoch": 1.8665358194308146, + "grad_norm": 0.8939968943595886, + "learning_rate": 9.681621469778692e-05, + "loss": 0.0958, + "step": 28530 + }, + { + "epoch": 1.8671900556100751, + "grad_norm": 0.7560677528381348, + "learning_rate": 9.681298838802242e-05, + "loss": 0.0943, + "step": 28540 + }, + { + "epoch": 1.867844291789336, + "grad_norm": 0.8495007753372192, + "learning_rate": 9.680976049819677e-05, + "loss": 0.0946, + "step": 28550 + }, + { + "epoch": 1.8684985279685966, + "grad_norm": 0.8184399008750916, + "learning_rate": 9.680653102841895e-05, + "loss": 0.0948, + "step": 28560 + }, + { + "epoch": 1.8691527641478574, + "grad_norm": 1.0168190002441406, + "learning_rate": 9.680329997879795e-05, + "loss": 0.0905, + "step": 28570 + }, + { + "epoch": 1.869807000327118, + "grad_norm": 0.8939940929412842, + "learning_rate": 9.680006734944283e-05, + "loss": 0.0849, + "step": 28580 + }, + { + "epoch": 1.8704612365063789, + "grad_norm": 0.9259584546089172, + "learning_rate": 9.679683314046269e-05, + "loss": 0.0986, + "step": 28590 + }, + { + "epoch": 1.8711154726856396, + "grad_norm": 0.8003655076026917, + "learning_rate": 9.679359735196671e-05, + "loss": 0.087, + "step": 28600 + }, + { + "epoch": 1.8717697088649001, + "grad_norm": 0.8092601895332336, + "learning_rate": 9.67903599840641e-05, + "loss": 0.0937, + "step": 28610 + }, + { + "epoch": 1.872423945044161, + "grad_norm": 0.757056713104248, + "learning_rate": 9.678712103686413e-05, + "loss": 0.0857, + "step": 28620 + }, + { + "epoch": 1.8730781812234216, + "grad_norm": 0.903866171836853, + "learning_rate": 9.678388051047611e-05, + "loss": 0.0948, + "step": 28630 + }, + { + "epoch": 1.8737324174026824, + "grad_norm": 0.8220004439353943, + "learning_rate": 9.678063840500944e-05, + "loss": 0.0914, + "step": 28640 + }, + { + "epoch": 1.8743866535819431, + "grad_norm": 0.8988321423530579, + "learning_rate": 9.677739472057354e-05, + "loss": 0.0916, + "step": 28650 + }, + { + "epoch": 1.8750408897612036, + "grad_norm": 0.8908348679542542, + "learning_rate": 9.677414945727787e-05, + "loss": 0.0882, + "step": 28660 + }, + { + "epoch": 1.8756951259404646, + "grad_norm": 0.7631081938743591, + "learning_rate": 9.677090261523201e-05, + "loss": 0.0929, + "step": 28670 + }, + { + "epoch": 1.8763493621197251, + "grad_norm": 0.7455977201461792, + "learning_rate": 9.676765419454552e-05, + "loss": 0.0902, + "step": 28680 + }, + { + "epoch": 1.8770035982989859, + "grad_norm": 0.9186832904815674, + "learning_rate": 9.676440419532804e-05, + "loss": 0.0948, + "step": 28690 + }, + { + "epoch": 1.8776578344782466, + "grad_norm": 0.8906334042549133, + "learning_rate": 9.676115261768928e-05, + "loss": 0.0814, + "step": 28700 + }, + { + "epoch": 1.8783120706575074, + "grad_norm": 0.9550861716270447, + "learning_rate": 9.675789946173897e-05, + "loss": 0.0939, + "step": 28710 + }, + { + "epoch": 1.8789663068367681, + "grad_norm": 0.7002192735671997, + "learning_rate": 9.675464472758695e-05, + "loss": 0.0872, + "step": 28720 + }, + { + "epoch": 1.8796205430160287, + "grad_norm": 0.9470962882041931, + "learning_rate": 9.675138841534303e-05, + "loss": 0.0865, + "step": 28730 + }, + { + "epoch": 1.8802747791952896, + "grad_norm": 0.8719517588615417, + "learning_rate": 9.674813052511715e-05, + "loss": 0.0844, + "step": 28740 + }, + { + "epoch": 1.8809290153745502, + "grad_norm": 0.8722051978111267, + "learning_rate": 9.674487105701926e-05, + "loss": 0.0913, + "step": 28750 + }, + { + "epoch": 1.881583251553811, + "grad_norm": 0.8734009265899658, + "learning_rate": 9.674161001115938e-05, + "loss": 0.0878, + "step": 28760 + }, + { + "epoch": 1.8822374877330716, + "grad_norm": 0.8401185274124146, + "learning_rate": 9.673834738764759e-05, + "loss": 0.0889, + "step": 28770 + }, + { + "epoch": 1.8828917239123324, + "grad_norm": 0.8197746872901917, + "learning_rate": 9.6735083186594e-05, + "loss": 0.0991, + "step": 28780 + }, + { + "epoch": 1.8835459600915931, + "grad_norm": 0.8320461511611938, + "learning_rate": 9.673181740810876e-05, + "loss": 0.0823, + "step": 28790 + }, + { + "epoch": 1.8842001962708537, + "grad_norm": 0.9948931336402893, + "learning_rate": 9.672855005230214e-05, + "loss": 0.0862, + "step": 28800 + }, + { + "epoch": 1.8848544324501146, + "grad_norm": 0.8380158543586731, + "learning_rate": 9.67252811192844e-05, + "loss": 0.0835, + "step": 28810 + }, + { + "epoch": 1.8855086686293752, + "grad_norm": 0.8905105590820312, + "learning_rate": 9.672201060916589e-05, + "loss": 0.1009, + "step": 28820 + }, + { + "epoch": 1.886162904808636, + "grad_norm": 0.8555837869644165, + "learning_rate": 9.671873852205699e-05, + "loss": 0.0983, + "step": 28830 + }, + { + "epoch": 1.8868171409878967, + "grad_norm": 0.7890472412109375, + "learning_rate": 9.671546485806813e-05, + "loss": 0.102, + "step": 28840 + }, + { + "epoch": 1.8874713771671572, + "grad_norm": 0.8302340507507324, + "learning_rate": 9.671218961730981e-05, + "loss": 0.1047, + "step": 28850 + }, + { + "epoch": 1.8881256133464182, + "grad_norm": 0.912526547908783, + "learning_rate": 9.670891279989261e-05, + "loss": 0.0886, + "step": 28860 + }, + { + "epoch": 1.8887798495256787, + "grad_norm": 0.9973543882369995, + "learning_rate": 9.670563440592709e-05, + "loss": 0.0974, + "step": 28870 + }, + { + "epoch": 1.8894340857049396, + "grad_norm": 0.8430191874504089, + "learning_rate": 9.670235443552391e-05, + "loss": 0.084, + "step": 28880 + }, + { + "epoch": 1.8900883218842002, + "grad_norm": 0.7398623824119568, + "learning_rate": 9.669907288879379e-05, + "loss": 0.0874, + "step": 28890 + }, + { + "epoch": 1.890742558063461, + "grad_norm": 0.8663063049316406, + "learning_rate": 9.669578976584748e-05, + "loss": 0.0852, + "step": 28900 + }, + { + "epoch": 1.8913967942427217, + "grad_norm": 0.8032066226005554, + "learning_rate": 9.669250506679582e-05, + "loss": 0.1002, + "step": 28910 + }, + { + "epoch": 1.8920510304219822, + "grad_norm": 0.7833549976348877, + "learning_rate": 9.668921879174965e-05, + "loss": 0.0991, + "step": 28920 + }, + { + "epoch": 1.8927052666012432, + "grad_norm": 0.8108584880828857, + "learning_rate": 9.66859309408199e-05, + "loss": 0.0876, + "step": 28930 + }, + { + "epoch": 1.8933595027805037, + "grad_norm": 0.6968421936035156, + "learning_rate": 9.668264151411755e-05, + "loss": 0.098, + "step": 28940 + }, + { + "epoch": 1.8940137389597644, + "grad_norm": 1.000785231590271, + "learning_rate": 9.66793505117536e-05, + "loss": 0.0879, + "step": 28950 + }, + { + "epoch": 1.8946679751390252, + "grad_norm": 0.9198935627937317, + "learning_rate": 9.667605793383916e-05, + "loss": 0.1009, + "step": 28960 + }, + { + "epoch": 1.895322211318286, + "grad_norm": 0.9679718613624573, + "learning_rate": 9.667276378048535e-05, + "loss": 0.0987, + "step": 28970 + }, + { + "epoch": 1.8959764474975467, + "grad_norm": 0.9554332494735718, + "learning_rate": 9.666946805180336e-05, + "loss": 0.101, + "step": 28980 + }, + { + "epoch": 1.8966306836768072, + "grad_norm": 0.9619019031524658, + "learning_rate": 9.666617074790442e-05, + "loss": 0.103, + "step": 28990 + }, + { + "epoch": 1.8972849198560682, + "grad_norm": 0.8431410789489746, + "learning_rate": 9.666287186889983e-05, + "loss": 0.097, + "step": 29000 + }, + { + "epoch": 1.8979391560353287, + "grad_norm": 0.9269760847091675, + "learning_rate": 9.665957141490096e-05, + "loss": 0.0859, + "step": 29010 + }, + { + "epoch": 1.8985933922145894, + "grad_norm": 0.9166405200958252, + "learning_rate": 9.665626938601917e-05, + "loss": 0.0887, + "step": 29020 + }, + { + "epoch": 1.8992476283938502, + "grad_norm": 0.973930835723877, + "learning_rate": 9.665296578236593e-05, + "loss": 0.0897, + "step": 29030 + }, + { + "epoch": 1.899901864573111, + "grad_norm": 0.9470407366752625, + "learning_rate": 9.664966060405275e-05, + "loss": 0.0925, + "step": 29040 + }, + { + "epoch": 1.9005561007523717, + "grad_norm": 0.762370228767395, + "learning_rate": 9.664635385119117e-05, + "loss": 0.0958, + "step": 29050 + }, + { + "epoch": 1.9012103369316322, + "grad_norm": 0.8602021932601929, + "learning_rate": 9.66430455238928e-05, + "loss": 0.091, + "step": 29060 + }, + { + "epoch": 1.9018645731108932, + "grad_norm": 0.8808605074882507, + "learning_rate": 9.663973562226934e-05, + "loss": 0.0837, + "step": 29070 + }, + { + "epoch": 1.9025188092901537, + "grad_norm": 0.7062970399856567, + "learning_rate": 9.663642414643248e-05, + "loss": 0.0916, + "step": 29080 + }, + { + "epoch": 1.9031730454694145, + "grad_norm": 0.7167387008666992, + "learning_rate": 9.6633111096494e-05, + "loss": 0.0866, + "step": 29090 + }, + { + "epoch": 1.9038272816486752, + "grad_norm": 0.91849684715271, + "learning_rate": 9.662979647256572e-05, + "loss": 0.0882, + "step": 29100 + }, + { + "epoch": 1.9044815178279357, + "grad_norm": 0.7310143709182739, + "learning_rate": 9.662648027475952e-05, + "loss": 0.0947, + "step": 29110 + }, + { + "epoch": 1.9051357540071967, + "grad_norm": 0.7814351916313171, + "learning_rate": 9.66231625031873e-05, + "loss": 0.102, + "step": 29120 + }, + { + "epoch": 1.9057899901864572, + "grad_norm": 1.0291553735733032, + "learning_rate": 9.661984315796111e-05, + "loss": 0.0924, + "step": 29130 + }, + { + "epoch": 1.906444226365718, + "grad_norm": 0.6880286335945129, + "learning_rate": 9.661652223919293e-05, + "loss": 0.0787, + "step": 29140 + }, + { + "epoch": 1.9070984625449787, + "grad_norm": 0.7895125150680542, + "learning_rate": 9.661319974699487e-05, + "loss": 0.0845, + "step": 29150 + }, + { + "epoch": 1.9077526987242395, + "grad_norm": 0.846792995929718, + "learning_rate": 9.660987568147907e-05, + "loss": 0.0884, + "step": 29160 + }, + { + "epoch": 1.9084069349035002, + "grad_norm": 1.0212069749832153, + "learning_rate": 9.660655004275772e-05, + "loss": 0.0912, + "step": 29170 + }, + { + "epoch": 1.9090611710827607, + "grad_norm": 0.8728976845741272, + "learning_rate": 9.660322283094309e-05, + "loss": 0.0873, + "step": 29180 + }, + { + "epoch": 1.9097154072620217, + "grad_norm": 0.9119880199432373, + "learning_rate": 9.659989404614746e-05, + "loss": 0.0921, + "step": 29190 + }, + { + "epoch": 1.9103696434412822, + "grad_norm": 0.7712875008583069, + "learning_rate": 9.65965636884832e-05, + "loss": 0.1135, + "step": 29200 + }, + { + "epoch": 1.911023879620543, + "grad_norm": 0.8132479786872864, + "learning_rate": 9.659323175806271e-05, + "loss": 0.0848, + "step": 29210 + }, + { + "epoch": 1.9116781157998037, + "grad_norm": 1.06707763671875, + "learning_rate": 9.658989825499845e-05, + "loss": 0.0922, + "step": 29220 + }, + { + "epoch": 1.9123323519790645, + "grad_norm": 0.9323616623878479, + "learning_rate": 9.658656317940293e-05, + "loss": 0.1036, + "step": 29230 + }, + { + "epoch": 1.9129865881583252, + "grad_norm": 0.8813486099243164, + "learning_rate": 9.658322653138873e-05, + "loss": 0.096, + "step": 29240 + }, + { + "epoch": 1.9136408243375858, + "grad_norm": 0.9012136459350586, + "learning_rate": 9.657988831106847e-05, + "loss": 0.0916, + "step": 29250 + }, + { + "epoch": 1.9142950605168467, + "grad_norm": 0.8024969696998596, + "learning_rate": 9.657654851855483e-05, + "loss": 0.0926, + "step": 29260 + }, + { + "epoch": 1.9149492966961073, + "grad_norm": 0.7480851411819458, + "learning_rate": 9.657320715396051e-05, + "loss": 0.0975, + "step": 29270 + }, + { + "epoch": 1.915603532875368, + "grad_norm": 0.8564775586128235, + "learning_rate": 9.65698642173983e-05, + "loss": 0.0857, + "step": 29280 + }, + { + "epoch": 1.9162577690546287, + "grad_norm": 0.79481440782547, + "learning_rate": 9.656651970898105e-05, + "loss": 0.0852, + "step": 29290 + }, + { + "epoch": 1.9169120052338893, + "grad_norm": 0.8546467423439026, + "learning_rate": 9.656317362882164e-05, + "loss": 0.0975, + "step": 29300 + }, + { + "epoch": 1.9175662414131502, + "grad_norm": 0.8550596833229065, + "learning_rate": 9.6559825977033e-05, + "loss": 0.0848, + "step": 29310 + }, + { + "epoch": 1.9182204775924108, + "grad_norm": 0.8139176368713379, + "learning_rate": 9.655647675372812e-05, + "loss": 0.0843, + "step": 29320 + }, + { + "epoch": 1.9188747137716717, + "grad_norm": 0.847602128982544, + "learning_rate": 9.655312595902004e-05, + "loss": 0.0865, + "step": 29330 + }, + { + "epoch": 1.9195289499509323, + "grad_norm": 0.7845019698143005, + "learning_rate": 9.654977359302189e-05, + "loss": 0.0959, + "step": 29340 + }, + { + "epoch": 1.920183186130193, + "grad_norm": 0.8975633382797241, + "learning_rate": 9.654641965584678e-05, + "loss": 0.0842, + "step": 29350 + }, + { + "epoch": 1.9208374223094538, + "grad_norm": 0.872686505317688, + "learning_rate": 9.654306414760796e-05, + "loss": 0.0976, + "step": 29360 + }, + { + "epoch": 1.9214916584887143, + "grad_norm": 1.081772804260254, + "learning_rate": 9.653970706841864e-05, + "loss": 0.0854, + "step": 29370 + }, + { + "epoch": 1.9221458946679753, + "grad_norm": 0.8015065789222717, + "learning_rate": 9.653634841839216e-05, + "loss": 0.0988, + "step": 29380 + }, + { + "epoch": 1.9228001308472358, + "grad_norm": 0.8011831045150757, + "learning_rate": 9.653298819764187e-05, + "loss": 0.0838, + "step": 29390 + }, + { + "epoch": 1.9234543670264965, + "grad_norm": 0.8473259210586548, + "learning_rate": 9.65296264062812e-05, + "loss": 0.0897, + "step": 29400 + }, + { + "epoch": 1.9241086032057573, + "grad_norm": 0.883358895778656, + "learning_rate": 9.652626304442361e-05, + "loss": 0.0868, + "step": 29410 + }, + { + "epoch": 1.924762839385018, + "grad_norm": 1.020606279373169, + "learning_rate": 9.652289811218261e-05, + "loss": 0.094, + "step": 29420 + }, + { + "epoch": 1.9254170755642788, + "grad_norm": 0.7326846718788147, + "learning_rate": 9.65195316096718e-05, + "loss": 0.0892, + "step": 29430 + }, + { + "epoch": 1.9260713117435393, + "grad_norm": 1.0169782638549805, + "learning_rate": 9.651616353700479e-05, + "loss": 0.088, + "step": 29440 + }, + { + "epoch": 1.9267255479228003, + "grad_norm": 0.9561719298362732, + "learning_rate": 9.651279389429526e-05, + "loss": 0.0906, + "step": 29450 + }, + { + "epoch": 1.9273797841020608, + "grad_norm": 0.8069345355033875, + "learning_rate": 9.650942268165698e-05, + "loss": 0.1029, + "step": 29460 + }, + { + "epoch": 1.9280340202813215, + "grad_norm": 1.1819461584091187, + "learning_rate": 9.65060498992037e-05, + "loss": 0.0911, + "step": 29470 + }, + { + "epoch": 1.9286882564605823, + "grad_norm": 0.7691984176635742, + "learning_rate": 9.650267554704924e-05, + "loss": 0.0923, + "step": 29480 + }, + { + "epoch": 1.929342492639843, + "grad_norm": 0.8642717599868774, + "learning_rate": 9.649929962530756e-05, + "loss": 0.0843, + "step": 29490 + }, + { + "epoch": 1.9299967288191038, + "grad_norm": 0.7647484540939331, + "learning_rate": 9.649592213409253e-05, + "loss": 0.0902, + "step": 29500 + }, + { + "epoch": 1.9306509649983643, + "grad_norm": 0.7451311349868774, + "learning_rate": 9.64925430735182e-05, + "loss": 0.0902, + "step": 29510 + }, + { + "epoch": 1.9313052011776253, + "grad_norm": 0.820818305015564, + "learning_rate": 9.648916244369863e-05, + "loss": 0.0895, + "step": 29520 + }, + { + "epoch": 1.9319594373568858, + "grad_norm": 1.151523232460022, + "learning_rate": 9.648578024474789e-05, + "loss": 0.092, + "step": 29530 + }, + { + "epoch": 1.9326136735361465, + "grad_norm": 0.9633611440658569, + "learning_rate": 9.648239647678017e-05, + "loss": 0.0947, + "step": 29540 + }, + { + "epoch": 1.9332679097154073, + "grad_norm": 0.7115769386291504, + "learning_rate": 9.647901113990964e-05, + "loss": 0.0909, + "step": 29550 + }, + { + "epoch": 1.9339221458946678, + "grad_norm": 0.7603625655174255, + "learning_rate": 9.647562423425061e-05, + "loss": 0.0846, + "step": 29560 + }, + { + "epoch": 1.9345763820739288, + "grad_norm": 1.0014585256576538, + "learning_rate": 9.647223575991735e-05, + "loss": 0.0954, + "step": 29570 + }, + { + "epoch": 1.9352306182531893, + "grad_norm": 0.6401631832122803, + "learning_rate": 9.646884571702428e-05, + "loss": 0.0848, + "step": 29580 + }, + { + "epoch": 1.93588485443245, + "grad_norm": 0.8242216110229492, + "learning_rate": 9.64654541056858e-05, + "loss": 0.0834, + "step": 29590 + }, + { + "epoch": 1.9365390906117108, + "grad_norm": 0.9129480719566345, + "learning_rate": 9.646206092601636e-05, + "loss": 0.0794, + "step": 29600 + }, + { + "epoch": 1.9371933267909716, + "grad_norm": 0.8380915522575378, + "learning_rate": 9.645866617813053e-05, + "loss": 0.1019, + "step": 29610 + }, + { + "epoch": 1.9378475629702323, + "grad_norm": 0.7691987156867981, + "learning_rate": 9.645526986214286e-05, + "loss": 0.0872, + "step": 29620 + }, + { + "epoch": 1.9385017991494928, + "grad_norm": 0.9295036792755127, + "learning_rate": 9.6451871978168e-05, + "loss": 0.0978, + "step": 29630 + }, + { + "epoch": 1.9391560353287538, + "grad_norm": 0.8664805889129639, + "learning_rate": 9.644847252632065e-05, + "loss": 0.0909, + "step": 29640 + }, + { + "epoch": 1.9398102715080143, + "grad_norm": 0.9215870499610901, + "learning_rate": 9.644507150671554e-05, + "loss": 0.0918, + "step": 29650 + }, + { + "epoch": 1.940464507687275, + "grad_norm": 0.8502547740936279, + "learning_rate": 9.644166891946745e-05, + "loss": 0.086, + "step": 29660 + }, + { + "epoch": 1.9411187438665358, + "grad_norm": 0.7667042016983032, + "learning_rate": 9.643826476469124e-05, + "loss": 0.0816, + "step": 29670 + }, + { + "epoch": 1.9417729800457966, + "grad_norm": 0.8172445297241211, + "learning_rate": 9.64348590425018e-05, + "loss": 0.0926, + "step": 29680 + }, + { + "epoch": 1.9424272162250573, + "grad_norm": 0.8571412563323975, + "learning_rate": 9.643145175301409e-05, + "loss": 0.0836, + "step": 29690 + }, + { + "epoch": 1.9430814524043178, + "grad_norm": 0.8760371804237366, + "learning_rate": 9.642804289634311e-05, + "loss": 0.0851, + "step": 29700 + }, + { + "epoch": 1.9437356885835788, + "grad_norm": 0.8582215905189514, + "learning_rate": 9.642463247260391e-05, + "loss": 0.085, + "step": 29710 + }, + { + "epoch": 1.9443899247628393, + "grad_norm": 0.915457010269165, + "learning_rate": 9.642122048191164e-05, + "loss": 0.091, + "step": 29720 + }, + { + "epoch": 1.9450441609421, + "grad_norm": 0.8546823859214783, + "learning_rate": 9.641780692438142e-05, + "loss": 0.0907, + "step": 29730 + }, + { + "epoch": 1.9456983971213608, + "grad_norm": 0.9210653305053711, + "learning_rate": 9.641439180012848e-05, + "loss": 0.0893, + "step": 29740 + }, + { + "epoch": 1.9463526333006214, + "grad_norm": 0.7805274128913879, + "learning_rate": 9.641097510926809e-05, + "loss": 0.0863, + "step": 29750 + }, + { + "epoch": 1.9470068694798823, + "grad_norm": 0.8311102390289307, + "learning_rate": 9.640755685191556e-05, + "loss": 0.083, + "step": 29760 + }, + { + "epoch": 1.9476611056591429, + "grad_norm": 0.9258164763450623, + "learning_rate": 9.640413702818629e-05, + "loss": 0.0849, + "step": 29770 + }, + { + "epoch": 1.9483153418384038, + "grad_norm": 0.8838931918144226, + "learning_rate": 9.64007156381957e-05, + "loss": 0.0955, + "step": 29780 + }, + { + "epoch": 1.9489695780176644, + "grad_norm": 0.9353017210960388, + "learning_rate": 9.639729268205926e-05, + "loss": 0.0864, + "step": 29790 + }, + { + "epoch": 1.949623814196925, + "grad_norm": 0.8417748808860779, + "learning_rate": 9.639386815989252e-05, + "loss": 0.104, + "step": 29800 + }, + { + "epoch": 1.9502780503761858, + "grad_norm": 0.8374495506286621, + "learning_rate": 9.639044207181105e-05, + "loss": 0.1021, + "step": 29810 + }, + { + "epoch": 1.9509322865554464, + "grad_norm": 0.8973211646080017, + "learning_rate": 9.63870144179305e-05, + "loss": 0.1022, + "step": 29820 + }, + { + "epoch": 1.9515865227347073, + "grad_norm": 0.9601650834083557, + "learning_rate": 9.638358519836656e-05, + "loss": 0.1001, + "step": 29830 + }, + { + "epoch": 1.9522407589139679, + "grad_norm": 0.9467968344688416, + "learning_rate": 9.638015441323496e-05, + "loss": 0.0787, + "step": 29840 + }, + { + "epoch": 1.9528949950932286, + "grad_norm": 1.110438585281372, + "learning_rate": 9.637672206265152e-05, + "loss": 0.1028, + "step": 29850 + }, + { + "epoch": 1.9535492312724894, + "grad_norm": 1.1199315786361694, + "learning_rate": 9.63732881467321e-05, + "loss": 0.0869, + "step": 29860 + }, + { + "epoch": 1.95420346745175, + "grad_norm": 0.796136736869812, + "learning_rate": 9.636985266559258e-05, + "loss": 0.0975, + "step": 29870 + }, + { + "epoch": 1.9548577036310109, + "grad_norm": 0.8099967241287231, + "learning_rate": 9.63664156193489e-05, + "loss": 0.0837, + "step": 29880 + }, + { + "epoch": 1.9555119398102714, + "grad_norm": 0.7563862204551697, + "learning_rate": 9.636297700811712e-05, + "loss": 0.0844, + "step": 29890 + }, + { + "epoch": 1.9561661759895324, + "grad_norm": 1.2173089981079102, + "learning_rate": 9.635953683201325e-05, + "loss": 0.0896, + "step": 29900 + }, + { + "epoch": 1.9568204121687929, + "grad_norm": 0.7833836078643799, + "learning_rate": 9.635609509115344e-05, + "loss": 0.0898, + "step": 29910 + }, + { + "epoch": 1.9574746483480536, + "grad_norm": 0.6414048075675964, + "learning_rate": 9.635265178565385e-05, + "loss": 0.0914, + "step": 29920 + }, + { + "epoch": 1.9581288845273144, + "grad_norm": 0.7374199032783508, + "learning_rate": 9.63492069156307e-05, + "loss": 0.0874, + "step": 29930 + }, + { + "epoch": 1.9587831207065751, + "grad_norm": 1.0497411489486694, + "learning_rate": 9.634576048120027e-05, + "loss": 0.086, + "step": 29940 + }, + { + "epoch": 1.9594373568858359, + "grad_norm": 0.7760803699493408, + "learning_rate": 9.634231248247886e-05, + "loss": 0.091, + "step": 29950 + }, + { + "epoch": 1.9600915930650964, + "grad_norm": 0.7324857711791992, + "learning_rate": 9.633886291958287e-05, + "loss": 0.0913, + "step": 29960 + }, + { + "epoch": 1.9607458292443574, + "grad_norm": 0.9544631838798523, + "learning_rate": 9.633541179262874e-05, + "loss": 0.0936, + "step": 29970 + }, + { + "epoch": 1.9614000654236179, + "grad_norm": 0.8895130157470703, + "learning_rate": 9.633195910173294e-05, + "loss": 0.0883, + "step": 29980 + }, + { + "epoch": 1.9620543016028786, + "grad_norm": 0.8787074089050293, + "learning_rate": 9.632850484701199e-05, + "loss": 0.0989, + "step": 29990 + }, + { + "epoch": 1.9627085377821394, + "grad_norm": 0.9097030162811279, + "learning_rate": 9.632504902858253e-05, + "loss": 0.0864, + "step": 30000 + }, + { + "epoch": 1.9633627739614, + "grad_norm": 0.9169159531593323, + "learning_rate": 9.632159164656114e-05, + "loss": 0.101, + "step": 30010 + }, + { + "epoch": 1.9640170101406609, + "grad_norm": 0.8284802436828613, + "learning_rate": 9.631813270106458e-05, + "loss": 0.0929, + "step": 30020 + }, + { + "epoch": 1.9646712463199214, + "grad_norm": 0.69024258852005, + "learning_rate": 9.631467219220955e-05, + "loss": 0.0962, + "step": 30030 + }, + { + "epoch": 1.9653254824991822, + "grad_norm": 0.8356816172599792, + "learning_rate": 9.631121012011288e-05, + "loss": 0.0838, + "step": 30040 + }, + { + "epoch": 1.965979718678443, + "grad_norm": 0.8797650933265686, + "learning_rate": 9.630774648489141e-05, + "loss": 0.0877, + "step": 30050 + }, + { + "epoch": 1.9666339548577036, + "grad_norm": 0.9771427512168884, + "learning_rate": 9.630428128666204e-05, + "loss": 0.1115, + "step": 30060 + }, + { + "epoch": 1.9672881910369644, + "grad_norm": 0.791826605796814, + "learning_rate": 9.630081452554174e-05, + "loss": 0.0912, + "step": 30070 + }, + { + "epoch": 1.967942427216225, + "grad_norm": 0.7139357924461365, + "learning_rate": 9.629734620164753e-05, + "loss": 0.084, + "step": 30080 + }, + { + "epoch": 1.968596663395486, + "grad_norm": 0.9226008653640747, + "learning_rate": 9.629387631509646e-05, + "loss": 0.0954, + "step": 30090 + }, + { + "epoch": 1.9692508995747464, + "grad_norm": 0.9457941055297852, + "learning_rate": 9.629040486600567e-05, + "loss": 0.0924, + "step": 30100 + }, + { + "epoch": 1.9699051357540072, + "grad_norm": 0.6717526912689209, + "learning_rate": 9.628693185449228e-05, + "loss": 0.0829, + "step": 30110 + }, + { + "epoch": 1.970559371933268, + "grad_norm": 0.8272339105606079, + "learning_rate": 9.628345728067359e-05, + "loss": 0.0924, + "step": 30120 + }, + { + "epoch": 1.9712136081125287, + "grad_norm": 0.8791738748550415, + "learning_rate": 9.62799811446668e-05, + "loss": 0.0819, + "step": 30130 + }, + { + "epoch": 1.9718678442917894, + "grad_norm": 0.899770200252533, + "learning_rate": 9.627650344658929e-05, + "loss": 0.0892, + "step": 30140 + }, + { + "epoch": 1.97252208047105, + "grad_norm": 0.8326700329780579, + "learning_rate": 9.627302418655844e-05, + "loss": 0.0937, + "step": 30150 + }, + { + "epoch": 1.973176316650311, + "grad_norm": 0.9431321024894714, + "learning_rate": 9.626954336469166e-05, + "loss": 0.0902, + "step": 30160 + }, + { + "epoch": 1.9738305528295714, + "grad_norm": 0.9222663044929504, + "learning_rate": 9.626606098110643e-05, + "loss": 0.0893, + "step": 30170 + }, + { + "epoch": 1.9744847890088322, + "grad_norm": 0.7812402248382568, + "learning_rate": 9.62625770359203e-05, + "loss": 0.0853, + "step": 30180 + }, + { + "epoch": 1.975139025188093, + "grad_norm": 0.8364319801330566, + "learning_rate": 9.625909152925088e-05, + "loss": 0.0936, + "step": 30190 + }, + { + "epoch": 1.9757932613673534, + "grad_norm": 1.1527427434921265, + "learning_rate": 9.62556044612158e-05, + "loss": 0.0859, + "step": 30200 + }, + { + "epoch": 1.9764474975466144, + "grad_norm": 0.8224945068359375, + "learning_rate": 9.625211583193275e-05, + "loss": 0.0813, + "step": 30210 + }, + { + "epoch": 1.977101733725875, + "grad_norm": 0.81529700756073, + "learning_rate": 9.62486256415195e-05, + "loss": 0.0915, + "step": 30220 + }, + { + "epoch": 1.977755969905136, + "grad_norm": 0.839607834815979, + "learning_rate": 9.624513389009385e-05, + "loss": 0.09, + "step": 30230 + }, + { + "epoch": 1.9784102060843964, + "grad_norm": 0.7841370105743408, + "learning_rate": 9.624164057777363e-05, + "loss": 0.0921, + "step": 30240 + }, + { + "epoch": 1.9790644422636572, + "grad_norm": 0.8300483226776123, + "learning_rate": 9.623814570467678e-05, + "loss": 0.083, + "step": 30250 + }, + { + "epoch": 1.979718678442918, + "grad_norm": 0.7854914665222168, + "learning_rate": 9.623464927092123e-05, + "loss": 0.0886, + "step": 30260 + }, + { + "epoch": 1.9803729146221785, + "grad_norm": 0.7365754842758179, + "learning_rate": 9.623115127662504e-05, + "loss": 0.0808, + "step": 30270 + }, + { + "epoch": 1.9810271508014394, + "grad_norm": 0.8063216209411621, + "learning_rate": 9.622765172190624e-05, + "loss": 0.0885, + "step": 30280 + }, + { + "epoch": 1.9816813869807, + "grad_norm": 0.9440072774887085, + "learning_rate": 9.622415060688294e-05, + "loss": 0.0916, + "step": 30290 + }, + { + "epoch": 1.9823356231599607, + "grad_norm": 0.9911891222000122, + "learning_rate": 9.622064793167336e-05, + "loss": 0.0785, + "step": 30300 + }, + { + "epoch": 1.9829898593392215, + "grad_norm": 0.8785775303840637, + "learning_rate": 9.621714369639567e-05, + "loss": 0.0886, + "step": 30310 + }, + { + "epoch": 1.9836440955184822, + "grad_norm": 0.9227697849273682, + "learning_rate": 9.621363790116819e-05, + "loss": 0.0839, + "step": 30320 + }, + { + "epoch": 1.984298331697743, + "grad_norm": 1.0479682683944702, + "learning_rate": 9.621013054610922e-05, + "loss": 0.0892, + "step": 30330 + }, + { + "epoch": 1.9849525678770035, + "grad_norm": 0.957587718963623, + "learning_rate": 9.620662163133715e-05, + "loss": 0.0773, + "step": 30340 + }, + { + "epoch": 1.9856068040562644, + "grad_norm": 1.071298360824585, + "learning_rate": 9.620311115697043e-05, + "loss": 0.0865, + "step": 30350 + }, + { + "epoch": 1.986261040235525, + "grad_norm": 0.8113812208175659, + "learning_rate": 9.619959912312752e-05, + "loss": 0.0953, + "step": 30360 + }, + { + "epoch": 1.9869152764147857, + "grad_norm": 0.7876493334770203, + "learning_rate": 9.6196085529927e-05, + "loss": 0.0948, + "step": 30370 + }, + { + "epoch": 1.9875695125940465, + "grad_norm": 0.9819501638412476, + "learning_rate": 9.619257037748742e-05, + "loss": 0.0929, + "step": 30380 + }, + { + "epoch": 1.9882237487733072, + "grad_norm": 0.8558057546615601, + "learning_rate": 9.618905366592745e-05, + "loss": 0.0884, + "step": 30390 + }, + { + "epoch": 1.988877984952568, + "grad_norm": 0.7599206566810608, + "learning_rate": 9.618553539536579e-05, + "loss": 0.0896, + "step": 30400 + }, + { + "epoch": 1.9895322211318285, + "grad_norm": 0.8661050200462341, + "learning_rate": 9.618201556592117e-05, + "loss": 0.0754, + "step": 30410 + }, + { + "epoch": 1.9901864573110895, + "grad_norm": 0.7166074514389038, + "learning_rate": 9.617849417771244e-05, + "loss": 0.086, + "step": 30420 + }, + { + "epoch": 1.99084069349035, + "grad_norm": 0.7684576511383057, + "learning_rate": 9.61749712308584e-05, + "loss": 0.0905, + "step": 30430 + }, + { + "epoch": 1.9914949296696107, + "grad_norm": 0.8405929803848267, + "learning_rate": 9.617144672547798e-05, + "loss": 0.09, + "step": 30440 + }, + { + "epoch": 1.9921491658488715, + "grad_norm": 0.9315941333770752, + "learning_rate": 9.616792066169013e-05, + "loss": 0.091, + "step": 30450 + }, + { + "epoch": 1.992803402028132, + "grad_norm": 0.7858991622924805, + "learning_rate": 9.616439303961391e-05, + "loss": 0.095, + "step": 30460 + }, + { + "epoch": 1.993457638207393, + "grad_norm": 0.6836791038513184, + "learning_rate": 9.616086385936833e-05, + "loss": 0.0867, + "step": 30470 + }, + { + "epoch": 1.9941118743866535, + "grad_norm": 0.8010273575782776, + "learning_rate": 9.615733312107255e-05, + "loss": 0.0946, + "step": 30480 + }, + { + "epoch": 1.9947661105659142, + "grad_norm": 0.9536259770393372, + "learning_rate": 9.615380082484571e-05, + "loss": 0.0926, + "step": 30490 + }, + { + "epoch": 1.995420346745175, + "grad_norm": 0.880687952041626, + "learning_rate": 9.615026697080707e-05, + "loss": 0.1006, + "step": 30500 + }, + { + "epoch": 1.9960745829244357, + "grad_norm": 0.9787200689315796, + "learning_rate": 9.614673155907587e-05, + "loss": 0.0929, + "step": 30510 + }, + { + "epoch": 1.9967288191036965, + "grad_norm": 0.906243085861206, + "learning_rate": 9.614319458977145e-05, + "loss": 0.0931, + "step": 30520 + }, + { + "epoch": 1.997383055282957, + "grad_norm": 0.7984228730201721, + "learning_rate": 9.613965606301321e-05, + "loss": 0.0822, + "step": 30530 + }, + { + "epoch": 1.998037291462218, + "grad_norm": 0.8581556081771851, + "learning_rate": 9.613611597892059e-05, + "loss": 0.0877, + "step": 30540 + }, + { + "epoch": 1.9986915276414785, + "grad_norm": 0.724999725818634, + "learning_rate": 9.613257433761303e-05, + "loss": 0.0897, + "step": 30550 + }, + { + "epoch": 1.9993457638207393, + "grad_norm": 0.6740709543228149, + "learning_rate": 9.612903113921011e-05, + "loss": 0.0895, + "step": 30560 + }, + { + "epoch": 2.0, + "grad_norm": 0.8931834101676941, + "learning_rate": 9.612548638383141e-05, + "loss": 0.0857, + "step": 30570 + }, + { + "epoch": 2.0006542361792605, + "grad_norm": 0.8923904299736023, + "learning_rate": 9.612194007159657e-05, + "loss": 0.0848, + "step": 30580 + }, + { + "epoch": 2.0013084723585215, + "grad_norm": 0.8426147699356079, + "learning_rate": 9.61183922026253e-05, + "loss": 0.0931, + "step": 30590 + }, + { + "epoch": 2.001962708537782, + "grad_norm": 0.7947473526000977, + "learning_rate": 9.611484277703733e-05, + "loss": 0.0989, + "step": 30600 + }, + { + "epoch": 2.002616944717043, + "grad_norm": 0.8306055665016174, + "learning_rate": 9.61112917949525e-05, + "loss": 0.0867, + "step": 30610 + }, + { + "epoch": 2.0032711808963035, + "grad_norm": 0.9573779702186584, + "learning_rate": 9.610773925649062e-05, + "loss": 0.0943, + "step": 30620 + }, + { + "epoch": 2.0039254170755645, + "grad_norm": 0.8894043564796448, + "learning_rate": 9.610418516177164e-05, + "loss": 0.0891, + "step": 30630 + }, + { + "epoch": 2.004579653254825, + "grad_norm": 0.8106326460838318, + "learning_rate": 9.610062951091547e-05, + "loss": 0.0892, + "step": 30640 + }, + { + "epoch": 2.0052338894340855, + "grad_norm": 0.9291239976882935, + "learning_rate": 9.609707230404217e-05, + "loss": 0.0835, + "step": 30650 + }, + { + "epoch": 2.0058881256133465, + "grad_norm": 0.7362892627716064, + "learning_rate": 9.609351354127178e-05, + "loss": 0.0838, + "step": 30660 + }, + { + "epoch": 2.006542361792607, + "grad_norm": 0.7407713532447815, + "learning_rate": 9.608995322272442e-05, + "loss": 0.0951, + "step": 30670 + }, + { + "epoch": 2.007196597971868, + "grad_norm": 0.9148522019386292, + "learning_rate": 9.608639134852028e-05, + "loss": 0.0895, + "step": 30680 + }, + { + "epoch": 2.0078508341511285, + "grad_norm": 0.7129114866256714, + "learning_rate": 9.608282791877955e-05, + "loss": 0.0866, + "step": 30690 + }, + { + "epoch": 2.008505070330389, + "grad_norm": 1.1984944343566895, + "learning_rate": 9.607926293362253e-05, + "loss": 0.0941, + "step": 30700 + }, + { + "epoch": 2.00915930650965, + "grad_norm": 0.7906424403190613, + "learning_rate": 9.607569639316953e-05, + "loss": 0.0811, + "step": 30710 + }, + { + "epoch": 2.0098135426889105, + "grad_norm": 0.9424049258232117, + "learning_rate": 9.607212829754094e-05, + "loss": 0.0872, + "step": 30720 + }, + { + "epoch": 2.0104677788681715, + "grad_norm": 0.9077602028846741, + "learning_rate": 9.60685586468572e-05, + "loss": 0.0946, + "step": 30730 + }, + { + "epoch": 2.011122015047432, + "grad_norm": 0.867131769657135, + "learning_rate": 9.606498744123877e-05, + "loss": 0.0903, + "step": 30740 + }, + { + "epoch": 2.011776251226693, + "grad_norm": 0.7888831496238708, + "learning_rate": 9.606141468080623e-05, + "loss": 0.0829, + "step": 30750 + }, + { + "epoch": 2.0124304874059535, + "grad_norm": 1.2040573358535767, + "learning_rate": 9.605784036568011e-05, + "loss": 0.1066, + "step": 30760 + }, + { + "epoch": 2.013084723585214, + "grad_norm": 0.864673376083374, + "learning_rate": 9.605426449598112e-05, + "loss": 0.0929, + "step": 30770 + }, + { + "epoch": 2.013738959764475, + "grad_norm": 0.7464171648025513, + "learning_rate": 9.60506870718299e-05, + "loss": 0.0877, + "step": 30780 + }, + { + "epoch": 2.0143931959437356, + "grad_norm": 0.8971266150474548, + "learning_rate": 9.604710809334723e-05, + "loss": 0.096, + "step": 30790 + }, + { + "epoch": 2.0150474321229965, + "grad_norm": 0.9102427959442139, + "learning_rate": 9.60435275606539e-05, + "loss": 0.0975, + "step": 30800 + }, + { + "epoch": 2.015701668302257, + "grad_norm": 0.8453085422515869, + "learning_rate": 9.603994547387074e-05, + "loss": 0.0989, + "step": 30810 + }, + { + "epoch": 2.016355904481518, + "grad_norm": 0.9234716892242432, + "learning_rate": 9.603636183311871e-05, + "loss": 0.089, + "step": 30820 + }, + { + "epoch": 2.0170101406607785, + "grad_norm": 0.8139061331748962, + "learning_rate": 9.60327766385187e-05, + "loss": 0.0854, + "step": 30830 + }, + { + "epoch": 2.017664376840039, + "grad_norm": 0.8283985257148743, + "learning_rate": 9.602918989019176e-05, + "loss": 0.0881, + "step": 30840 + }, + { + "epoch": 2.0183186130193, + "grad_norm": 0.780783474445343, + "learning_rate": 9.602560158825896e-05, + "loss": 0.0888, + "step": 30850 + }, + { + "epoch": 2.0189728491985606, + "grad_norm": 0.6656304001808167, + "learning_rate": 9.602201173284139e-05, + "loss": 0.0792, + "step": 30860 + }, + { + "epoch": 2.0196270853778215, + "grad_norm": 0.7495284080505371, + "learning_rate": 9.601842032406023e-05, + "loss": 0.0935, + "step": 30870 + }, + { + "epoch": 2.020281321557082, + "grad_norm": 0.730228066444397, + "learning_rate": 9.60148273620367e-05, + "loss": 0.083, + "step": 30880 + }, + { + "epoch": 2.020935557736343, + "grad_norm": 0.8227654099464417, + "learning_rate": 9.601123284689206e-05, + "loss": 0.0904, + "step": 30890 + }, + { + "epoch": 2.0215897939156036, + "grad_norm": 0.8423411250114441, + "learning_rate": 9.600763677874764e-05, + "loss": 0.0854, + "step": 30900 + }, + { + "epoch": 2.022244030094864, + "grad_norm": 0.9482555985450745, + "learning_rate": 9.600403915772484e-05, + "loss": 0.0973, + "step": 30910 + }, + { + "epoch": 2.022898266274125, + "grad_norm": 0.7677321434020996, + "learning_rate": 9.600043998394506e-05, + "loss": 0.0847, + "step": 30920 + }, + { + "epoch": 2.0235525024533856, + "grad_norm": 0.7692193984985352, + "learning_rate": 9.599683925752979e-05, + "loss": 0.1019, + "step": 30930 + }, + { + "epoch": 2.0242067386326466, + "grad_norm": 0.870507001876831, + "learning_rate": 9.599323697860055e-05, + "loss": 0.0897, + "step": 30940 + }, + { + "epoch": 2.024860974811907, + "grad_norm": 0.933172345161438, + "learning_rate": 9.598963314727894e-05, + "loss": 0.0922, + "step": 30950 + }, + { + "epoch": 2.0255152109911676, + "grad_norm": 0.8484349250793457, + "learning_rate": 9.598602776368661e-05, + "loss": 0.0927, + "step": 30960 + }, + { + "epoch": 2.0261694471704286, + "grad_norm": 0.7965954542160034, + "learning_rate": 9.598242082794524e-05, + "loss": 0.0983, + "step": 30970 + }, + { + "epoch": 2.026823683349689, + "grad_norm": 0.7531421184539795, + "learning_rate": 9.597881234017657e-05, + "loss": 0.0841, + "step": 30980 + }, + { + "epoch": 2.02747791952895, + "grad_norm": 0.9377996325492859, + "learning_rate": 9.597520230050242e-05, + "loss": 0.1034, + "step": 30990 + }, + { + "epoch": 2.0281321557082106, + "grad_norm": 0.8710869550704956, + "learning_rate": 9.597159070904458e-05, + "loss": 0.0923, + "step": 31000 + }, + { + "epoch": 2.0287863918874716, + "grad_norm": 0.9062069058418274, + "learning_rate": 9.596797756592502e-05, + "loss": 0.0931, + "step": 31010 + }, + { + "epoch": 2.029440628066732, + "grad_norm": 1.0289934873580933, + "learning_rate": 9.596436287126565e-05, + "loss": 0.0977, + "step": 31020 + }, + { + "epoch": 2.0300948642459926, + "grad_norm": 0.9156734943389893, + "learning_rate": 9.596074662518848e-05, + "loss": 0.0874, + "step": 31030 + }, + { + "epoch": 2.0307491004252536, + "grad_norm": 0.9219240546226501, + "learning_rate": 9.595712882781558e-05, + "loss": 0.099, + "step": 31040 + }, + { + "epoch": 2.031403336604514, + "grad_norm": 0.9066693186759949, + "learning_rate": 9.595350947926907e-05, + "loss": 0.09, + "step": 31050 + }, + { + "epoch": 2.032057572783775, + "grad_norm": 0.8575799465179443, + "learning_rate": 9.594988857967106e-05, + "loss": 0.0895, + "step": 31060 + }, + { + "epoch": 2.0327118089630356, + "grad_norm": 0.977655291557312, + "learning_rate": 9.594626612914383e-05, + "loss": 0.0949, + "step": 31070 + }, + { + "epoch": 2.0333660451422966, + "grad_norm": 0.8753423094749451, + "learning_rate": 9.594264212780962e-05, + "loss": 0.0801, + "step": 31080 + }, + { + "epoch": 2.034020281321557, + "grad_norm": 0.7495356798171997, + "learning_rate": 9.593901657579075e-05, + "loss": 0.096, + "step": 31090 + }, + { + "epoch": 2.0346745175008176, + "grad_norm": 0.8719765543937683, + "learning_rate": 9.593538947320959e-05, + "loss": 0.0975, + "step": 31100 + }, + { + "epoch": 2.0353287536800786, + "grad_norm": 0.7684383988380432, + "learning_rate": 9.593176082018855e-05, + "loss": 0.0857, + "step": 31110 + }, + { + "epoch": 2.035982989859339, + "grad_norm": 1.1163846254348755, + "learning_rate": 9.592813061685015e-05, + "loss": 0.1057, + "step": 31120 + }, + { + "epoch": 2.0366372260386, + "grad_norm": 0.8532420992851257, + "learning_rate": 9.592449886331687e-05, + "loss": 0.0969, + "step": 31130 + }, + { + "epoch": 2.0372914622178606, + "grad_norm": 0.8979114890098572, + "learning_rate": 9.592086555971131e-05, + "loss": 0.0875, + "step": 31140 + }, + { + "epoch": 2.037945698397121, + "grad_norm": 0.8417396545410156, + "learning_rate": 9.591723070615612e-05, + "loss": 0.0906, + "step": 31150 + }, + { + "epoch": 2.038599934576382, + "grad_norm": 1.090818166732788, + "learning_rate": 9.591359430277396e-05, + "loss": 0.0861, + "step": 31160 + }, + { + "epoch": 2.0392541707556426, + "grad_norm": 0.6975811123847961, + "learning_rate": 9.590995634968759e-05, + "loss": 0.0902, + "step": 31170 + }, + { + "epoch": 2.0399084069349036, + "grad_norm": 0.996051549911499, + "learning_rate": 9.590631684701979e-05, + "loss": 0.0854, + "step": 31180 + }, + { + "epoch": 2.040562643114164, + "grad_norm": 0.9619652628898621, + "learning_rate": 9.590267579489338e-05, + "loss": 0.0975, + "step": 31190 + }, + { + "epoch": 2.041216879293425, + "grad_norm": 0.8321636319160461, + "learning_rate": 9.589903319343129e-05, + "loss": 0.0883, + "step": 31200 + }, + { + "epoch": 2.0418711154726856, + "grad_norm": 0.8303142189979553, + "learning_rate": 9.589538904275645e-05, + "loss": 0.086, + "step": 31210 + }, + { + "epoch": 2.042525351651946, + "grad_norm": 0.7571492195129395, + "learning_rate": 9.589174334299189e-05, + "loss": 0.0857, + "step": 31220 + }, + { + "epoch": 2.043179587831207, + "grad_norm": 0.6824153065681458, + "learning_rate": 9.588809609426061e-05, + "loss": 0.0876, + "step": 31230 + }, + { + "epoch": 2.0438338240104676, + "grad_norm": 0.907193660736084, + "learning_rate": 9.588444729668575e-05, + "loss": 0.0847, + "step": 31240 + }, + { + "epoch": 2.0444880601897286, + "grad_norm": 0.9690275192260742, + "learning_rate": 9.588079695039047e-05, + "loss": 0.096, + "step": 31250 + }, + { + "epoch": 2.045142296368989, + "grad_norm": 0.8058456182479858, + "learning_rate": 9.587714505549796e-05, + "loss": 0.0905, + "step": 31260 + }, + { + "epoch": 2.04579653254825, + "grad_norm": 0.9792216420173645, + "learning_rate": 9.587349161213148e-05, + "loss": 0.0956, + "step": 31270 + }, + { + "epoch": 2.0464507687275106, + "grad_norm": 0.8495068550109863, + "learning_rate": 9.586983662041434e-05, + "loss": 0.0937, + "step": 31280 + }, + { + "epoch": 2.047105004906771, + "grad_norm": 0.9974722862243652, + "learning_rate": 9.586618008046992e-05, + "loss": 0.086, + "step": 31290 + }, + { + "epoch": 2.047759241086032, + "grad_norm": 0.8604632616043091, + "learning_rate": 9.586252199242166e-05, + "loss": 0.0877, + "step": 31300 + }, + { + "epoch": 2.0484134772652927, + "grad_norm": 0.803986668586731, + "learning_rate": 9.585886235639299e-05, + "loss": 0.0971, + "step": 31310 + }, + { + "epoch": 2.0490677134445536, + "grad_norm": 0.6995072960853577, + "learning_rate": 9.585520117250744e-05, + "loss": 0.0965, + "step": 31320 + }, + { + "epoch": 2.049721949623814, + "grad_norm": 0.9633769989013672, + "learning_rate": 9.585153844088858e-05, + "loss": 0.0906, + "step": 31330 + }, + { + "epoch": 2.050376185803075, + "grad_norm": 0.7706478238105774, + "learning_rate": 9.584787416166006e-05, + "loss": 0.0924, + "step": 31340 + }, + { + "epoch": 2.0510304219823356, + "grad_norm": 0.7993813753128052, + "learning_rate": 9.584420833494555e-05, + "loss": 0.0945, + "step": 31350 + }, + { + "epoch": 2.051684658161596, + "grad_norm": 0.7742961645126343, + "learning_rate": 9.584054096086877e-05, + "loss": 0.0973, + "step": 31360 + }, + { + "epoch": 2.052338894340857, + "grad_norm": 0.7687771320343018, + "learning_rate": 9.583687203955352e-05, + "loss": 0.0907, + "step": 31370 + }, + { + "epoch": 2.0529931305201177, + "grad_norm": 0.7839317917823792, + "learning_rate": 9.583320157112362e-05, + "loss": 0.0888, + "step": 31380 + }, + { + "epoch": 2.0536473666993786, + "grad_norm": 0.891391396522522, + "learning_rate": 9.582952955570297e-05, + "loss": 0.0929, + "step": 31390 + }, + { + "epoch": 2.054301602878639, + "grad_norm": 0.699019730091095, + "learning_rate": 9.58258559934155e-05, + "loss": 0.0879, + "step": 31400 + }, + { + "epoch": 2.0549558390578997, + "grad_norm": 0.765687108039856, + "learning_rate": 9.582218088438522e-05, + "loss": 0.0901, + "step": 31410 + }, + { + "epoch": 2.0556100752371607, + "grad_norm": 0.7088882327079773, + "learning_rate": 9.581850422873615e-05, + "loss": 0.0878, + "step": 31420 + }, + { + "epoch": 2.056264311416421, + "grad_norm": 0.9518930315971375, + "learning_rate": 9.58148260265924e-05, + "loss": 0.0831, + "step": 31430 + }, + { + "epoch": 2.056918547595682, + "grad_norm": 0.6729696393013, + "learning_rate": 9.581114627807812e-05, + "loss": 0.0931, + "step": 31440 + }, + { + "epoch": 2.0575727837749427, + "grad_norm": 0.8468174338340759, + "learning_rate": 9.58074649833175e-05, + "loss": 0.0958, + "step": 31450 + }, + { + "epoch": 2.0582270199542037, + "grad_norm": 0.8482375144958496, + "learning_rate": 9.580378214243482e-05, + "loss": 0.09, + "step": 31460 + }, + { + "epoch": 2.058881256133464, + "grad_norm": 0.7801223397254944, + "learning_rate": 9.580009775555435e-05, + "loss": 0.0862, + "step": 31470 + }, + { + "epoch": 2.0595354923127247, + "grad_norm": 0.7654222249984741, + "learning_rate": 9.579641182280049e-05, + "loss": 0.0897, + "step": 31480 + }, + { + "epoch": 2.0601897284919857, + "grad_norm": 0.9105095267295837, + "learning_rate": 9.57927243442976e-05, + "loss": 0.0818, + "step": 31490 + }, + { + "epoch": 2.060843964671246, + "grad_norm": 0.8952615261077881, + "learning_rate": 9.578903532017017e-05, + "loss": 0.0898, + "step": 31500 + }, + { + "epoch": 2.061498200850507, + "grad_norm": 0.9962287545204163, + "learning_rate": 9.578534475054272e-05, + "loss": 0.0912, + "step": 31510 + }, + { + "epoch": 2.0621524370297677, + "grad_norm": 0.9723460674285889, + "learning_rate": 9.578165263553982e-05, + "loss": 0.0816, + "step": 31520 + }, + { + "epoch": 2.0628066732090287, + "grad_norm": 0.8590718507766724, + "learning_rate": 9.577795897528605e-05, + "loss": 0.0866, + "step": 31530 + }, + { + "epoch": 2.063460909388289, + "grad_norm": 0.8612424731254578, + "learning_rate": 9.577426376990613e-05, + "loss": 0.0951, + "step": 31540 + }, + { + "epoch": 2.0641151455675497, + "grad_norm": 0.8139830231666565, + "learning_rate": 9.577056701952474e-05, + "loss": 0.0836, + "step": 31550 + }, + { + "epoch": 2.0647693817468107, + "grad_norm": 0.7661455869674683, + "learning_rate": 9.57668687242667e-05, + "loss": 0.0822, + "step": 31560 + }, + { + "epoch": 2.065423617926071, + "grad_norm": 0.7846705317497253, + "learning_rate": 9.576316888425681e-05, + "loss": 0.089, + "step": 31570 + }, + { + "epoch": 2.066077854105332, + "grad_norm": 0.9100658893585205, + "learning_rate": 9.575946749961992e-05, + "loss": 0.078, + "step": 31580 + }, + { + "epoch": 2.0667320902845927, + "grad_norm": 0.8876975178718567, + "learning_rate": 9.575576457048102e-05, + "loss": 0.0923, + "step": 31590 + }, + { + "epoch": 2.0673863264638532, + "grad_norm": 0.9435741901397705, + "learning_rate": 9.575206009696507e-05, + "loss": 0.0967, + "step": 31600 + }, + { + "epoch": 2.068040562643114, + "grad_norm": 0.7191019058227539, + "learning_rate": 9.574835407919709e-05, + "loss": 0.0965, + "step": 31610 + }, + { + "epoch": 2.0686947988223747, + "grad_norm": 0.7950155735015869, + "learning_rate": 9.574464651730219e-05, + "loss": 0.0873, + "step": 31620 + }, + { + "epoch": 2.0693490350016357, + "grad_norm": 0.7821862697601318, + "learning_rate": 9.574093741140549e-05, + "loss": 0.0901, + "step": 31630 + }, + { + "epoch": 2.070003271180896, + "grad_norm": 0.7659751176834106, + "learning_rate": 9.57372267616322e-05, + "loss": 0.0866, + "step": 31640 + }, + { + "epoch": 2.070657507360157, + "grad_norm": 0.8592097759246826, + "learning_rate": 9.573351456810755e-05, + "loss": 0.0871, + "step": 31650 + }, + { + "epoch": 2.0713117435394177, + "grad_norm": 0.9904183149337769, + "learning_rate": 9.572980083095684e-05, + "loss": 0.0931, + "step": 31660 + }, + { + "epoch": 2.0719659797186782, + "grad_norm": 0.8140203952789307, + "learning_rate": 9.572608555030543e-05, + "loss": 0.0921, + "step": 31670 + }, + { + "epoch": 2.072620215897939, + "grad_norm": 1.019144058227539, + "learning_rate": 9.57223687262787e-05, + "loss": 0.0873, + "step": 31680 + }, + { + "epoch": 2.0732744520771997, + "grad_norm": 0.8662041425704956, + "learning_rate": 9.571865035900213e-05, + "loss": 0.0883, + "step": 31690 + }, + { + "epoch": 2.0739286882564607, + "grad_norm": 0.7000994682312012, + "learning_rate": 9.571493044860121e-05, + "loss": 0.0819, + "step": 31700 + }, + { + "epoch": 2.0745829244357212, + "grad_norm": 0.9109962582588196, + "learning_rate": 9.571120899520148e-05, + "loss": 0.0957, + "step": 31710 + }, + { + "epoch": 2.075237160614982, + "grad_norm": 0.7647160887718201, + "learning_rate": 9.570748599892858e-05, + "loss": 0.0833, + "step": 31720 + }, + { + "epoch": 2.0758913967942427, + "grad_norm": 0.8084906339645386, + "learning_rate": 9.570376145990814e-05, + "loss": 0.086, + "step": 31730 + }, + { + "epoch": 2.0765456329735033, + "grad_norm": 0.9867994785308838, + "learning_rate": 9.57000353782659e-05, + "loss": 0.0799, + "step": 31740 + }, + { + "epoch": 2.0771998691527642, + "grad_norm": 0.9291912317276001, + "learning_rate": 9.569630775412762e-05, + "loss": 0.0904, + "step": 31750 + }, + { + "epoch": 2.0778541053320247, + "grad_norm": 0.9430752992630005, + "learning_rate": 9.569257858761909e-05, + "loss": 0.0868, + "step": 31760 + }, + { + "epoch": 2.0785083415112857, + "grad_norm": 0.7106003165245056, + "learning_rate": 9.568884787886621e-05, + "loss": 0.0909, + "step": 31770 + }, + { + "epoch": 2.0791625776905462, + "grad_norm": 1.0632734298706055, + "learning_rate": 9.56851156279949e-05, + "loss": 0.0883, + "step": 31780 + }, + { + "epoch": 2.079816813869807, + "grad_norm": 0.8719366192817688, + "learning_rate": 9.568138183513111e-05, + "loss": 0.0842, + "step": 31790 + }, + { + "epoch": 2.0804710500490677, + "grad_norm": 0.806563675403595, + "learning_rate": 9.567764650040087e-05, + "loss": 0.0993, + "step": 31800 + }, + { + "epoch": 2.0811252862283283, + "grad_norm": 0.8879052996635437, + "learning_rate": 9.567390962393029e-05, + "loss": 0.0953, + "step": 31810 + }, + { + "epoch": 2.0817795224075892, + "grad_norm": 0.8709869384765625, + "learning_rate": 9.567017120584545e-05, + "loss": 0.087, + "step": 31820 + }, + { + "epoch": 2.0824337585868498, + "grad_norm": 0.7416978478431702, + "learning_rate": 9.566643124627258e-05, + "loss": 0.0861, + "step": 31830 + }, + { + "epoch": 2.0830879947661107, + "grad_norm": 0.8955287337303162, + "learning_rate": 9.566268974533789e-05, + "loss": 0.0933, + "step": 31840 + }, + { + "epoch": 2.0837422309453713, + "grad_norm": 0.8596556186676025, + "learning_rate": 9.565894670316767e-05, + "loss": 0.0911, + "step": 31850 + }, + { + "epoch": 2.084396467124632, + "grad_norm": 0.8234604597091675, + "learning_rate": 9.565520211988823e-05, + "loss": 0.0864, + "step": 31860 + }, + { + "epoch": 2.0850507033038927, + "grad_norm": 0.862963855266571, + "learning_rate": 9.5651455995626e-05, + "loss": 0.0942, + "step": 31870 + }, + { + "epoch": 2.0857049394831533, + "grad_norm": 0.8025641441345215, + "learning_rate": 9.56477083305074e-05, + "loss": 0.0871, + "step": 31880 + }, + { + "epoch": 2.0863591756624142, + "grad_norm": 0.7878461480140686, + "learning_rate": 9.564395912465893e-05, + "loss": 0.0889, + "step": 31890 + }, + { + "epoch": 2.0870134118416748, + "grad_norm": 0.8542391061782837, + "learning_rate": 9.564020837820713e-05, + "loss": 0.0936, + "step": 31900 + }, + { + "epoch": 2.0876676480209357, + "grad_norm": 0.7547128796577454, + "learning_rate": 9.56364560912786e-05, + "loss": 0.0922, + "step": 31910 + }, + { + "epoch": 2.0883218842001963, + "grad_norm": 0.7220647931098938, + "learning_rate": 9.563270226400001e-05, + "loss": 0.0832, + "step": 31920 + }, + { + "epoch": 2.088976120379457, + "grad_norm": 0.9882017374038696, + "learning_rate": 9.562894689649802e-05, + "loss": 0.0984, + "step": 31930 + }, + { + "epoch": 2.0896303565587178, + "grad_norm": 0.949703574180603, + "learning_rate": 9.562518998889942e-05, + "loss": 0.0903, + "step": 31940 + }, + { + "epoch": 2.0902845927379783, + "grad_norm": 1.0962672233581543, + "learning_rate": 9.562143154133099e-05, + "loss": 0.0848, + "step": 31950 + }, + { + "epoch": 2.0909388289172393, + "grad_norm": 0.9452337622642517, + "learning_rate": 9.561767155391961e-05, + "loss": 0.0901, + "step": 31960 + }, + { + "epoch": 2.0915930650965, + "grad_norm": 0.9834058880805969, + "learning_rate": 9.561391002679217e-05, + "loss": 0.0849, + "step": 31970 + }, + { + "epoch": 2.0922473012757608, + "grad_norm": 0.8670421838760376, + "learning_rate": 9.561014696007565e-05, + "loss": 0.0926, + "step": 31980 + }, + { + "epoch": 2.0929015374550213, + "grad_norm": 0.8581560254096985, + "learning_rate": 9.560638235389704e-05, + "loss": 0.0923, + "step": 31990 + }, + { + "epoch": 2.093555773634282, + "grad_norm": 0.9156831502914429, + "learning_rate": 9.560261620838342e-05, + "loss": 0.0951, + "step": 32000 + }, + { + "epoch": 2.0942100098135428, + "grad_norm": 0.6964516639709473, + "learning_rate": 9.559884852366191e-05, + "loss": 0.0958, + "step": 32010 + }, + { + "epoch": 2.0948642459928033, + "grad_norm": 0.859331488609314, + "learning_rate": 9.559507929985968e-05, + "loss": 0.0784, + "step": 32020 + }, + { + "epoch": 2.0955184821720643, + "grad_norm": 0.8123635649681091, + "learning_rate": 9.559130853710395e-05, + "loss": 0.0899, + "step": 32030 + }, + { + "epoch": 2.096172718351325, + "grad_norm": 0.7182052731513977, + "learning_rate": 9.558753623552197e-05, + "loss": 0.0874, + "step": 32040 + }, + { + "epoch": 2.0968269545305853, + "grad_norm": 0.8559288382530212, + "learning_rate": 9.558376239524109e-05, + "loss": 0.0788, + "step": 32050 + }, + { + "epoch": 2.0974811907098463, + "grad_norm": 0.8461546301841736, + "learning_rate": 9.557998701638868e-05, + "loss": 0.0858, + "step": 32060 + }, + { + "epoch": 2.098135426889107, + "grad_norm": 0.8831964135169983, + "learning_rate": 9.557621009909218e-05, + "loss": 0.1007, + "step": 32070 + }, + { + "epoch": 2.098789663068368, + "grad_norm": 0.8673757910728455, + "learning_rate": 9.557243164347907e-05, + "loss": 0.0876, + "step": 32080 + }, + { + "epoch": 2.0994438992476283, + "grad_norm": 0.9492565393447876, + "learning_rate": 9.556865164967685e-05, + "loss": 0.0919, + "step": 32090 + }, + { + "epoch": 2.1000981354268893, + "grad_norm": 0.8999154567718506, + "learning_rate": 9.556487011781314e-05, + "loss": 0.087, + "step": 32100 + }, + { + "epoch": 2.10075237160615, + "grad_norm": 0.7991196513175964, + "learning_rate": 9.556108704801558e-05, + "loss": 0.0878, + "step": 32110 + }, + { + "epoch": 2.1014066077854103, + "grad_norm": 0.8117789030075073, + "learning_rate": 9.555730244041182e-05, + "loss": 0.0961, + "step": 32120 + }, + { + "epoch": 2.1020608439646713, + "grad_norm": 0.8746902346611023, + "learning_rate": 9.555351629512963e-05, + "loss": 0.0885, + "step": 32130 + }, + { + "epoch": 2.102715080143932, + "grad_norm": 1.0785053968429565, + "learning_rate": 9.55497286122968e-05, + "loss": 0.0928, + "step": 32140 + }, + { + "epoch": 2.103369316323193, + "grad_norm": 0.9100284576416016, + "learning_rate": 9.554593939204117e-05, + "loss": 0.0856, + "step": 32150 + }, + { + "epoch": 2.1040235525024533, + "grad_norm": 0.7973883748054504, + "learning_rate": 9.554214863449065e-05, + "loss": 0.0826, + "step": 32160 + }, + { + "epoch": 2.1046777886817143, + "grad_norm": 0.9235144257545471, + "learning_rate": 9.553835633977316e-05, + "loss": 0.0876, + "step": 32170 + }, + { + "epoch": 2.105332024860975, + "grad_norm": 0.7327439785003662, + "learning_rate": 9.553456250801671e-05, + "loss": 0.0962, + "step": 32180 + }, + { + "epoch": 2.1059862610402353, + "grad_norm": 0.7771127223968506, + "learning_rate": 9.553076713934936e-05, + "loss": 0.0884, + "step": 32190 + }, + { + "epoch": 2.1066404972194963, + "grad_norm": 0.8690281510353088, + "learning_rate": 9.552697023389922e-05, + "loss": 0.0966, + "step": 32200 + }, + { + "epoch": 2.107294733398757, + "grad_norm": 0.9444182515144348, + "learning_rate": 9.552317179179444e-05, + "loss": 0.1007, + "step": 32210 + }, + { + "epoch": 2.107948969578018, + "grad_norm": 0.7808245420455933, + "learning_rate": 9.551937181316322e-05, + "loss": 0.0893, + "step": 32220 + }, + { + "epoch": 2.1086032057572783, + "grad_norm": 0.830314576625824, + "learning_rate": 9.55155702981338e-05, + "loss": 0.104, + "step": 32230 + }, + { + "epoch": 2.1092574419365393, + "grad_norm": 0.7720317244529724, + "learning_rate": 9.551176724683453e-05, + "loss": 0.0856, + "step": 32240 + }, + { + "epoch": 2.1099116781158, + "grad_norm": 0.8897857666015625, + "learning_rate": 9.550796265939377e-05, + "loss": 0.0877, + "step": 32250 + }, + { + "epoch": 2.1105659142950604, + "grad_norm": 0.8322727084159851, + "learning_rate": 9.550415653593989e-05, + "loss": 0.0817, + "step": 32260 + }, + { + "epoch": 2.1112201504743213, + "grad_norm": 0.8915346264839172, + "learning_rate": 9.550034887660143e-05, + "loss": 0.0801, + "step": 32270 + }, + { + "epoch": 2.111874386653582, + "grad_norm": 0.859603762626648, + "learning_rate": 9.549653968150682e-05, + "loss": 0.0939, + "step": 32280 + }, + { + "epoch": 2.112528622832843, + "grad_norm": 0.7257876396179199, + "learning_rate": 9.54927289507847e-05, + "loss": 0.0931, + "step": 32290 + }, + { + "epoch": 2.1131828590121033, + "grad_norm": 1.2223953008651733, + "learning_rate": 9.548891668456367e-05, + "loss": 0.1026, + "step": 32300 + }, + { + "epoch": 2.113837095191364, + "grad_norm": 0.8096550107002258, + "learning_rate": 9.54851028829724e-05, + "loss": 0.0877, + "step": 32310 + }, + { + "epoch": 2.114491331370625, + "grad_norm": 0.9068686366081238, + "learning_rate": 9.548128754613963e-05, + "loss": 0.1137, + "step": 32320 + }, + { + "epoch": 2.1151455675498854, + "grad_norm": 0.8379555940628052, + "learning_rate": 9.54774706741941e-05, + "loss": 0.0789, + "step": 32330 + }, + { + "epoch": 2.1157998037291463, + "grad_norm": 1.188866376876831, + "learning_rate": 9.547365226726468e-05, + "loss": 0.0919, + "step": 32340 + }, + { + "epoch": 2.116454039908407, + "grad_norm": 0.776324450969696, + "learning_rate": 9.546983232548023e-05, + "loss": 0.0906, + "step": 32350 + }, + { + "epoch": 2.117108276087668, + "grad_norm": 0.7669685482978821, + "learning_rate": 9.546601084896971e-05, + "loss": 0.0905, + "step": 32360 + }, + { + "epoch": 2.1177625122669284, + "grad_norm": 0.8013961911201477, + "learning_rate": 9.546218783786207e-05, + "loss": 0.091, + "step": 32370 + }, + { + "epoch": 2.118416748446189, + "grad_norm": 0.8074659705162048, + "learning_rate": 9.545836329228637e-05, + "loss": 0.0855, + "step": 32380 + }, + { + "epoch": 2.11907098462545, + "grad_norm": 0.8233710527420044, + "learning_rate": 9.545453721237167e-05, + "loss": 0.0879, + "step": 32390 + }, + { + "epoch": 2.1197252208047104, + "grad_norm": 0.8255857229232788, + "learning_rate": 9.545070959824716e-05, + "loss": 0.0876, + "step": 32400 + }, + { + "epoch": 2.1203794569839713, + "grad_norm": 0.8304885625839233, + "learning_rate": 9.544688045004197e-05, + "loss": 0.0903, + "step": 32410 + }, + { + "epoch": 2.121033693163232, + "grad_norm": 0.9255449175834656, + "learning_rate": 9.544304976788541e-05, + "loss": 0.0913, + "step": 32420 + }, + { + "epoch": 2.121687929342493, + "grad_norm": 0.8845074772834778, + "learning_rate": 9.543921755190671e-05, + "loss": 0.0849, + "step": 32430 + }, + { + "epoch": 2.1223421655217534, + "grad_norm": 0.9310880303382874, + "learning_rate": 9.543538380223527e-05, + "loss": 0.0988, + "step": 32440 + }, + { + "epoch": 2.122996401701014, + "grad_norm": 0.953241765499115, + "learning_rate": 9.543154851900045e-05, + "loss": 0.0909, + "step": 32450 + }, + { + "epoch": 2.123650637880275, + "grad_norm": 0.8089374303817749, + "learning_rate": 9.542771170233173e-05, + "loss": 0.0877, + "step": 32460 + }, + { + "epoch": 2.1243048740595354, + "grad_norm": 0.8584646582603455, + "learning_rate": 9.542387335235861e-05, + "loss": 0.0873, + "step": 32470 + }, + { + "epoch": 2.1249591102387964, + "grad_norm": 0.7349388599395752, + "learning_rate": 9.542003346921063e-05, + "loss": 0.0909, + "step": 32480 + }, + { + "epoch": 2.125613346418057, + "grad_norm": 0.9695256352424622, + "learning_rate": 9.541619205301739e-05, + "loss": 0.102, + "step": 32490 + }, + { + "epoch": 2.1262675825973174, + "grad_norm": 0.6882131695747375, + "learning_rate": 9.541234910390857e-05, + "loss": 0.0823, + "step": 32500 + }, + { + "epoch": 2.1269218187765784, + "grad_norm": 0.8628367185592651, + "learning_rate": 9.540850462201387e-05, + "loss": 0.0998, + "step": 32510 + }, + { + "epoch": 2.127576054955839, + "grad_norm": 0.7853913903236389, + "learning_rate": 9.540465860746305e-05, + "loss": 0.0905, + "step": 32520 + }, + { + "epoch": 2.1282302911351, + "grad_norm": 0.9766311049461365, + "learning_rate": 9.540081106038591e-05, + "loss": 0.0928, + "step": 32530 + }, + { + "epoch": 2.1288845273143604, + "grad_norm": 0.7391847372055054, + "learning_rate": 9.539696198091235e-05, + "loss": 0.084, + "step": 32540 + }, + { + "epoch": 2.1295387634936214, + "grad_norm": 0.9374977946281433, + "learning_rate": 9.539311136917227e-05, + "loss": 0.0979, + "step": 32550 + }, + { + "epoch": 2.130192999672882, + "grad_norm": 0.8064951300621033, + "learning_rate": 9.53892592252956e-05, + "loss": 0.0788, + "step": 32560 + }, + { + "epoch": 2.1308472358521424, + "grad_norm": 0.8344107866287231, + "learning_rate": 9.538540554941242e-05, + "loss": 0.0878, + "step": 32570 + }, + { + "epoch": 2.1315014720314034, + "grad_norm": 0.8620904684066772, + "learning_rate": 9.538155034165277e-05, + "loss": 0.0801, + "step": 32580 + }, + { + "epoch": 2.132155708210664, + "grad_norm": 0.764388918876648, + "learning_rate": 9.537769360214678e-05, + "loss": 0.0829, + "step": 32590 + }, + { + "epoch": 2.132809944389925, + "grad_norm": 0.7205486297607422, + "learning_rate": 9.537383533102462e-05, + "loss": 0.0815, + "step": 32600 + }, + { + "epoch": 2.1334641805691854, + "grad_norm": 0.7179217338562012, + "learning_rate": 9.536997552841653e-05, + "loss": 0.097, + "step": 32610 + }, + { + "epoch": 2.1341184167484464, + "grad_norm": 0.8533260226249695, + "learning_rate": 9.536611419445276e-05, + "loss": 0.0957, + "step": 32620 + }, + { + "epoch": 2.134772652927707, + "grad_norm": 0.9501368403434753, + "learning_rate": 9.536225132926366e-05, + "loss": 0.0954, + "step": 32630 + }, + { + "epoch": 2.1354268891069674, + "grad_norm": 0.8990276455879211, + "learning_rate": 9.535838693297963e-05, + "loss": 0.0895, + "step": 32640 + }, + { + "epoch": 2.1360811252862284, + "grad_norm": 0.9564955830574036, + "learning_rate": 9.535452100573108e-05, + "loss": 0.0933, + "step": 32650 + }, + { + "epoch": 2.136735361465489, + "grad_norm": 0.9729709625244141, + "learning_rate": 9.53506535476485e-05, + "loss": 0.1018, + "step": 32660 + }, + { + "epoch": 2.13738959764475, + "grad_norm": 0.8804893493652344, + "learning_rate": 9.534678455886241e-05, + "loss": 0.0903, + "step": 32670 + }, + { + "epoch": 2.1380438338240104, + "grad_norm": 0.8837690353393555, + "learning_rate": 9.534291403950341e-05, + "loss": 0.0917, + "step": 32680 + }, + { + "epoch": 2.1386980700032714, + "grad_norm": 0.9111039042472839, + "learning_rate": 9.533904198970218e-05, + "loss": 0.0803, + "step": 32690 + }, + { + "epoch": 2.139352306182532, + "grad_norm": 0.7728859186172485, + "learning_rate": 9.533516840958934e-05, + "loss": 0.081, + "step": 32700 + }, + { + "epoch": 2.1400065423617924, + "grad_norm": 0.8555951118469238, + "learning_rate": 9.533129329929568e-05, + "loss": 0.0857, + "step": 32710 + }, + { + "epoch": 2.1406607785410534, + "grad_norm": 0.761581540107727, + "learning_rate": 9.5327416658952e-05, + "loss": 0.0819, + "step": 32720 + }, + { + "epoch": 2.141315014720314, + "grad_norm": 0.7355059385299683, + "learning_rate": 9.532353848868914e-05, + "loss": 0.0839, + "step": 32730 + }, + { + "epoch": 2.141969250899575, + "grad_norm": 0.8941879868507385, + "learning_rate": 9.531965878863797e-05, + "loss": 0.0905, + "step": 32740 + }, + { + "epoch": 2.1426234870788354, + "grad_norm": 0.7693347334861755, + "learning_rate": 9.531577755892947e-05, + "loss": 0.0845, + "step": 32750 + }, + { + "epoch": 2.1432777232580964, + "grad_norm": 0.8309009671211243, + "learning_rate": 9.531189479969462e-05, + "loss": 0.0811, + "step": 32760 + }, + { + "epoch": 2.143931959437357, + "grad_norm": 0.8143077492713928, + "learning_rate": 9.530801051106449e-05, + "loss": 0.0857, + "step": 32770 + }, + { + "epoch": 2.1445861956166175, + "grad_norm": 0.9192956686019897, + "learning_rate": 9.53041246931702e-05, + "loss": 0.0907, + "step": 32780 + }, + { + "epoch": 2.1452404317958784, + "grad_norm": 0.9745957255363464, + "learning_rate": 9.530023734614286e-05, + "loss": 0.0898, + "step": 32790 + }, + { + "epoch": 2.145894667975139, + "grad_norm": 1.038955569267273, + "learning_rate": 9.529634847011373e-05, + "loss": 0.0901, + "step": 32800 + }, + { + "epoch": 2.1465489041544, + "grad_norm": 0.9316650032997131, + "learning_rate": 9.529245806521402e-05, + "loss": 0.078, + "step": 32810 + }, + { + "epoch": 2.1472031403336604, + "grad_norm": 0.9526748061180115, + "learning_rate": 9.528856613157509e-05, + "loss": 0.0921, + "step": 32820 + }, + { + "epoch": 2.147857376512921, + "grad_norm": 0.9869240522384644, + "learning_rate": 9.528467266932826e-05, + "loss": 0.0872, + "step": 32830 + }, + { + "epoch": 2.148511612692182, + "grad_norm": 0.9078952670097351, + "learning_rate": 9.528077767860497e-05, + "loss": 0.0867, + "step": 32840 + }, + { + "epoch": 2.1491658488714425, + "grad_norm": 1.0087946653366089, + "learning_rate": 9.527688115953668e-05, + "loss": 0.0835, + "step": 32850 + }, + { + "epoch": 2.1498200850507034, + "grad_norm": 0.8324182629585266, + "learning_rate": 9.527298311225493e-05, + "loss": 0.084, + "step": 32860 + }, + { + "epoch": 2.150474321229964, + "grad_norm": 0.7914925217628479, + "learning_rate": 9.526908353689123e-05, + "loss": 0.09, + "step": 32870 + }, + { + "epoch": 2.151128557409225, + "grad_norm": 0.8204302787780762, + "learning_rate": 9.526518243357725e-05, + "loss": 0.0969, + "step": 32880 + }, + { + "epoch": 2.1517827935884855, + "grad_norm": 0.8147171139717102, + "learning_rate": 9.526127980244466e-05, + "loss": 0.0867, + "step": 32890 + }, + { + "epoch": 2.152437029767746, + "grad_norm": 0.8153404593467712, + "learning_rate": 9.525737564362517e-05, + "loss": 0.078, + "step": 32900 + }, + { + "epoch": 2.153091265947007, + "grad_norm": 0.9826894402503967, + "learning_rate": 9.525346995725057e-05, + "loss": 0.0961, + "step": 32910 + }, + { + "epoch": 2.1537455021262675, + "grad_norm": 0.911864697933197, + "learning_rate": 9.524956274345268e-05, + "loss": 0.0884, + "step": 32920 + }, + { + "epoch": 2.1543997383055284, + "grad_norm": 0.9221819639205933, + "learning_rate": 9.524565400236335e-05, + "loss": 0.0984, + "step": 32930 + }, + { + "epoch": 2.155053974484789, + "grad_norm": 0.8833640217781067, + "learning_rate": 9.524174373411456e-05, + "loss": 0.0844, + "step": 32940 + }, + { + "epoch": 2.1557082106640495, + "grad_norm": 0.9329414963722229, + "learning_rate": 9.523783193883825e-05, + "loss": 0.0762, + "step": 32950 + }, + { + "epoch": 2.1563624468433105, + "grad_norm": 0.8548219799995422, + "learning_rate": 9.523391861666649e-05, + "loss": 0.0892, + "step": 32960 + }, + { + "epoch": 2.157016683022571, + "grad_norm": 0.9100401401519775, + "learning_rate": 9.523000376773132e-05, + "loss": 0.0935, + "step": 32970 + }, + { + "epoch": 2.157670919201832, + "grad_norm": 0.8718044757843018, + "learning_rate": 9.522608739216493e-05, + "loss": 0.0816, + "step": 32980 + }, + { + "epoch": 2.1583251553810925, + "grad_norm": 0.8366650938987732, + "learning_rate": 9.522216949009946e-05, + "loss": 0.079, + "step": 32990 + }, + { + "epoch": 2.1589793915603535, + "grad_norm": 0.9176976084709167, + "learning_rate": 9.52182500616672e-05, + "loss": 0.089, + "step": 33000 + }, + { + "epoch": 2.159633627739614, + "grad_norm": 1.1271123886108398, + "learning_rate": 9.521432910700039e-05, + "loss": 0.0919, + "step": 33010 + }, + { + "epoch": 2.1602878639188745, + "grad_norm": 0.935555100440979, + "learning_rate": 9.521040662623139e-05, + "loss": 0.0968, + "step": 33020 + }, + { + "epoch": 2.1609421000981355, + "grad_norm": 0.7743455767631531, + "learning_rate": 9.52064826194926e-05, + "loss": 0.0987, + "step": 33030 + }, + { + "epoch": 2.161596336277396, + "grad_norm": 0.7663136720657349, + "learning_rate": 9.520255708691646e-05, + "loss": 0.0865, + "step": 33040 + }, + { + "epoch": 2.162250572456657, + "grad_norm": 0.851262092590332, + "learning_rate": 9.519863002863548e-05, + "loss": 0.0805, + "step": 33050 + }, + { + "epoch": 2.1629048086359175, + "grad_norm": 1.051383137702942, + "learning_rate": 9.519470144478219e-05, + "loss": 0.099, + "step": 33060 + }, + { + "epoch": 2.1635590448151785, + "grad_norm": 0.7119585871696472, + "learning_rate": 9.519077133548922e-05, + "loss": 0.0792, + "step": 33070 + }, + { + "epoch": 2.164213280994439, + "grad_norm": 0.7884335517883301, + "learning_rate": 9.518683970088918e-05, + "loss": 0.087, + "step": 33080 + }, + { + "epoch": 2.1648675171736995, + "grad_norm": 0.80777907371521, + "learning_rate": 9.51829065411148e-05, + "loss": 0.0824, + "step": 33090 + }, + { + "epoch": 2.1655217533529605, + "grad_norm": 1.0060641765594482, + "learning_rate": 9.517897185629882e-05, + "loss": 0.0832, + "step": 33100 + }, + { + "epoch": 2.166175989532221, + "grad_norm": 0.9100120067596436, + "learning_rate": 9.517503564657407e-05, + "loss": 0.0868, + "step": 33110 + }, + { + "epoch": 2.166830225711482, + "grad_norm": 0.8094409704208374, + "learning_rate": 9.517109791207337e-05, + "loss": 0.0823, + "step": 33120 + }, + { + "epoch": 2.1674844618907425, + "grad_norm": 1.0569559335708618, + "learning_rate": 9.516715865292967e-05, + "loss": 0.0803, + "step": 33130 + }, + { + "epoch": 2.1681386980700035, + "grad_norm": 0.7555559277534485, + "learning_rate": 9.51632178692759e-05, + "loss": 0.0821, + "step": 33140 + }, + { + "epoch": 2.168792934249264, + "grad_norm": 0.9259467720985413, + "learning_rate": 9.515927556124507e-05, + "loss": 0.0833, + "step": 33150 + }, + { + "epoch": 2.1694471704285245, + "grad_norm": 0.9541639685630798, + "learning_rate": 9.515533172897028e-05, + "loss": 0.0959, + "step": 33160 + }, + { + "epoch": 2.1701014066077855, + "grad_norm": 0.7850528955459595, + "learning_rate": 9.51513863725846e-05, + "loss": 0.0848, + "step": 33170 + }, + { + "epoch": 2.170755642787046, + "grad_norm": 0.921647310256958, + "learning_rate": 9.514743949222122e-05, + "loss": 0.0864, + "step": 33180 + }, + { + "epoch": 2.171409878966307, + "grad_norm": 0.8776901364326477, + "learning_rate": 9.514349108801337e-05, + "loss": 0.1069, + "step": 33190 + }, + { + "epoch": 2.1720641151455675, + "grad_norm": 0.8369219303131104, + "learning_rate": 9.513954116009429e-05, + "loss": 0.0923, + "step": 33200 + }, + { + "epoch": 2.1727183513248285, + "grad_norm": 0.8588852882385254, + "learning_rate": 9.51355897085973e-05, + "loss": 0.0969, + "step": 33210 + }, + { + "epoch": 2.173372587504089, + "grad_norm": 0.8265673518180847, + "learning_rate": 9.513163673365581e-05, + "loss": 0.0817, + "step": 33220 + }, + { + "epoch": 2.1740268236833495, + "grad_norm": 0.8380441069602966, + "learning_rate": 9.512768223540321e-05, + "loss": 0.0827, + "step": 33230 + }, + { + "epoch": 2.1746810598626105, + "grad_norm": 0.7195032835006714, + "learning_rate": 9.512372621397298e-05, + "loss": 0.0922, + "step": 33240 + }, + { + "epoch": 2.175335296041871, + "grad_norm": 0.8967887759208679, + "learning_rate": 9.511976866949864e-05, + "loss": 0.0861, + "step": 33250 + }, + { + "epoch": 2.175989532221132, + "grad_norm": 0.8154705762863159, + "learning_rate": 9.51158096021138e-05, + "loss": 0.0936, + "step": 33260 + }, + { + "epoch": 2.1766437684003925, + "grad_norm": 0.8090271353721619, + "learning_rate": 9.511184901195204e-05, + "loss": 0.084, + "step": 33270 + }, + { + "epoch": 2.177298004579653, + "grad_norm": 0.8615165948867798, + "learning_rate": 9.510788689914707e-05, + "loss": 0.0891, + "step": 33280 + }, + { + "epoch": 2.177952240758914, + "grad_norm": 0.8889568448066711, + "learning_rate": 9.510392326383262e-05, + "loss": 0.0916, + "step": 33290 + }, + { + "epoch": 2.1786064769381746, + "grad_norm": 0.9349720478057861, + "learning_rate": 9.509995810614247e-05, + "loss": 0.0903, + "step": 33300 + }, + { + "epoch": 2.1792607131174355, + "grad_norm": 0.8932443261146545, + "learning_rate": 9.509599142621047e-05, + "loss": 0.0844, + "step": 33310 + }, + { + "epoch": 2.179914949296696, + "grad_norm": 0.8143129944801331, + "learning_rate": 9.509202322417047e-05, + "loss": 0.087, + "step": 33320 + }, + { + "epoch": 2.180569185475957, + "grad_norm": 0.8007728457450867, + "learning_rate": 9.508805350015643e-05, + "loss": 0.0925, + "step": 33330 + }, + { + "epoch": 2.1812234216552175, + "grad_norm": 0.8921264410018921, + "learning_rate": 9.508408225430237e-05, + "loss": 0.0925, + "step": 33340 + }, + { + "epoch": 2.181877657834478, + "grad_norm": 0.9129869341850281, + "learning_rate": 9.508010948674227e-05, + "loss": 0.0911, + "step": 33350 + }, + { + "epoch": 2.182531894013739, + "grad_norm": 0.9659237265586853, + "learning_rate": 9.507613519761022e-05, + "loss": 0.0915, + "step": 33360 + }, + { + "epoch": 2.1831861301929996, + "grad_norm": 0.8064131736755371, + "learning_rate": 9.507215938704043e-05, + "loss": 0.0845, + "step": 33370 + }, + { + "epoch": 2.1838403663722605, + "grad_norm": 0.9400182366371155, + "learning_rate": 9.506818205516705e-05, + "loss": 0.0886, + "step": 33380 + }, + { + "epoch": 2.184494602551521, + "grad_norm": 0.8010856509208679, + "learning_rate": 9.506420320212433e-05, + "loss": 0.0912, + "step": 33390 + }, + { + "epoch": 2.1851488387307816, + "grad_norm": 0.7543482184410095, + "learning_rate": 9.506022282804656e-05, + "loss": 0.0765, + "step": 33400 + }, + { + "epoch": 2.1858030749100426, + "grad_norm": 0.8534349799156189, + "learning_rate": 9.505624093306809e-05, + "loss": 0.0913, + "step": 33410 + }, + { + "epoch": 2.186457311089303, + "grad_norm": 0.8279025554656982, + "learning_rate": 9.505225751732333e-05, + "loss": 0.0815, + "step": 33420 + }, + { + "epoch": 2.187111547268564, + "grad_norm": 0.8194913268089294, + "learning_rate": 9.504827258094673e-05, + "loss": 0.0924, + "step": 33430 + }, + { + "epoch": 2.1877657834478246, + "grad_norm": 0.9188143014907837, + "learning_rate": 9.504428612407277e-05, + "loss": 0.0892, + "step": 33440 + }, + { + "epoch": 2.1884200196270855, + "grad_norm": 0.6940245032310486, + "learning_rate": 9.504029814683603e-05, + "loss": 0.0797, + "step": 33450 + }, + { + "epoch": 2.189074255806346, + "grad_norm": 0.8881454467773438, + "learning_rate": 9.503630864937112e-05, + "loss": 0.0794, + "step": 33460 + }, + { + "epoch": 2.1897284919856066, + "grad_norm": 0.8755089044570923, + "learning_rate": 9.503231763181266e-05, + "loss": 0.0877, + "step": 33470 + }, + { + "epoch": 2.1903827281648676, + "grad_norm": 0.7432528734207153, + "learning_rate": 9.502832509429538e-05, + "loss": 0.0857, + "step": 33480 + }, + { + "epoch": 2.191036964344128, + "grad_norm": 0.7518520355224609, + "learning_rate": 9.502433103695405e-05, + "loss": 0.0862, + "step": 33490 + }, + { + "epoch": 2.191691200523389, + "grad_norm": 0.7373502254486084, + "learning_rate": 9.502033545992347e-05, + "loss": 0.0839, + "step": 33500 + }, + { + "epoch": 2.1923454367026496, + "grad_norm": 0.8541769981384277, + "learning_rate": 9.501633836333847e-05, + "loss": 0.0858, + "step": 33510 + }, + { + "epoch": 2.1929996728819106, + "grad_norm": 0.8180940747261047, + "learning_rate": 9.501233974733402e-05, + "loss": 0.0857, + "step": 33520 + }, + { + "epoch": 2.193653909061171, + "grad_norm": 0.6392052173614502, + "learning_rate": 9.500833961204504e-05, + "loss": 0.0873, + "step": 33530 + }, + { + "epoch": 2.1943081452404316, + "grad_norm": 0.8578611016273499, + "learning_rate": 9.500433795760656e-05, + "loss": 0.0951, + "step": 33540 + }, + { + "epoch": 2.1949623814196926, + "grad_norm": 0.7681283354759216, + "learning_rate": 9.500033478415364e-05, + "loss": 0.0896, + "step": 33550 + }, + { + "epoch": 2.195616617598953, + "grad_norm": 1.0089340209960938, + "learning_rate": 9.499633009182141e-05, + "loss": 0.0847, + "step": 33560 + }, + { + "epoch": 2.196270853778214, + "grad_norm": 0.9923962950706482, + "learning_rate": 9.499232388074503e-05, + "loss": 0.0849, + "step": 33570 + }, + { + "epoch": 2.1969250899574746, + "grad_norm": 0.9396527409553528, + "learning_rate": 9.498831615105974e-05, + "loss": 0.0817, + "step": 33580 + }, + { + "epoch": 2.1975793261367356, + "grad_norm": 0.7648763060569763, + "learning_rate": 9.498430690290078e-05, + "loss": 0.0966, + "step": 33590 + }, + { + "epoch": 2.198233562315996, + "grad_norm": 0.8955914974212646, + "learning_rate": 9.498029613640349e-05, + "loss": 0.1008, + "step": 33600 + }, + { + "epoch": 2.1988877984952566, + "grad_norm": 1.131964921951294, + "learning_rate": 9.497628385170323e-05, + "loss": 0.0985, + "step": 33610 + }, + { + "epoch": 2.1995420346745176, + "grad_norm": 0.738293468952179, + "learning_rate": 9.497227004893544e-05, + "loss": 0.0794, + "step": 33620 + }, + { + "epoch": 2.200196270853778, + "grad_norm": 0.7371350526809692, + "learning_rate": 9.496825472823559e-05, + "loss": 0.0758, + "step": 33630 + }, + { + "epoch": 2.200850507033039, + "grad_norm": 0.8156116604804993, + "learning_rate": 9.496423788973922e-05, + "loss": 0.0796, + "step": 33640 + }, + { + "epoch": 2.2015047432122996, + "grad_norm": 0.832074761390686, + "learning_rate": 9.496021953358189e-05, + "loss": 0.0778, + "step": 33650 + }, + { + "epoch": 2.2021589793915606, + "grad_norm": 0.9009979963302612, + "learning_rate": 9.495619965989924e-05, + "loss": 0.0953, + "step": 33660 + }, + { + "epoch": 2.202813215570821, + "grad_norm": 0.9261824488639832, + "learning_rate": 9.495217826882694e-05, + "loss": 0.0941, + "step": 33670 + }, + { + "epoch": 2.2034674517500816, + "grad_norm": 0.815540611743927, + "learning_rate": 9.494815536050075e-05, + "loss": 0.0858, + "step": 33680 + }, + { + "epoch": 2.2041216879293426, + "grad_norm": 0.8841716647148132, + "learning_rate": 9.494413093505643e-05, + "loss": 0.0789, + "step": 33690 + }, + { + "epoch": 2.204775924108603, + "grad_norm": 0.9141961932182312, + "learning_rate": 9.494010499262982e-05, + "loss": 0.0936, + "step": 33700 + }, + { + "epoch": 2.205430160287864, + "grad_norm": 0.7224341630935669, + "learning_rate": 9.49360775333568e-05, + "loss": 0.0938, + "step": 33710 + }, + { + "epoch": 2.2060843964671246, + "grad_norm": 0.9036959409713745, + "learning_rate": 9.493204855737332e-05, + "loss": 0.0769, + "step": 33720 + }, + { + "epoch": 2.206738632646385, + "grad_norm": 0.9776110649108887, + "learning_rate": 9.492801806481535e-05, + "loss": 0.0925, + "step": 33730 + }, + { + "epoch": 2.207392868825646, + "grad_norm": 0.6493380665779114, + "learning_rate": 9.492398605581896e-05, + "loss": 0.0948, + "step": 33740 + }, + { + "epoch": 2.2080471050049066, + "grad_norm": 0.8940728902816772, + "learning_rate": 9.491995253052022e-05, + "loss": 0.0949, + "step": 33750 + }, + { + "epoch": 2.2087013411841676, + "grad_norm": 0.9045453071594238, + "learning_rate": 9.491591748905527e-05, + "loss": 0.0905, + "step": 33760 + }, + { + "epoch": 2.209355577363428, + "grad_norm": 0.7662135362625122, + "learning_rate": 9.49118809315603e-05, + "loss": 0.0852, + "step": 33770 + }, + { + "epoch": 2.210009813542689, + "grad_norm": 0.7903282642364502, + "learning_rate": 9.490784285817158e-05, + "loss": 0.0792, + "step": 33780 + }, + { + "epoch": 2.2106640497219496, + "grad_norm": 0.9852091073989868, + "learning_rate": 9.490380326902537e-05, + "loss": 0.0826, + "step": 33790 + }, + { + "epoch": 2.21131828590121, + "grad_norm": 0.9630650281906128, + "learning_rate": 9.489976216425804e-05, + "loss": 0.0994, + "step": 33800 + }, + { + "epoch": 2.211972522080471, + "grad_norm": 0.7870905995368958, + "learning_rate": 9.4895719544006e-05, + "loss": 0.0926, + "step": 33810 + }, + { + "epoch": 2.2126267582597317, + "grad_norm": 0.8298508524894714, + "learning_rate": 9.489167540840567e-05, + "loss": 0.0815, + "step": 33820 + }, + { + "epoch": 2.2132809944389926, + "grad_norm": 0.8366494178771973, + "learning_rate": 9.488762975759357e-05, + "loss": 0.1, + "step": 33830 + }, + { + "epoch": 2.213935230618253, + "grad_norm": 0.9144079685211182, + "learning_rate": 9.488358259170622e-05, + "loss": 0.0881, + "step": 33840 + }, + { + "epoch": 2.2145894667975137, + "grad_norm": 0.8029112815856934, + "learning_rate": 9.487953391088027e-05, + "loss": 0.0793, + "step": 33850 + }, + { + "epoch": 2.2152437029767746, + "grad_norm": 0.9416177272796631, + "learning_rate": 9.487548371525234e-05, + "loss": 0.0827, + "step": 33860 + }, + { + "epoch": 2.215897939156035, + "grad_norm": 0.7527725696563721, + "learning_rate": 9.487143200495914e-05, + "loss": 0.0921, + "step": 33870 + }, + { + "epoch": 2.216552175335296, + "grad_norm": 1.0076922178268433, + "learning_rate": 9.486737878013745e-05, + "loss": 0.0852, + "step": 33880 + }, + { + "epoch": 2.2172064115145567, + "grad_norm": 1.0222320556640625, + "learning_rate": 9.486332404092403e-05, + "loss": 0.09, + "step": 33890 + }, + { + "epoch": 2.2178606476938176, + "grad_norm": 0.9221048951148987, + "learning_rate": 9.485926778745579e-05, + "loss": 0.091, + "step": 33900 + }, + { + "epoch": 2.218514883873078, + "grad_norm": 0.8487015962600708, + "learning_rate": 9.485521001986962e-05, + "loss": 0.0853, + "step": 33910 + }, + { + "epoch": 2.2191691200523387, + "grad_norm": 0.8243840336799622, + "learning_rate": 9.485115073830245e-05, + "loss": 0.0884, + "step": 33920 + }, + { + "epoch": 2.2198233562315997, + "grad_norm": 0.8864408731460571, + "learning_rate": 9.48470899428913e-05, + "loss": 0.1056, + "step": 33930 + }, + { + "epoch": 2.22047759241086, + "grad_norm": 1.3254485130310059, + "learning_rate": 9.484302763377328e-05, + "loss": 0.0855, + "step": 33940 + }, + { + "epoch": 2.221131828590121, + "grad_norm": 1.0001471042633057, + "learning_rate": 9.483896381108548e-05, + "loss": 0.0901, + "step": 33950 + }, + { + "epoch": 2.2217860647693817, + "grad_norm": 0.8576057553291321, + "learning_rate": 9.483489847496503e-05, + "loss": 0.0802, + "step": 33960 + }, + { + "epoch": 2.2224403009486426, + "grad_norm": 0.8410997986793518, + "learning_rate": 9.48308316255492e-05, + "loss": 0.0777, + "step": 33970 + }, + { + "epoch": 2.223094537127903, + "grad_norm": 1.0738435983657837, + "learning_rate": 9.482676326297522e-05, + "loss": 0.093, + "step": 33980 + }, + { + "epoch": 2.2237487733071637, + "grad_norm": 0.7275444865226746, + "learning_rate": 9.482269338738038e-05, + "loss": 0.0788, + "step": 33990 + }, + { + "epoch": 2.2244030094864247, + "grad_norm": 0.7812187075614929, + "learning_rate": 9.481862199890213e-05, + "loss": 0.0866, + "step": 34000 + }, + { + "epoch": 2.225057245665685, + "grad_norm": 0.7161325216293335, + "learning_rate": 9.481454909767784e-05, + "loss": 0.0865, + "step": 34010 + }, + { + "epoch": 2.225711481844946, + "grad_norm": 0.9743297696113586, + "learning_rate": 9.481047468384499e-05, + "loss": 0.0829, + "step": 34020 + }, + { + "epoch": 2.2263657180242067, + "grad_norm": 0.9673652648925781, + "learning_rate": 9.480639875754108e-05, + "loss": 0.0828, + "step": 34030 + }, + { + "epoch": 2.2270199542034677, + "grad_norm": 0.8574245572090149, + "learning_rate": 9.480232131890371e-05, + "loss": 0.0832, + "step": 34040 + }, + { + "epoch": 2.227674190382728, + "grad_norm": 1.0479404926300049, + "learning_rate": 9.479824236807051e-05, + "loss": 0.0825, + "step": 34050 + }, + { + "epoch": 2.2283284265619887, + "grad_norm": 0.9028341174125671, + "learning_rate": 9.479416190517914e-05, + "loss": 0.0804, + "step": 34060 + }, + { + "epoch": 2.2289826627412497, + "grad_norm": 0.7553834915161133, + "learning_rate": 9.479007993036733e-05, + "loss": 0.0798, + "step": 34070 + }, + { + "epoch": 2.22963689892051, + "grad_norm": 0.9540044069290161, + "learning_rate": 9.478599644377284e-05, + "loss": 0.092, + "step": 34080 + }, + { + "epoch": 2.230291135099771, + "grad_norm": 1.0096077919006348, + "learning_rate": 9.478191144553352e-05, + "loss": 0.0833, + "step": 34090 + }, + { + "epoch": 2.2309453712790317, + "grad_norm": 0.8619536757469177, + "learning_rate": 9.477782493578725e-05, + "loss": 0.0983, + "step": 34100 + }, + { + "epoch": 2.2315996074582927, + "grad_norm": 0.8202316761016846, + "learning_rate": 9.477373691467195e-05, + "loss": 0.0795, + "step": 34110 + }, + { + "epoch": 2.232253843637553, + "grad_norm": 0.8354489207267761, + "learning_rate": 9.47696473823256e-05, + "loss": 0.0805, + "step": 34120 + }, + { + "epoch": 2.2329080798168137, + "grad_norm": 1.0788700580596924, + "learning_rate": 9.476555633888625e-05, + "loss": 0.0929, + "step": 34130 + }, + { + "epoch": 2.2335623159960747, + "grad_norm": 0.8058347702026367, + "learning_rate": 9.476146378449197e-05, + "loss": 0.0913, + "step": 34140 + }, + { + "epoch": 2.234216552175335, + "grad_norm": 0.8646268844604492, + "learning_rate": 9.475736971928088e-05, + "loss": 0.0819, + "step": 34150 + }, + { + "epoch": 2.234870788354596, + "grad_norm": 0.8829324841499329, + "learning_rate": 9.475327414339121e-05, + "loss": 0.0907, + "step": 34160 + }, + { + "epoch": 2.2355250245338567, + "grad_norm": 0.8465047478675842, + "learning_rate": 9.474917705696114e-05, + "loss": 0.0884, + "step": 34170 + }, + { + "epoch": 2.2361792607131172, + "grad_norm": 0.8332507014274597, + "learning_rate": 9.474507846012901e-05, + "loss": 0.0899, + "step": 34180 + }, + { + "epoch": 2.236833496892378, + "grad_norm": 0.890265941619873, + "learning_rate": 9.474097835303311e-05, + "loss": 0.0799, + "step": 34190 + }, + { + "epoch": 2.2374877330716387, + "grad_norm": 0.9846871495246887, + "learning_rate": 9.473687673581186e-05, + "loss": 0.0828, + "step": 34200 + }, + { + "epoch": 2.2381419692508997, + "grad_norm": 0.7665266394615173, + "learning_rate": 9.47327736086037e-05, + "loss": 0.0792, + "step": 34210 + }, + { + "epoch": 2.2387962054301602, + "grad_norm": 0.82786625623703, + "learning_rate": 9.472866897154712e-05, + "loss": 0.0888, + "step": 34220 + }, + { + "epoch": 2.239450441609421, + "grad_norm": 0.8131442666053772, + "learning_rate": 9.472456282478065e-05, + "loss": 0.0819, + "step": 34230 + }, + { + "epoch": 2.2401046777886817, + "grad_norm": 0.8473891615867615, + "learning_rate": 9.47204551684429e-05, + "loss": 0.0905, + "step": 34240 + }, + { + "epoch": 2.2407589139679422, + "grad_norm": 0.8712317943572998, + "learning_rate": 9.471634600267247e-05, + "loss": 0.0955, + "step": 34250 + }, + { + "epoch": 2.241413150147203, + "grad_norm": 0.7699024081230164, + "learning_rate": 9.471223532760812e-05, + "loss": 0.0799, + "step": 34260 + }, + { + "epoch": 2.2420673863264637, + "grad_norm": 0.7678673267364502, + "learning_rate": 9.470812314338855e-05, + "loss": 0.0846, + "step": 34270 + }, + { + "epoch": 2.2427216225057247, + "grad_norm": 0.8645968437194824, + "learning_rate": 9.470400945015258e-05, + "loss": 0.0767, + "step": 34280 + }, + { + "epoch": 2.2433758586849852, + "grad_norm": 0.9479940533638, + "learning_rate": 9.469989424803907e-05, + "loss": 0.0875, + "step": 34290 + }, + { + "epoch": 2.2440300948642458, + "grad_norm": 0.7931011319160461, + "learning_rate": 9.469577753718689e-05, + "loss": 0.0817, + "step": 34300 + }, + { + "epoch": 2.2446843310435067, + "grad_norm": 0.9428794980049133, + "learning_rate": 9.469165931773498e-05, + "loss": 0.0824, + "step": 34310 + }, + { + "epoch": 2.2453385672227673, + "grad_norm": 0.9132287502288818, + "learning_rate": 9.468753958982238e-05, + "loss": 0.0886, + "step": 34320 + }, + { + "epoch": 2.2459928034020282, + "grad_norm": 0.9737602472305298, + "learning_rate": 9.468341835358809e-05, + "loss": 0.0937, + "step": 34330 + }, + { + "epoch": 2.2466470395812888, + "grad_norm": 0.918177604675293, + "learning_rate": 9.467929560917128e-05, + "loss": 0.0813, + "step": 34340 + }, + { + "epoch": 2.2473012757605497, + "grad_norm": 0.7503966093063354, + "learning_rate": 9.467517135671104e-05, + "loss": 0.0886, + "step": 34350 + }, + { + "epoch": 2.2479555119398102, + "grad_norm": 0.83729487657547, + "learning_rate": 9.467104559634663e-05, + "loss": 0.0983, + "step": 34360 + }, + { + "epoch": 2.2486097481190708, + "grad_norm": 1.0300636291503906, + "learning_rate": 9.466691832821725e-05, + "loss": 0.0959, + "step": 34370 + }, + { + "epoch": 2.2492639842983317, + "grad_norm": 0.8676597476005554, + "learning_rate": 9.466278955246225e-05, + "loss": 0.0833, + "step": 34380 + }, + { + "epoch": 2.2499182204775923, + "grad_norm": 0.9559263586997986, + "learning_rate": 9.465865926922098e-05, + "loss": 0.0978, + "step": 34390 + }, + { + "epoch": 2.2505724566568532, + "grad_norm": 0.7178763151168823, + "learning_rate": 9.465452747863281e-05, + "loss": 0.0909, + "step": 34400 + }, + { + "epoch": 2.2512266928361138, + "grad_norm": 0.7472977638244629, + "learning_rate": 9.465039418083723e-05, + "loss": 0.0954, + "step": 34410 + }, + { + "epoch": 2.2518809290153747, + "grad_norm": 0.766511082649231, + "learning_rate": 9.464625937597377e-05, + "loss": 0.0853, + "step": 34420 + }, + { + "epoch": 2.2525351651946353, + "grad_norm": 0.9715924263000488, + "learning_rate": 9.464212306418194e-05, + "loss": 0.0931, + "step": 34430 + }, + { + "epoch": 2.253189401373896, + "grad_norm": 0.8225154280662537, + "learning_rate": 9.463798524560141e-05, + "loss": 0.0927, + "step": 34440 + }, + { + "epoch": 2.2538436375531568, + "grad_norm": 0.7842962145805359, + "learning_rate": 9.463384592037178e-05, + "loss": 0.0843, + "step": 34450 + }, + { + "epoch": 2.2544978737324173, + "grad_norm": 0.817205011844635, + "learning_rate": 9.46297050886328e-05, + "loss": 0.0868, + "step": 34460 + }, + { + "epoch": 2.2551521099116782, + "grad_norm": 0.8969591856002808, + "learning_rate": 9.462556275052425e-05, + "loss": 0.0888, + "step": 34470 + }, + { + "epoch": 2.2558063460909388, + "grad_norm": 0.924565851688385, + "learning_rate": 9.46214189061859e-05, + "loss": 0.0763, + "step": 34480 + }, + { + "epoch": 2.2564605822701997, + "grad_norm": 0.7565225958824158, + "learning_rate": 9.461727355575764e-05, + "loss": 0.0943, + "step": 34490 + }, + { + "epoch": 2.2571148184494603, + "grad_norm": 0.9678487181663513, + "learning_rate": 9.461312669937938e-05, + "loss": 0.1002, + "step": 34500 + }, + { + "epoch": 2.257769054628721, + "grad_norm": 1.0543937683105469, + "learning_rate": 9.460897833719111e-05, + "loss": 0.0908, + "step": 34510 + }, + { + "epoch": 2.2584232908079818, + "grad_norm": 0.8663365840911865, + "learning_rate": 9.460482846933283e-05, + "loss": 0.0793, + "step": 34520 + }, + { + "epoch": 2.2590775269872423, + "grad_norm": 0.9217035174369812, + "learning_rate": 9.460067709594459e-05, + "loss": 0.0862, + "step": 34530 + }, + { + "epoch": 2.2597317631665033, + "grad_norm": 0.8142322301864624, + "learning_rate": 9.459652421716654e-05, + "loss": 0.0895, + "step": 34540 + }, + { + "epoch": 2.260385999345764, + "grad_norm": 0.7949792146682739, + "learning_rate": 9.459236983313884e-05, + "loss": 0.0831, + "step": 34550 + }, + { + "epoch": 2.2610402355250248, + "grad_norm": 1.0977342128753662, + "learning_rate": 9.45882139440017e-05, + "loss": 0.079, + "step": 34560 + }, + { + "epoch": 2.2616944717042853, + "grad_norm": 0.8566722869873047, + "learning_rate": 9.458405654989542e-05, + "loss": 0.0799, + "step": 34570 + }, + { + "epoch": 2.262348707883546, + "grad_norm": 0.9690262079238892, + "learning_rate": 9.457989765096028e-05, + "loss": 0.0845, + "step": 34580 + }, + { + "epoch": 2.2630029440628068, + "grad_norm": 0.684556245803833, + "learning_rate": 9.45757372473367e-05, + "loss": 0.0934, + "step": 34590 + }, + { + "epoch": 2.2636571802420673, + "grad_norm": 0.804113507270813, + "learning_rate": 9.457157533916508e-05, + "loss": 0.0828, + "step": 34600 + }, + { + "epoch": 2.2643114164213283, + "grad_norm": 0.8007469177246094, + "learning_rate": 9.456741192658589e-05, + "loss": 0.0922, + "step": 34610 + }, + { + "epoch": 2.264965652600589, + "grad_norm": 1.1300758123397827, + "learning_rate": 9.456324700973966e-05, + "loss": 0.0841, + "step": 34620 + }, + { + "epoch": 2.2656198887798498, + "grad_norm": 0.7691265344619751, + "learning_rate": 9.4559080588767e-05, + "loss": 0.088, + "step": 34630 + }, + { + "epoch": 2.2662741249591103, + "grad_norm": 0.6707723140716553, + "learning_rate": 9.455491266380849e-05, + "loss": 0.0842, + "step": 34640 + }, + { + "epoch": 2.266928361138371, + "grad_norm": 0.848480224609375, + "learning_rate": 9.455074323500484e-05, + "loss": 0.0938, + "step": 34650 + }, + { + "epoch": 2.267582597317632, + "grad_norm": 0.9524247050285339, + "learning_rate": 9.454657230249675e-05, + "loss": 0.086, + "step": 34660 + }, + { + "epoch": 2.2682368334968923, + "grad_norm": 0.8271149396896362, + "learning_rate": 9.454239986642502e-05, + "loss": 0.086, + "step": 34670 + }, + { + "epoch": 2.2688910696761533, + "grad_norm": 0.9054349064826965, + "learning_rate": 9.453822592693049e-05, + "loss": 0.0883, + "step": 34680 + }, + { + "epoch": 2.269545305855414, + "grad_norm": 0.9008976221084595, + "learning_rate": 9.453405048415402e-05, + "loss": 0.084, + "step": 34690 + }, + { + "epoch": 2.2701995420346743, + "grad_norm": 0.7716291546821594, + "learning_rate": 9.452987353823654e-05, + "loss": 0.0858, + "step": 34700 + }, + { + "epoch": 2.2708537782139353, + "grad_norm": 0.7925845384597778, + "learning_rate": 9.452569508931908e-05, + "loss": 0.0862, + "step": 34710 + }, + { + "epoch": 2.271508014393196, + "grad_norm": 0.7443545460700989, + "learning_rate": 9.452151513754262e-05, + "loss": 0.0817, + "step": 34720 + }, + { + "epoch": 2.272162250572457, + "grad_norm": 0.7486448884010315, + "learning_rate": 9.451733368304825e-05, + "loss": 0.0779, + "step": 34730 + }, + { + "epoch": 2.2728164867517173, + "grad_norm": 0.8984254002571106, + "learning_rate": 9.451315072597713e-05, + "loss": 0.0854, + "step": 34740 + }, + { + "epoch": 2.273470722930978, + "grad_norm": 0.880130410194397, + "learning_rate": 9.450896626647041e-05, + "loss": 0.09, + "step": 34750 + }, + { + "epoch": 2.274124959110239, + "grad_norm": 0.8903000354766846, + "learning_rate": 9.450478030466938e-05, + "loss": 0.0876, + "step": 34760 + }, + { + "epoch": 2.2747791952894993, + "grad_norm": 0.8735268115997314, + "learning_rate": 9.450059284071529e-05, + "loss": 0.1018, + "step": 34770 + }, + { + "epoch": 2.2754334314687603, + "grad_norm": 0.7384045720100403, + "learning_rate": 9.449640387474948e-05, + "loss": 0.0849, + "step": 34780 + }, + { + "epoch": 2.276087667648021, + "grad_norm": 0.8934576511383057, + "learning_rate": 9.449221340691333e-05, + "loss": 0.0834, + "step": 34790 + }, + { + "epoch": 2.276741903827282, + "grad_norm": 1.060774564743042, + "learning_rate": 9.448802143734831e-05, + "loss": 0.0932, + "step": 34800 + }, + { + "epoch": 2.2773961400065423, + "grad_norm": 0.825049638748169, + "learning_rate": 9.448382796619589e-05, + "loss": 0.0854, + "step": 34810 + }, + { + "epoch": 2.278050376185803, + "grad_norm": 0.8421124815940857, + "learning_rate": 9.44796329935976e-05, + "loss": 0.0836, + "step": 34820 + }, + { + "epoch": 2.278704612365064, + "grad_norm": 0.7823123335838318, + "learning_rate": 9.447543651969506e-05, + "loss": 0.0852, + "step": 34830 + }, + { + "epoch": 2.2793588485443244, + "grad_norm": 1.0496047735214233, + "learning_rate": 9.447123854462989e-05, + "loss": 0.0891, + "step": 34840 + }, + { + "epoch": 2.2800130847235853, + "grad_norm": 0.8308643698692322, + "learning_rate": 9.44670390685438e-05, + "loss": 0.0831, + "step": 34850 + }, + { + "epoch": 2.280667320902846, + "grad_norm": 0.8488014936447144, + "learning_rate": 9.44628380915785e-05, + "loss": 0.0911, + "step": 34860 + }, + { + "epoch": 2.281321557082107, + "grad_norm": 0.9563519358634949, + "learning_rate": 9.445863561387582e-05, + "loss": 0.0957, + "step": 34870 + }, + { + "epoch": 2.2819757932613673, + "grad_norm": 0.8364535570144653, + "learning_rate": 9.44544316355776e-05, + "loss": 0.0821, + "step": 34880 + }, + { + "epoch": 2.282630029440628, + "grad_norm": 0.918013870716095, + "learning_rate": 9.445022615682571e-05, + "loss": 0.0789, + "step": 34890 + }, + { + "epoch": 2.283284265619889, + "grad_norm": 0.7931444048881531, + "learning_rate": 9.44460191777621e-05, + "loss": 0.0908, + "step": 34900 + }, + { + "epoch": 2.2839385017991494, + "grad_norm": 0.9308255314826965, + "learning_rate": 9.44418106985288e-05, + "loss": 0.0819, + "step": 34910 + }, + { + "epoch": 2.2845927379784103, + "grad_norm": 1.0205659866333008, + "learning_rate": 9.443760071926784e-05, + "loss": 0.0908, + "step": 34920 + }, + { + "epoch": 2.285246974157671, + "grad_norm": 0.8457900881767273, + "learning_rate": 9.44333892401213e-05, + "loss": 0.0996, + "step": 34930 + }, + { + "epoch": 2.285901210336932, + "grad_norm": 0.8027629852294922, + "learning_rate": 9.442917626123136e-05, + "loss": 0.0913, + "step": 34940 + }, + { + "epoch": 2.2865554465161924, + "grad_norm": 0.8995937705039978, + "learning_rate": 9.442496178274019e-05, + "loss": 0.0892, + "step": 34950 + }, + { + "epoch": 2.287209682695453, + "grad_norm": 0.9904996752738953, + "learning_rate": 9.442074580479004e-05, + "loss": 0.0921, + "step": 34960 + }, + { + "epoch": 2.287863918874714, + "grad_norm": 0.9785271286964417, + "learning_rate": 9.441652832752324e-05, + "loss": 0.0801, + "step": 34970 + }, + { + "epoch": 2.2885181550539744, + "grad_norm": 0.8374738097190857, + "learning_rate": 9.441230935108212e-05, + "loss": 0.0855, + "step": 34980 + }, + { + "epoch": 2.2891723912332353, + "grad_norm": 0.8071938157081604, + "learning_rate": 9.440808887560907e-05, + "loss": 0.0967, + "step": 34990 + }, + { + "epoch": 2.289826627412496, + "grad_norm": 1.0699573755264282, + "learning_rate": 9.440386690124656e-05, + "loss": 0.0865, + "step": 35000 + }, + { + "epoch": 2.290480863591757, + "grad_norm": 1.1547331809997559, + "learning_rate": 9.43996434281371e-05, + "loss": 0.085, + "step": 35010 + }, + { + "epoch": 2.2911350997710174, + "grad_norm": 0.8544806838035583, + "learning_rate": 9.439541845642322e-05, + "loss": 0.0808, + "step": 35020 + }, + { + "epoch": 2.291789335950278, + "grad_norm": 0.8750424385070801, + "learning_rate": 9.439119198624755e-05, + "loss": 0.0826, + "step": 35030 + }, + { + "epoch": 2.292443572129539, + "grad_norm": 1.0498063564300537, + "learning_rate": 9.438696401775271e-05, + "loss": 0.0912, + "step": 35040 + }, + { + "epoch": 2.2930978083087994, + "grad_norm": 0.7838863730430603, + "learning_rate": 9.438273455108144e-05, + "loss": 0.0985, + "step": 35050 + }, + { + "epoch": 2.2937520444880604, + "grad_norm": 0.7744349241256714, + "learning_rate": 9.437850358637648e-05, + "loss": 0.0799, + "step": 35060 + }, + { + "epoch": 2.294406280667321, + "grad_norm": 0.7421576380729675, + "learning_rate": 9.437427112378063e-05, + "loss": 0.0907, + "step": 35070 + }, + { + "epoch": 2.295060516846582, + "grad_norm": 0.7675076723098755, + "learning_rate": 9.437003716343676e-05, + "loss": 0.0768, + "step": 35080 + }, + { + "epoch": 2.2957147530258424, + "grad_norm": 0.6893305778503418, + "learning_rate": 9.436580170548777e-05, + "loss": 0.076, + "step": 35090 + }, + { + "epoch": 2.296368989205103, + "grad_norm": 0.7479074001312256, + "learning_rate": 9.436156475007662e-05, + "loss": 0.0824, + "step": 35100 + }, + { + "epoch": 2.297023225384364, + "grad_norm": 0.7704381346702576, + "learning_rate": 9.435732629734633e-05, + "loss": 0.0882, + "step": 35110 + }, + { + "epoch": 2.2976774615636244, + "grad_norm": 0.9246371984481812, + "learning_rate": 9.435308634743992e-05, + "loss": 0.0812, + "step": 35120 + }, + { + "epoch": 2.2983316977428854, + "grad_norm": 0.8029863834381104, + "learning_rate": 9.434884490050053e-05, + "loss": 0.0976, + "step": 35130 + }, + { + "epoch": 2.298985933922146, + "grad_norm": 0.8177743554115295, + "learning_rate": 9.434460195667133e-05, + "loss": 0.0784, + "step": 35140 + }, + { + "epoch": 2.2996401701014064, + "grad_norm": 0.8609619140625, + "learning_rate": 9.434035751609551e-05, + "loss": 0.1004, + "step": 35150 + }, + { + "epoch": 2.3002944062806674, + "grad_norm": 0.7648687958717346, + "learning_rate": 9.433611157891633e-05, + "loss": 0.085, + "step": 35160 + }, + { + "epoch": 2.300948642459928, + "grad_norm": 0.9224421977996826, + "learning_rate": 9.433186414527713e-05, + "loss": 0.0882, + "step": 35170 + }, + { + "epoch": 2.301602878639189, + "grad_norm": 0.793401837348938, + "learning_rate": 9.432761521532123e-05, + "loss": 0.0769, + "step": 35180 + }, + { + "epoch": 2.3022571148184494, + "grad_norm": 0.86360102891922, + "learning_rate": 9.432336478919206e-05, + "loss": 0.0851, + "step": 35190 + }, + { + "epoch": 2.30291135099771, + "grad_norm": 0.860308825969696, + "learning_rate": 9.43191128670331e-05, + "loss": 0.0844, + "step": 35200 + }, + { + "epoch": 2.303565587176971, + "grad_norm": 0.8408114314079285, + "learning_rate": 9.431485944898784e-05, + "loss": 0.0892, + "step": 35210 + }, + { + "epoch": 2.3042198233562314, + "grad_norm": 0.8600111603736877, + "learning_rate": 9.431060453519986e-05, + "loss": 0.0838, + "step": 35220 + }, + { + "epoch": 2.3048740595354924, + "grad_norm": 0.9598779082298279, + "learning_rate": 9.430634812581276e-05, + "loss": 0.0891, + "step": 35230 + }, + { + "epoch": 2.305528295714753, + "grad_norm": 0.9931259155273438, + "learning_rate": 9.430209022097023e-05, + "loss": 0.08, + "step": 35240 + }, + { + "epoch": 2.306182531894014, + "grad_norm": 0.687402069568634, + "learning_rate": 9.429783082081596e-05, + "loss": 0.0882, + "step": 35250 + }, + { + "epoch": 2.3068367680732744, + "grad_norm": 0.8022744059562683, + "learning_rate": 9.429356992549372e-05, + "loss": 0.0828, + "step": 35260 + }, + { + "epoch": 2.307491004252535, + "grad_norm": 0.9008263945579529, + "learning_rate": 9.428930753514734e-05, + "loss": 0.0775, + "step": 35270 + }, + { + "epoch": 2.308145240431796, + "grad_norm": 0.8905544877052307, + "learning_rate": 9.428504364992066e-05, + "loss": 0.0863, + "step": 35280 + }, + { + "epoch": 2.3087994766110564, + "grad_norm": 0.9496570825576782, + "learning_rate": 9.428077826995762e-05, + "loss": 0.0863, + "step": 35290 + }, + { + "epoch": 2.3094537127903174, + "grad_norm": 0.8719865679740906, + "learning_rate": 9.427651139540218e-05, + "loss": 0.0777, + "step": 35300 + }, + { + "epoch": 2.310107948969578, + "grad_norm": 1.0145576000213623, + "learning_rate": 9.427224302639837e-05, + "loss": 0.089, + "step": 35310 + }, + { + "epoch": 2.310762185148839, + "grad_norm": 0.8923918604850769, + "learning_rate": 9.426797316309026e-05, + "loss": 0.0766, + "step": 35320 + }, + { + "epoch": 2.3114164213280994, + "grad_norm": 0.7120718955993652, + "learning_rate": 9.426370180562195e-05, + "loss": 0.0789, + "step": 35330 + }, + { + "epoch": 2.31207065750736, + "grad_norm": 0.9214240312576294, + "learning_rate": 9.425942895413761e-05, + "loss": 0.082, + "step": 35340 + }, + { + "epoch": 2.312724893686621, + "grad_norm": 0.7993736863136292, + "learning_rate": 9.425515460878148e-05, + "loss": 0.078, + "step": 35350 + }, + { + "epoch": 2.3133791298658815, + "grad_norm": 0.7409219741821289, + "learning_rate": 9.42508787696978e-05, + "loss": 0.0859, + "step": 35360 + }, + { + "epoch": 2.3140333660451424, + "grad_norm": 0.8568753600120544, + "learning_rate": 9.424660143703092e-05, + "loss": 0.0974, + "step": 35370 + }, + { + "epoch": 2.314687602224403, + "grad_norm": 0.8471707701683044, + "learning_rate": 9.424232261092521e-05, + "loss": 0.0926, + "step": 35380 + }, + { + "epoch": 2.315341838403664, + "grad_norm": 0.9042620658874512, + "learning_rate": 9.423804229152507e-05, + "loss": 0.084, + "step": 35390 + }, + { + "epoch": 2.3159960745829244, + "grad_norm": 0.9078444838523865, + "learning_rate": 9.423376047897499e-05, + "loss": 0.0903, + "step": 35400 + }, + { + "epoch": 2.316650310762185, + "grad_norm": 0.8419270515441895, + "learning_rate": 9.422947717341948e-05, + "loss": 0.0791, + "step": 35410 + }, + { + "epoch": 2.317304546941446, + "grad_norm": 0.8305777907371521, + "learning_rate": 9.422519237500313e-05, + "loss": 0.092, + "step": 35420 + }, + { + "epoch": 2.3179587831207065, + "grad_norm": 0.7356106638908386, + "learning_rate": 9.422090608387055e-05, + "loss": 0.0809, + "step": 35430 + }, + { + "epoch": 2.3186130192999674, + "grad_norm": 0.9605696201324463, + "learning_rate": 9.421661830016642e-05, + "loss": 0.0746, + "step": 35440 + }, + { + "epoch": 2.319267255479228, + "grad_norm": 1.005142092704773, + "learning_rate": 9.421232902403545e-05, + "loss": 0.0917, + "step": 35450 + }, + { + "epoch": 2.319921491658489, + "grad_norm": 0.965522289276123, + "learning_rate": 9.420803825562243e-05, + "loss": 0.0977, + "step": 35460 + }, + { + "epoch": 2.3205757278377495, + "grad_norm": 0.860283613204956, + "learning_rate": 9.420374599507217e-05, + "loss": 0.0872, + "step": 35470 + }, + { + "epoch": 2.32122996401701, + "grad_norm": 0.834602952003479, + "learning_rate": 9.419945224252955e-05, + "loss": 0.1007, + "step": 35480 + }, + { + "epoch": 2.321884200196271, + "grad_norm": 0.8806980848312378, + "learning_rate": 9.419515699813952e-05, + "loss": 0.0795, + "step": 35490 + }, + { + "epoch": 2.3225384363755315, + "grad_norm": 1.0480656623840332, + "learning_rate": 9.419086026204703e-05, + "loss": 0.0898, + "step": 35500 + }, + { + "epoch": 2.3231926725547924, + "grad_norm": 0.8428975343704224, + "learning_rate": 9.41865620343971e-05, + "loss": 0.0792, + "step": 35510 + }, + { + "epoch": 2.323846908734053, + "grad_norm": 0.9427582621574402, + "learning_rate": 9.418226231533482e-05, + "loss": 0.0796, + "step": 35520 + }, + { + "epoch": 2.324501144913314, + "grad_norm": 0.7207076549530029, + "learning_rate": 9.417796110500532e-05, + "loss": 0.0826, + "step": 35530 + }, + { + "epoch": 2.3251553810925745, + "grad_norm": 0.9341986179351807, + "learning_rate": 9.417365840355377e-05, + "loss": 0.0965, + "step": 35540 + }, + { + "epoch": 2.325809617271835, + "grad_norm": 0.8259122371673584, + "learning_rate": 9.416935421112541e-05, + "loss": 0.0909, + "step": 35550 + }, + { + "epoch": 2.326463853451096, + "grad_norm": 0.8562160134315491, + "learning_rate": 9.41650485278655e-05, + "loss": 0.0918, + "step": 35560 + }, + { + "epoch": 2.3271180896303565, + "grad_norm": 0.9982649683952332, + "learning_rate": 9.416074135391937e-05, + "loss": 0.0832, + "step": 35570 + }, + { + "epoch": 2.3277723258096175, + "grad_norm": 0.9082480072975159, + "learning_rate": 9.415643268943239e-05, + "loss": 0.0969, + "step": 35580 + }, + { + "epoch": 2.328426561988878, + "grad_norm": 0.8344404101371765, + "learning_rate": 9.415212253455004e-05, + "loss": 0.0815, + "step": 35590 + }, + { + "epoch": 2.3290807981681385, + "grad_norm": 0.8839426040649414, + "learning_rate": 9.414781088941772e-05, + "loss": 0.0899, + "step": 35600 + }, + { + "epoch": 2.3297350343473995, + "grad_norm": 0.7677926421165466, + "learning_rate": 9.414349775418104e-05, + "loss": 0.0774, + "step": 35610 + }, + { + "epoch": 2.33038927052666, + "grad_norm": 1.086987853050232, + "learning_rate": 9.413918312898551e-05, + "loss": 0.0909, + "step": 35620 + }, + { + "epoch": 2.331043506705921, + "grad_norm": 0.8547496795654297, + "learning_rate": 9.41348670139768e-05, + "loss": 0.0781, + "step": 35630 + }, + { + "epoch": 2.3316977428851815, + "grad_norm": 0.8789479732513428, + "learning_rate": 9.413054940930057e-05, + "loss": 0.0926, + "step": 35640 + }, + { + "epoch": 2.332351979064442, + "grad_norm": 0.8583962917327881, + "learning_rate": 9.412623031510257e-05, + "loss": 0.0854, + "step": 35650 + }, + { + "epoch": 2.333006215243703, + "grad_norm": 0.8400371670722961, + "learning_rate": 9.412190973152858e-05, + "loss": 0.0908, + "step": 35660 + }, + { + "epoch": 2.3336604514229635, + "grad_norm": 0.8149643540382385, + "learning_rate": 9.411758765872441e-05, + "loss": 0.0837, + "step": 35670 + }, + { + "epoch": 2.3343146876022245, + "grad_norm": 0.7172325849533081, + "learning_rate": 9.411326409683596e-05, + "loss": 0.0828, + "step": 35680 + }, + { + "epoch": 2.334968923781485, + "grad_norm": 0.9276403188705444, + "learning_rate": 9.410893904600917e-05, + "loss": 0.083, + "step": 35690 + }, + { + "epoch": 2.335623159960746, + "grad_norm": 0.7993019223213196, + "learning_rate": 9.410461250638997e-05, + "loss": 0.0831, + "step": 35700 + }, + { + "epoch": 2.3362773961400065, + "grad_norm": 0.7336340546607971, + "learning_rate": 9.410028447812447e-05, + "loss": 0.0866, + "step": 35710 + }, + { + "epoch": 2.336931632319267, + "grad_norm": 0.9019221067428589, + "learning_rate": 9.409595496135869e-05, + "loss": 0.0892, + "step": 35720 + }, + { + "epoch": 2.337585868498528, + "grad_norm": 0.8154904842376709, + "learning_rate": 9.409162395623879e-05, + "loss": 0.0865, + "step": 35730 + }, + { + "epoch": 2.3382401046777885, + "grad_norm": 0.8226253986358643, + "learning_rate": 9.408729146291093e-05, + "loss": 0.0901, + "step": 35740 + }, + { + "epoch": 2.3388943408570495, + "grad_norm": 0.7311269640922546, + "learning_rate": 9.408295748152138e-05, + "loss": 0.0889, + "step": 35750 + }, + { + "epoch": 2.33954857703631, + "grad_norm": 0.8451585173606873, + "learning_rate": 9.40786220122164e-05, + "loss": 0.0848, + "step": 35760 + }, + { + "epoch": 2.340202813215571, + "grad_norm": 1.0941405296325684, + "learning_rate": 9.407428505514233e-05, + "loss": 0.0871, + "step": 35770 + }, + { + "epoch": 2.3408570493948315, + "grad_norm": 0.9233847260475159, + "learning_rate": 9.406994661044554e-05, + "loss": 0.0976, + "step": 35780 + }, + { + "epoch": 2.341511285574092, + "grad_norm": 0.8936124444007874, + "learning_rate": 9.406560667827248e-05, + "loss": 0.0839, + "step": 35790 + }, + { + "epoch": 2.342165521753353, + "grad_norm": 0.9002397060394287, + "learning_rate": 9.406126525876963e-05, + "loss": 0.0781, + "step": 35800 + }, + { + "epoch": 2.3428197579326135, + "grad_norm": 0.7002133131027222, + "learning_rate": 9.405692235208353e-05, + "loss": 0.0845, + "step": 35810 + }, + { + "epoch": 2.3434739941118745, + "grad_norm": 0.8459717035293579, + "learning_rate": 9.405257795836074e-05, + "loss": 0.0769, + "step": 35820 + }, + { + "epoch": 2.344128230291135, + "grad_norm": 1.1364301443099976, + "learning_rate": 9.404823207774791e-05, + "loss": 0.0878, + "step": 35830 + }, + { + "epoch": 2.344782466470396, + "grad_norm": 0.8352745771408081, + "learning_rate": 9.404388471039173e-05, + "loss": 0.0828, + "step": 35840 + }, + { + "epoch": 2.3454367026496565, + "grad_norm": 0.8302096128463745, + "learning_rate": 9.403953585643895e-05, + "loss": 0.0885, + "step": 35850 + }, + { + "epoch": 2.346090938828917, + "grad_norm": 0.7666718363761902, + "learning_rate": 9.403518551603632e-05, + "loss": 0.0813, + "step": 35860 + }, + { + "epoch": 2.346745175008178, + "grad_norm": 0.9522207975387573, + "learning_rate": 9.40308336893307e-05, + "loss": 0.0909, + "step": 35870 + }, + { + "epoch": 2.3473994111874386, + "grad_norm": 0.7051165699958801, + "learning_rate": 9.402648037646895e-05, + "loss": 0.0871, + "step": 35880 + }, + { + "epoch": 2.3480536473666995, + "grad_norm": 0.7525814771652222, + "learning_rate": 9.402212557759805e-05, + "loss": 0.09, + "step": 35890 + }, + { + "epoch": 2.34870788354596, + "grad_norm": 1.1107085943222046, + "learning_rate": 9.401776929286494e-05, + "loss": 0.083, + "step": 35900 + }, + { + "epoch": 2.349362119725221, + "grad_norm": 0.7754766941070557, + "learning_rate": 9.401341152241668e-05, + "loss": 0.0819, + "step": 35910 + }, + { + "epoch": 2.3500163559044815, + "grad_norm": 0.9919909834861755, + "learning_rate": 9.400905226640036e-05, + "loss": 0.0859, + "step": 35920 + }, + { + "epoch": 2.350670592083742, + "grad_norm": 0.7626148462295532, + "learning_rate": 9.40046915249631e-05, + "loss": 0.0889, + "step": 35930 + }, + { + "epoch": 2.351324828263003, + "grad_norm": 0.8603907823562622, + "learning_rate": 9.40003292982521e-05, + "loss": 0.0847, + "step": 35940 + }, + { + "epoch": 2.3519790644422636, + "grad_norm": 0.8849925994873047, + "learning_rate": 9.399596558641459e-05, + "loss": 0.0782, + "step": 35950 + }, + { + "epoch": 2.3526333006215245, + "grad_norm": 0.9044029712677002, + "learning_rate": 9.399160038959785e-05, + "loss": 0.0841, + "step": 35960 + }, + { + "epoch": 2.353287536800785, + "grad_norm": 0.7774613499641418, + "learning_rate": 9.398723370794923e-05, + "loss": 0.0725, + "step": 35970 + }, + { + "epoch": 2.353941772980046, + "grad_norm": 0.7712434530258179, + "learning_rate": 9.398286554161612e-05, + "loss": 0.0855, + "step": 35980 + }, + { + "epoch": 2.3545960091593066, + "grad_norm": 0.8731211423873901, + "learning_rate": 9.397849589074593e-05, + "loss": 0.0941, + "step": 35990 + }, + { + "epoch": 2.355250245338567, + "grad_norm": 0.8799338340759277, + "learning_rate": 9.397412475548618e-05, + "loss": 0.0835, + "step": 36000 + }, + { + "epoch": 2.355904481517828, + "grad_norm": 0.8053388595581055, + "learning_rate": 9.396975213598439e-05, + "loss": 0.0872, + "step": 36010 + }, + { + "epoch": 2.3565587176970886, + "grad_norm": 0.8215650320053101, + "learning_rate": 9.396537803238815e-05, + "loss": 0.0894, + "step": 36020 + }, + { + "epoch": 2.3572129538763495, + "grad_norm": 0.7709128856658936, + "learning_rate": 9.39610024448451e-05, + "loss": 0.0852, + "step": 36030 + }, + { + "epoch": 2.35786719005561, + "grad_norm": 0.8839540481567383, + "learning_rate": 9.395662537350292e-05, + "loss": 0.0874, + "step": 36040 + }, + { + "epoch": 2.3585214262348706, + "grad_norm": 0.8985713124275208, + "learning_rate": 9.395224681850935e-05, + "loss": 0.0774, + "step": 36050 + }, + { + "epoch": 2.3591756624141316, + "grad_norm": 0.7541700601577759, + "learning_rate": 9.39478667800122e-05, + "loss": 0.0782, + "step": 36060 + }, + { + "epoch": 2.359829898593392, + "grad_norm": 0.6992083191871643, + "learning_rate": 9.394348525815928e-05, + "loss": 0.0821, + "step": 36070 + }, + { + "epoch": 2.360484134772653, + "grad_norm": 0.9433765411376953, + "learning_rate": 9.393910225309848e-05, + "loss": 0.0825, + "step": 36080 + }, + { + "epoch": 2.3611383709519136, + "grad_norm": 0.892088770866394, + "learning_rate": 9.393471776497776e-05, + "loss": 0.083, + "step": 36090 + }, + { + "epoch": 2.361792607131174, + "grad_norm": 0.7801177501678467, + "learning_rate": 9.393033179394506e-05, + "loss": 0.0998, + "step": 36100 + }, + { + "epoch": 2.362446843310435, + "grad_norm": 0.8416787981987, + "learning_rate": 9.392594434014847e-05, + "loss": 0.0978, + "step": 36110 + }, + { + "epoch": 2.3631010794896956, + "grad_norm": 0.8340370059013367, + "learning_rate": 9.392155540373606e-05, + "loss": 0.0899, + "step": 36120 + }, + { + "epoch": 2.3637553156689566, + "grad_norm": 0.8223138451576233, + "learning_rate": 9.391716498485597e-05, + "loss": 0.0878, + "step": 36130 + }, + { + "epoch": 2.364409551848217, + "grad_norm": 1.106229305267334, + "learning_rate": 9.391277308365638e-05, + "loss": 0.0879, + "step": 36140 + }, + { + "epoch": 2.365063788027478, + "grad_norm": 0.8265957832336426, + "learning_rate": 9.390837970028553e-05, + "loss": 0.0796, + "step": 36150 + }, + { + "epoch": 2.3657180242067386, + "grad_norm": 0.8237302303314209, + "learning_rate": 9.390398483489171e-05, + "loss": 0.0843, + "step": 36160 + }, + { + "epoch": 2.366372260385999, + "grad_norm": 0.819557249546051, + "learning_rate": 9.389958848762327e-05, + "loss": 0.0811, + "step": 36170 + }, + { + "epoch": 2.36702649656526, + "grad_norm": 0.9804088473320007, + "learning_rate": 9.389519065862858e-05, + "loss": 0.0887, + "step": 36180 + }, + { + "epoch": 2.3676807327445206, + "grad_norm": 1.014117956161499, + "learning_rate": 9.389079134805609e-05, + "loss": 0.0732, + "step": 36190 + }, + { + "epoch": 2.3683349689237816, + "grad_norm": 0.8419691920280457, + "learning_rate": 9.388639055605428e-05, + "loss": 0.0909, + "step": 36200 + }, + { + "epoch": 2.368989205103042, + "grad_norm": 0.9921837449073792, + "learning_rate": 9.388198828277169e-05, + "loss": 0.0783, + "step": 36210 + }, + { + "epoch": 2.369643441282303, + "grad_norm": 0.9146479964256287, + "learning_rate": 9.387758452835692e-05, + "loss": 0.078, + "step": 36220 + }, + { + "epoch": 2.3702976774615636, + "grad_norm": 0.7768714427947998, + "learning_rate": 9.387317929295859e-05, + "loss": 0.084, + "step": 36230 + }, + { + "epoch": 2.370951913640824, + "grad_norm": 0.8944301009178162, + "learning_rate": 9.38687725767254e-05, + "loss": 0.0873, + "step": 36240 + }, + { + "epoch": 2.371606149820085, + "grad_norm": 0.7859957814216614, + "learning_rate": 9.38643643798061e-05, + "loss": 0.0849, + "step": 36250 + }, + { + "epoch": 2.3722603859993456, + "grad_norm": 0.8967961668968201, + "learning_rate": 9.385995470234944e-05, + "loss": 0.075, + "step": 36260 + }, + { + "epoch": 2.3729146221786066, + "grad_norm": 0.7501564621925354, + "learning_rate": 9.38555435445043e-05, + "loss": 0.093, + "step": 36270 + }, + { + "epoch": 2.373568858357867, + "grad_norm": 0.7810158729553223, + "learning_rate": 9.385113090641953e-05, + "loss": 0.0823, + "step": 36280 + }, + { + "epoch": 2.374223094537128, + "grad_norm": 0.8977556228637695, + "learning_rate": 9.38467167882441e-05, + "loss": 0.0823, + "step": 36290 + }, + { + "epoch": 2.3748773307163886, + "grad_norm": 0.9205357432365417, + "learning_rate": 9.384230119012698e-05, + "loss": 0.0869, + "step": 36300 + }, + { + "epoch": 2.375531566895649, + "grad_norm": 0.7959444522857666, + "learning_rate": 9.383788411221724e-05, + "loss": 0.0865, + "step": 36310 + }, + { + "epoch": 2.37618580307491, + "grad_norm": 0.7368982434272766, + "learning_rate": 9.383346555466392e-05, + "loss": 0.0879, + "step": 36320 + }, + { + "epoch": 2.3768400392541706, + "grad_norm": 1.0566291809082031, + "learning_rate": 9.382904551761618e-05, + "loss": 0.0976, + "step": 36330 + }, + { + "epoch": 2.3774942754334316, + "grad_norm": 0.7619825601577759, + "learning_rate": 9.38246240012232e-05, + "loss": 0.0847, + "step": 36340 + }, + { + "epoch": 2.378148511612692, + "grad_norm": 0.7798367738723755, + "learning_rate": 9.382020100563425e-05, + "loss": 0.0897, + "step": 36350 + }, + { + "epoch": 2.378802747791953, + "grad_norm": 0.8497381210327148, + "learning_rate": 9.381577653099858e-05, + "loss": 0.0823, + "step": 36360 + }, + { + "epoch": 2.3794569839712136, + "grad_norm": 0.8219572901725769, + "learning_rate": 9.381135057746552e-05, + "loss": 0.0889, + "step": 36370 + }, + { + "epoch": 2.380111220150474, + "grad_norm": 0.8166351914405823, + "learning_rate": 9.380692314518451e-05, + "loss": 0.0907, + "step": 36380 + }, + { + "epoch": 2.380765456329735, + "grad_norm": 1.0758252143859863, + "learning_rate": 9.380249423430494e-05, + "loss": 0.096, + "step": 36390 + }, + { + "epoch": 2.3814196925089957, + "grad_norm": 0.8256650567054749, + "learning_rate": 9.379806384497633e-05, + "loss": 0.0718, + "step": 36400 + }, + { + "epoch": 2.3820739286882566, + "grad_norm": 0.8822137117385864, + "learning_rate": 9.379363197734818e-05, + "loss": 0.095, + "step": 36410 + }, + { + "epoch": 2.382728164867517, + "grad_norm": 0.9527086615562439, + "learning_rate": 9.37891986315701e-05, + "loss": 0.0912, + "step": 36420 + }, + { + "epoch": 2.383382401046778, + "grad_norm": 0.8427805304527283, + "learning_rate": 9.378476380779174e-05, + "loss": 0.0898, + "step": 36430 + }, + { + "epoch": 2.3840366372260386, + "grad_norm": 0.8125025629997253, + "learning_rate": 9.378032750616277e-05, + "loss": 0.0719, + "step": 36440 + }, + { + "epoch": 2.384690873405299, + "grad_norm": 0.7970302700996399, + "learning_rate": 9.377588972683292e-05, + "loss": 0.0855, + "step": 36450 + }, + { + "epoch": 2.38534510958456, + "grad_norm": 0.8656693696975708, + "learning_rate": 9.377145046995198e-05, + "loss": 0.0804, + "step": 36460 + }, + { + "epoch": 2.3859993457638207, + "grad_norm": 0.7493268251419067, + "learning_rate": 9.37670097356698e-05, + "loss": 0.0845, + "step": 36470 + }, + { + "epoch": 2.3866535819430816, + "grad_norm": 0.8390613794326782, + "learning_rate": 9.376256752413626e-05, + "loss": 0.0772, + "step": 36480 + }, + { + "epoch": 2.387307818122342, + "grad_norm": 0.8210413455963135, + "learning_rate": 9.37581238355013e-05, + "loss": 0.0785, + "step": 36490 + }, + { + "epoch": 2.3879620543016027, + "grad_norm": 0.8246368765830994, + "learning_rate": 9.375367866991488e-05, + "loss": 0.0812, + "step": 36500 + }, + { + "epoch": 2.3886162904808637, + "grad_norm": 0.7549490332603455, + "learning_rate": 9.374923202752707e-05, + "loss": 0.0878, + "step": 36510 + }, + { + "epoch": 2.389270526660124, + "grad_norm": 0.8099923729896545, + "learning_rate": 9.374478390848794e-05, + "loss": 0.087, + "step": 36520 + }, + { + "epoch": 2.389924762839385, + "grad_norm": 0.8852878212928772, + "learning_rate": 9.374033431294763e-05, + "loss": 0.0897, + "step": 36530 + }, + { + "epoch": 2.3905789990186457, + "grad_norm": 0.9920669198036194, + "learning_rate": 9.373588324105634e-05, + "loss": 0.0871, + "step": 36540 + }, + { + "epoch": 2.391233235197906, + "grad_norm": 0.8708438873291016, + "learning_rate": 9.373143069296426e-05, + "loss": 0.0796, + "step": 36550 + }, + { + "epoch": 2.391887471377167, + "grad_norm": 0.751469612121582, + "learning_rate": 9.372697666882171e-05, + "loss": 0.0851, + "step": 36560 + }, + { + "epoch": 2.3925417075564277, + "grad_norm": 0.8722557425498962, + "learning_rate": 9.372252116877903e-05, + "loss": 0.0951, + "step": 36570 + }, + { + "epoch": 2.3931959437356887, + "grad_norm": 0.844551146030426, + "learning_rate": 9.371806419298659e-05, + "loss": 0.08, + "step": 36580 + }, + { + "epoch": 2.393850179914949, + "grad_norm": 0.8585530519485474, + "learning_rate": 9.371360574159483e-05, + "loss": 0.086, + "step": 36590 + }, + { + "epoch": 2.39450441609421, + "grad_norm": 0.9330928921699524, + "learning_rate": 9.370914581475423e-05, + "loss": 0.0793, + "step": 36600 + }, + { + "epoch": 2.3951586522734707, + "grad_norm": 1.0671617984771729, + "learning_rate": 9.370468441261532e-05, + "loss": 0.0737, + "step": 36610 + }, + { + "epoch": 2.395812888452731, + "grad_norm": 0.8291199803352356, + "learning_rate": 9.370022153532871e-05, + "loss": 0.0971, + "step": 36620 + }, + { + "epoch": 2.396467124631992, + "grad_norm": 1.0896260738372803, + "learning_rate": 9.3695757183045e-05, + "loss": 0.0821, + "step": 36630 + }, + { + "epoch": 2.3971213608112527, + "grad_norm": 0.7931303977966309, + "learning_rate": 9.369129135591491e-05, + "loss": 0.0857, + "step": 36640 + }, + { + "epoch": 2.3977755969905137, + "grad_norm": 0.9220706224441528, + "learning_rate": 9.368682405408912e-05, + "loss": 0.0844, + "step": 36650 + }, + { + "epoch": 2.398429833169774, + "grad_norm": 0.9693981409072876, + "learning_rate": 9.368235527771847e-05, + "loss": 0.091, + "step": 36660 + }, + { + "epoch": 2.399084069349035, + "grad_norm": 0.8742493987083435, + "learning_rate": 9.367788502695376e-05, + "loss": 0.0868, + "step": 36670 + }, + { + "epoch": 2.3997383055282957, + "grad_norm": 0.8460095524787903, + "learning_rate": 9.367341330194587e-05, + "loss": 0.0846, + "step": 36680 + }, + { + "epoch": 2.4003925417075562, + "grad_norm": 0.883783757686615, + "learning_rate": 9.366894010284576e-05, + "loss": 0.0819, + "step": 36690 + }, + { + "epoch": 2.401046777886817, + "grad_norm": 0.8907153606414795, + "learning_rate": 9.366446542980439e-05, + "loss": 0.0831, + "step": 36700 + }, + { + "epoch": 2.4017010140660777, + "grad_norm": 0.8098263144493103, + "learning_rate": 9.36599892829728e-05, + "loss": 0.0865, + "step": 36710 + }, + { + "epoch": 2.4023552502453387, + "grad_norm": 0.9012731909751892, + "learning_rate": 9.365551166250206e-05, + "loss": 0.0835, + "step": 36720 + }, + { + "epoch": 2.403009486424599, + "grad_norm": 0.817130446434021, + "learning_rate": 9.365103256854332e-05, + "loss": 0.0877, + "step": 36730 + }, + { + "epoch": 2.40366372260386, + "grad_norm": 1.0334715843200684, + "learning_rate": 9.364655200124775e-05, + "loss": 0.0905, + "step": 36740 + }, + { + "epoch": 2.4043179587831207, + "grad_norm": 0.6475959420204163, + "learning_rate": 9.364206996076659e-05, + "loss": 0.0826, + "step": 36750 + }, + { + "epoch": 2.4049721949623812, + "grad_norm": 1.0449169874191284, + "learning_rate": 9.36375864472511e-05, + "loss": 0.0842, + "step": 36760 + }, + { + "epoch": 2.405626431141642, + "grad_norm": 0.8349801898002625, + "learning_rate": 9.363310146085262e-05, + "loss": 0.089, + "step": 36770 + }, + { + "epoch": 2.4062806673209027, + "grad_norm": 0.9147889614105225, + "learning_rate": 9.362861500172255e-05, + "loss": 0.082, + "step": 36780 + }, + { + "epoch": 2.4069349035001637, + "grad_norm": 0.7441397905349731, + "learning_rate": 9.362412707001229e-05, + "loss": 0.0753, + "step": 36790 + }, + { + "epoch": 2.4075891396794242, + "grad_norm": 1.0056695938110352, + "learning_rate": 9.361963766587334e-05, + "loss": 0.08, + "step": 36800 + }, + { + "epoch": 2.408243375858685, + "grad_norm": 0.7822602987289429, + "learning_rate": 9.361514678945722e-05, + "loss": 0.0866, + "step": 36810 + }, + { + "epoch": 2.4088976120379457, + "grad_norm": 0.8155549764633179, + "learning_rate": 9.36106544409155e-05, + "loss": 0.0787, + "step": 36820 + }, + { + "epoch": 2.4095518482172062, + "grad_norm": 0.7978348135948181, + "learning_rate": 9.360616062039985e-05, + "loss": 0.0847, + "step": 36830 + }, + { + "epoch": 2.410206084396467, + "grad_norm": 0.867654025554657, + "learning_rate": 9.360166532806189e-05, + "loss": 0.0844, + "step": 36840 + }, + { + "epoch": 2.4108603205757277, + "grad_norm": 0.975135087966919, + "learning_rate": 9.359716856405339e-05, + "loss": 0.0915, + "step": 36850 + }, + { + "epoch": 2.4115145567549887, + "grad_norm": 0.7564871907234192, + "learning_rate": 9.359267032852609e-05, + "loss": 0.0968, + "step": 36860 + }, + { + "epoch": 2.4121687929342492, + "grad_norm": 0.8742430210113525, + "learning_rate": 9.358817062163188e-05, + "loss": 0.0793, + "step": 36870 + }, + { + "epoch": 2.41282302911351, + "grad_norm": 0.8599095344543457, + "learning_rate": 9.358366944352258e-05, + "loss": 0.0825, + "step": 36880 + }, + { + "epoch": 2.4134772652927707, + "grad_norm": 0.6971985101699829, + "learning_rate": 9.357916679435012e-05, + "loss": 0.0813, + "step": 36890 + }, + { + "epoch": 2.4141315014720313, + "grad_norm": 0.9535905122756958, + "learning_rate": 9.357466267426649e-05, + "loss": 0.0847, + "step": 36900 + }, + { + "epoch": 2.4147857376512922, + "grad_norm": 0.9580764174461365, + "learning_rate": 9.357015708342373e-05, + "loss": 0.0974, + "step": 36910 + }, + { + "epoch": 2.4154399738305528, + "grad_norm": 0.9554745554924011, + "learning_rate": 9.35656500219739e-05, + "loss": 0.08, + "step": 36920 + }, + { + "epoch": 2.4160942100098137, + "grad_norm": 0.7604560852050781, + "learning_rate": 9.356114149006911e-05, + "loss": 0.0854, + "step": 36930 + }, + { + "epoch": 2.4167484461890743, + "grad_norm": 0.876243531703949, + "learning_rate": 9.355663148786158e-05, + "loss": 0.0835, + "step": 36940 + }, + { + "epoch": 2.4174026823683348, + "grad_norm": 0.9993807673454285, + "learning_rate": 9.355212001550349e-05, + "loss": 0.087, + "step": 36950 + }, + { + "epoch": 2.4180569185475957, + "grad_norm": 0.934245228767395, + "learning_rate": 9.354760707314713e-05, + "loss": 0.0764, + "step": 36960 + }, + { + "epoch": 2.4187111547268563, + "grad_norm": 0.8030011057853699, + "learning_rate": 9.354309266094482e-05, + "loss": 0.0891, + "step": 36970 + }, + { + "epoch": 2.4193653909061172, + "grad_norm": 0.7659550309181213, + "learning_rate": 9.353857677904893e-05, + "loss": 0.0852, + "step": 36980 + }, + { + "epoch": 2.4200196270853778, + "grad_norm": 0.8483705520629883, + "learning_rate": 9.353405942761191e-05, + "loss": 0.0777, + "step": 36990 + }, + { + "epoch": 2.4206738632646383, + "grad_norm": 0.7422915101051331, + "learning_rate": 9.35295406067862e-05, + "loss": 0.0827, + "step": 37000 + }, + { + "epoch": 2.4213280994438993, + "grad_norm": 0.8917139172554016, + "learning_rate": 9.352502031672435e-05, + "loss": 0.0917, + "step": 37010 + }, + { + "epoch": 2.42198233562316, + "grad_norm": 0.973841667175293, + "learning_rate": 9.35204985575789e-05, + "loss": 0.0856, + "step": 37020 + }, + { + "epoch": 2.4226365718024208, + "grad_norm": 0.8658559918403625, + "learning_rate": 9.351597532950247e-05, + "loss": 0.0911, + "step": 37030 + }, + { + "epoch": 2.4232908079816813, + "grad_norm": 1.0100637674331665, + "learning_rate": 9.351145063264778e-05, + "loss": 0.0832, + "step": 37040 + }, + { + "epoch": 2.4239450441609423, + "grad_norm": 0.8611866235733032, + "learning_rate": 9.35069244671675e-05, + "loss": 0.0859, + "step": 37050 + }, + { + "epoch": 2.424599280340203, + "grad_norm": 0.8808668255805969, + "learning_rate": 9.350239683321443e-05, + "loss": 0.0876, + "step": 37060 + }, + { + "epoch": 2.4252535165194633, + "grad_norm": 0.767738938331604, + "learning_rate": 9.349786773094137e-05, + "loss": 0.0824, + "step": 37070 + }, + { + "epoch": 2.4259077526987243, + "grad_norm": 0.8980047106742859, + "learning_rate": 9.34933371605012e-05, + "loss": 0.0844, + "step": 37080 + }, + { + "epoch": 2.426561988877985, + "grad_norm": 0.6932862401008606, + "learning_rate": 9.348880512204683e-05, + "loss": 0.0765, + "step": 37090 + }, + { + "epoch": 2.4272162250572458, + "grad_norm": 0.8698898553848267, + "learning_rate": 9.348427161573124e-05, + "loss": 0.0769, + "step": 37100 + }, + { + "epoch": 2.4278704612365063, + "grad_norm": 0.8262230157852173, + "learning_rate": 9.347973664170744e-05, + "loss": 0.0876, + "step": 37110 + }, + { + "epoch": 2.4285246974157673, + "grad_norm": 0.690406322479248, + "learning_rate": 9.347520020012848e-05, + "loss": 0.086, + "step": 37120 + }, + { + "epoch": 2.429178933595028, + "grad_norm": 0.7355995178222656, + "learning_rate": 9.347066229114751e-05, + "loss": 0.0938, + "step": 37130 + }, + { + "epoch": 2.4298331697742883, + "grad_norm": 0.8815680742263794, + "learning_rate": 9.34661229149177e-05, + "loss": 0.0923, + "step": 37140 + }, + { + "epoch": 2.4304874059535493, + "grad_norm": 0.8908131122589111, + "learning_rate": 9.346158207159222e-05, + "loss": 0.0841, + "step": 37150 + }, + { + "epoch": 2.43114164213281, + "grad_norm": 0.7907306551933289, + "learning_rate": 9.345703976132438e-05, + "loss": 0.0847, + "step": 37160 + }, + { + "epoch": 2.431795878312071, + "grad_norm": 0.9056040048599243, + "learning_rate": 9.345249598426746e-05, + "loss": 0.0879, + "step": 37170 + }, + { + "epoch": 2.4324501144913313, + "grad_norm": 0.7010656595230103, + "learning_rate": 9.344795074057487e-05, + "loss": 0.0732, + "step": 37180 + }, + { + "epoch": 2.4331043506705923, + "grad_norm": 0.7615019083023071, + "learning_rate": 9.344340403039998e-05, + "loss": 0.0846, + "step": 37190 + }, + { + "epoch": 2.433758586849853, + "grad_norm": 0.9144221544265747, + "learning_rate": 9.343885585389627e-05, + "loss": 0.0982, + "step": 37200 + }, + { + "epoch": 2.4344128230291133, + "grad_norm": 1.0988235473632812, + "learning_rate": 9.343430621121724e-05, + "loss": 0.0877, + "step": 37210 + }, + { + "epoch": 2.4350670592083743, + "grad_norm": 0.8255583643913269, + "learning_rate": 9.342975510251649e-05, + "loss": 0.0832, + "step": 37220 + }, + { + "epoch": 2.435721295387635, + "grad_norm": 0.7476474642753601, + "learning_rate": 9.342520252794759e-05, + "loss": 0.0843, + "step": 37230 + }, + { + "epoch": 2.436375531566896, + "grad_norm": 0.7885528802871704, + "learning_rate": 9.342064848766423e-05, + "loss": 0.0896, + "step": 37240 + }, + { + "epoch": 2.4370297677461563, + "grad_norm": 0.8484315872192383, + "learning_rate": 9.341609298182008e-05, + "loss": 0.0814, + "step": 37250 + }, + { + "epoch": 2.4376840039254173, + "grad_norm": 1.143760085105896, + "learning_rate": 9.341153601056896e-05, + "loss": 0.0843, + "step": 37260 + }, + { + "epoch": 2.438338240104678, + "grad_norm": 0.7596016526222229, + "learning_rate": 9.340697757406462e-05, + "loss": 0.0813, + "step": 37270 + }, + { + "epoch": 2.4389924762839383, + "grad_norm": 0.9794483184814453, + "learning_rate": 9.340241767246099e-05, + "loss": 0.0755, + "step": 37280 + }, + { + "epoch": 2.4396467124631993, + "grad_norm": 0.7860924005508423, + "learning_rate": 9.33978563059119e-05, + "loss": 0.0782, + "step": 37290 + }, + { + "epoch": 2.44030094864246, + "grad_norm": 0.8889253735542297, + "learning_rate": 9.339329347457135e-05, + "loss": 0.0896, + "step": 37300 + }, + { + "epoch": 2.440955184821721, + "grad_norm": 0.6699261665344238, + "learning_rate": 9.338872917859335e-05, + "loss": 0.0847, + "step": 37310 + }, + { + "epoch": 2.4416094210009813, + "grad_norm": 0.8872549533843994, + "learning_rate": 9.338416341813196e-05, + "loss": 0.081, + "step": 37320 + }, + { + "epoch": 2.4422636571802423, + "grad_norm": 0.8817291259765625, + "learning_rate": 9.337959619334125e-05, + "loss": 0.0812, + "step": 37330 + }, + { + "epoch": 2.442917893359503, + "grad_norm": 0.9922673106193542, + "learning_rate": 9.337502750437542e-05, + "loss": 0.0871, + "step": 37340 + }, + { + "epoch": 2.4435721295387633, + "grad_norm": 0.7725539803504944, + "learning_rate": 9.337045735138865e-05, + "loss": 0.0787, + "step": 37350 + }, + { + "epoch": 2.4442263657180243, + "grad_norm": 0.9328073859214783, + "learning_rate": 9.336588573453521e-05, + "loss": 0.0799, + "step": 37360 + }, + { + "epoch": 2.444880601897285, + "grad_norm": 0.7553234696388245, + "learning_rate": 9.33613126539694e-05, + "loss": 0.0799, + "step": 37370 + }, + { + "epoch": 2.445534838076546, + "grad_norm": 0.8015692830085754, + "learning_rate": 9.335673810984553e-05, + "loss": 0.0894, + "step": 37380 + }, + { + "epoch": 2.4461890742558063, + "grad_norm": 0.7665364146232605, + "learning_rate": 9.335216210231807e-05, + "loss": 0.0815, + "step": 37390 + }, + { + "epoch": 2.446843310435067, + "grad_norm": 0.7971277236938477, + "learning_rate": 9.334758463154145e-05, + "loss": 0.0827, + "step": 37400 + }, + { + "epoch": 2.447497546614328, + "grad_norm": 0.7936835885047913, + "learning_rate": 9.334300569767016e-05, + "loss": 0.0867, + "step": 37410 + }, + { + "epoch": 2.4481517827935884, + "grad_norm": 0.9368561506271362, + "learning_rate": 9.333842530085875e-05, + "loss": 0.0768, + "step": 37420 + }, + { + "epoch": 2.4488060189728493, + "grad_norm": 0.947975218296051, + "learning_rate": 9.333384344126184e-05, + "loss": 0.0807, + "step": 37430 + }, + { + "epoch": 2.44946025515211, + "grad_norm": 0.7947817444801331, + "learning_rate": 9.332926011903405e-05, + "loss": 0.0909, + "step": 37440 + }, + { + "epoch": 2.4501144913313704, + "grad_norm": 0.8594073057174683, + "learning_rate": 9.33246753343301e-05, + "loss": 0.082, + "step": 37450 + }, + { + "epoch": 2.4507687275106314, + "grad_norm": 1.020121693611145, + "learning_rate": 9.332008908730473e-05, + "loss": 0.078, + "step": 37460 + }, + { + "epoch": 2.451422963689892, + "grad_norm": 0.7262875437736511, + "learning_rate": 9.331550137811276e-05, + "loss": 0.0789, + "step": 37470 + }, + { + "epoch": 2.452077199869153, + "grad_norm": 0.7914772033691406, + "learning_rate": 9.331091220690902e-05, + "loss": 0.083, + "step": 37480 + }, + { + "epoch": 2.4527314360484134, + "grad_norm": 0.8740646243095398, + "learning_rate": 9.330632157384838e-05, + "loss": 0.0757, + "step": 37490 + }, + { + "epoch": 2.4533856722276743, + "grad_norm": 0.9303426742553711, + "learning_rate": 9.330172947908583e-05, + "loss": 0.0835, + "step": 37500 + }, + { + "epoch": 2.454039908406935, + "grad_norm": 0.7235417366027832, + "learning_rate": 9.329713592277634e-05, + "loss": 0.0836, + "step": 37510 + }, + { + "epoch": 2.4546941445861954, + "grad_norm": 0.7460285425186157, + "learning_rate": 9.329254090507498e-05, + "loss": 0.0807, + "step": 37520 + }, + { + "epoch": 2.4553483807654564, + "grad_norm": 0.6465203166007996, + "learning_rate": 9.32879444261368e-05, + "loss": 0.0784, + "step": 37530 + }, + { + "epoch": 2.456002616944717, + "grad_norm": 0.80832839012146, + "learning_rate": 9.328334648611699e-05, + "loss": 0.0777, + "step": 37540 + }, + { + "epoch": 2.456656853123978, + "grad_norm": 0.7820184230804443, + "learning_rate": 9.32787470851707e-05, + "loss": 0.0819, + "step": 37550 + }, + { + "epoch": 2.4573110893032384, + "grad_norm": 0.934441864490509, + "learning_rate": 9.32741462234532e-05, + "loss": 0.08, + "step": 37560 + }, + { + "epoch": 2.4579653254824994, + "grad_norm": 0.9643844962120056, + "learning_rate": 9.32695439011198e-05, + "loss": 0.0737, + "step": 37570 + }, + { + "epoch": 2.45861956166176, + "grad_norm": 1.1645561456680298, + "learning_rate": 9.326494011832578e-05, + "loss": 0.0802, + "step": 37580 + }, + { + "epoch": 2.4592737978410204, + "grad_norm": 0.8493544459342957, + "learning_rate": 9.326033487522659e-05, + "loss": 0.0881, + "step": 37590 + }, + { + "epoch": 2.4599280340202814, + "grad_norm": 0.7914831042289734, + "learning_rate": 9.325572817197763e-05, + "loss": 0.088, + "step": 37600 + }, + { + "epoch": 2.460582270199542, + "grad_norm": 0.7926651239395142, + "learning_rate": 9.325112000873439e-05, + "loss": 0.0873, + "step": 37610 + }, + { + "epoch": 2.461236506378803, + "grad_norm": 1.0331966876983643, + "learning_rate": 9.324651038565244e-05, + "loss": 0.0813, + "step": 37620 + }, + { + "epoch": 2.4618907425580634, + "grad_norm": 1.0331898927688599, + "learning_rate": 9.324189930288734e-05, + "loss": 0.0896, + "step": 37630 + }, + { + "epoch": 2.4625449787373244, + "grad_norm": 0.877951443195343, + "learning_rate": 9.323728676059474e-05, + "loss": 0.0802, + "step": 37640 + }, + { + "epoch": 2.463199214916585, + "grad_norm": 0.9235687255859375, + "learning_rate": 9.32326727589303e-05, + "loss": 0.0785, + "step": 37650 + }, + { + "epoch": 2.4638534510958454, + "grad_norm": 0.8438425660133362, + "learning_rate": 9.322805729804979e-05, + "loss": 0.0837, + "step": 37660 + }, + { + "epoch": 2.4645076872751064, + "grad_norm": 0.9116701483726501, + "learning_rate": 9.322344037810898e-05, + "loss": 0.0815, + "step": 37670 + }, + { + "epoch": 2.465161923454367, + "grad_norm": 0.7631514072418213, + "learning_rate": 9.321882199926369e-05, + "loss": 0.0812, + "step": 37680 + }, + { + "epoch": 2.465816159633628, + "grad_norm": 0.6126758456230164, + "learning_rate": 9.321420216166979e-05, + "loss": 0.0787, + "step": 37690 + }, + { + "epoch": 2.4664703958128884, + "grad_norm": 0.8149334192276001, + "learning_rate": 9.320958086548326e-05, + "loss": 0.0836, + "step": 37700 + }, + { + "epoch": 2.4671246319921494, + "grad_norm": 0.9959266781806946, + "learning_rate": 9.320495811086006e-05, + "loss": 0.0878, + "step": 37710 + }, + { + "epoch": 2.46777886817141, + "grad_norm": 0.7576195597648621, + "learning_rate": 9.320033389795619e-05, + "loss": 0.0739, + "step": 37720 + }, + { + "epoch": 2.4684331043506704, + "grad_norm": 0.8124213814735413, + "learning_rate": 9.319570822692778e-05, + "loss": 0.0902, + "step": 37730 + }, + { + "epoch": 2.4690873405299314, + "grad_norm": 0.8176689743995667, + "learning_rate": 9.319108109793091e-05, + "loss": 0.0825, + "step": 37740 + }, + { + "epoch": 2.469741576709192, + "grad_norm": 0.7739967703819275, + "learning_rate": 9.318645251112179e-05, + "loss": 0.0901, + "step": 37750 + }, + { + "epoch": 2.470395812888453, + "grad_norm": 0.7479947209358215, + "learning_rate": 9.318182246665663e-05, + "loss": 0.0813, + "step": 37760 + }, + { + "epoch": 2.4710500490677134, + "grad_norm": 0.8923253417015076, + "learning_rate": 9.317719096469172e-05, + "loss": 0.0796, + "step": 37770 + }, + { + "epoch": 2.4717042852469744, + "grad_norm": 0.9640159010887146, + "learning_rate": 9.317255800538339e-05, + "loss": 0.0797, + "step": 37780 + }, + { + "epoch": 2.472358521426235, + "grad_norm": 0.9136691689491272, + "learning_rate": 9.3167923588888e-05, + "loss": 0.0888, + "step": 37790 + }, + { + "epoch": 2.4730127576054954, + "grad_norm": 0.7730658650398254, + "learning_rate": 9.316328771536195e-05, + "loss": 0.0832, + "step": 37800 + }, + { + "epoch": 2.4736669937847564, + "grad_norm": 0.7457364201545715, + "learning_rate": 9.315865038496177e-05, + "loss": 0.0875, + "step": 37810 + }, + { + "epoch": 2.474321229964017, + "grad_norm": 0.7927015423774719, + "learning_rate": 9.315401159784394e-05, + "loss": 0.0857, + "step": 37820 + }, + { + "epoch": 2.474975466143278, + "grad_norm": 0.861659586429596, + "learning_rate": 9.314937135416506e-05, + "loss": 0.079, + "step": 37830 + }, + { + "epoch": 2.4756297023225384, + "grad_norm": 0.9620111584663391, + "learning_rate": 9.31447296540817e-05, + "loss": 0.0918, + "step": 37840 + }, + { + "epoch": 2.4762839385017994, + "grad_norm": 0.7795575857162476, + "learning_rate": 9.314008649775059e-05, + "loss": 0.0752, + "step": 37850 + }, + { + "epoch": 2.47693817468106, + "grad_norm": 0.9564193487167358, + "learning_rate": 9.313544188532841e-05, + "loss": 0.0756, + "step": 37860 + }, + { + "epoch": 2.4775924108603204, + "grad_norm": 0.7260618805885315, + "learning_rate": 9.313079581697194e-05, + "loss": 0.0773, + "step": 37870 + }, + { + "epoch": 2.4782466470395814, + "grad_norm": 1.0667495727539062, + "learning_rate": 9.312614829283799e-05, + "loss": 0.0941, + "step": 37880 + }, + { + "epoch": 2.478900883218842, + "grad_norm": 0.7811123728752136, + "learning_rate": 9.312149931308345e-05, + "loss": 0.0791, + "step": 37890 + }, + { + "epoch": 2.4795551193981025, + "grad_norm": 0.8247928619384766, + "learning_rate": 9.31168488778652e-05, + "loss": 0.0873, + "step": 37900 + }, + { + "epoch": 2.4802093555773634, + "grad_norm": 0.8802565932273865, + "learning_rate": 9.311219698734024e-05, + "loss": 0.0797, + "step": 37910 + }, + { + "epoch": 2.480863591756624, + "grad_norm": 0.9362874031066895, + "learning_rate": 9.310754364166554e-05, + "loss": 0.0814, + "step": 37920 + }, + { + "epoch": 2.481517827935885, + "grad_norm": 0.9797009825706482, + "learning_rate": 9.310288884099822e-05, + "loss": 0.0892, + "step": 37930 + }, + { + "epoch": 2.4821720641151455, + "grad_norm": 0.7946538925170898, + "learning_rate": 9.309823258549535e-05, + "loss": 0.0809, + "step": 37940 + }, + { + "epoch": 2.4828263002944064, + "grad_norm": 0.8862201571464539, + "learning_rate": 9.30935748753141e-05, + "loss": 0.084, + "step": 37950 + }, + { + "epoch": 2.483480536473667, + "grad_norm": 0.8820653557777405, + "learning_rate": 9.308891571061167e-05, + "loss": 0.0921, + "step": 37960 + }, + { + "epoch": 2.4841347726529275, + "grad_norm": 0.8164055347442627, + "learning_rate": 9.308425509154533e-05, + "loss": 0.0819, + "step": 37970 + }, + { + "epoch": 2.4847890088321885, + "grad_norm": 0.8694265484809875, + "learning_rate": 9.307959301827241e-05, + "loss": 0.0782, + "step": 37980 + }, + { + "epoch": 2.485443245011449, + "grad_norm": 1.1511107683181763, + "learning_rate": 9.307492949095021e-05, + "loss": 0.0913, + "step": 37990 + }, + { + "epoch": 2.48609748119071, + "grad_norm": 0.8150752186775208, + "learning_rate": 9.307026450973619e-05, + "loss": 0.0899, + "step": 38000 + }, + { + "epoch": 2.4867517173699705, + "grad_norm": 0.9316548109054565, + "learning_rate": 9.306559807478779e-05, + "loss": 0.0798, + "step": 38010 + }, + { + "epoch": 2.4874059535492314, + "grad_norm": 0.9588937759399414, + "learning_rate": 9.306093018626252e-05, + "loss": 0.082, + "step": 38020 + }, + { + "epoch": 2.488060189728492, + "grad_norm": 1.0078554153442383, + "learning_rate": 9.30562608443179e-05, + "loss": 0.078, + "step": 38030 + }, + { + "epoch": 2.4887144259077525, + "grad_norm": 1.0711711645126343, + "learning_rate": 9.30515900491116e-05, + "loss": 0.0909, + "step": 38040 + }, + { + "epoch": 2.4893686620870135, + "grad_norm": 0.867173433303833, + "learning_rate": 9.30469178008012e-05, + "loss": 0.0822, + "step": 38050 + }, + { + "epoch": 2.490022898266274, + "grad_norm": 0.7762174606323242, + "learning_rate": 9.304224409954442e-05, + "loss": 0.0857, + "step": 38060 + }, + { + "epoch": 2.490677134445535, + "grad_norm": 1.0067529678344727, + "learning_rate": 9.303756894549903e-05, + "loss": 0.0851, + "step": 38070 + }, + { + "epoch": 2.4913313706247955, + "grad_norm": 0.9543056488037109, + "learning_rate": 9.303289233882281e-05, + "loss": 0.0754, + "step": 38080 + }, + { + "epoch": 2.4919856068040565, + "grad_norm": 1.024979591369629, + "learning_rate": 9.302821427967363e-05, + "loss": 0.0859, + "step": 38090 + }, + { + "epoch": 2.492639842983317, + "grad_norm": 0.7905625700950623, + "learning_rate": 9.302353476820936e-05, + "loss": 0.0789, + "step": 38100 + }, + { + "epoch": 2.4932940791625775, + "grad_norm": 0.8276375532150269, + "learning_rate": 9.301885380458797e-05, + "loss": 0.0807, + "step": 38110 + }, + { + "epoch": 2.4939483153418385, + "grad_norm": 0.7747846245765686, + "learning_rate": 9.301417138896743e-05, + "loss": 0.0798, + "step": 38120 + }, + { + "epoch": 2.494602551521099, + "grad_norm": 0.9668558835983276, + "learning_rate": 9.30094875215058e-05, + "loss": 0.0816, + "step": 38130 + }, + { + "epoch": 2.49525678770036, + "grad_norm": 1.070252537727356, + "learning_rate": 9.300480220236119e-05, + "loss": 0.0888, + "step": 38140 + }, + { + "epoch": 2.4959110238796205, + "grad_norm": 0.9471240043640137, + "learning_rate": 9.30001154316917e-05, + "loss": 0.0879, + "step": 38150 + }, + { + "epoch": 2.4965652600588815, + "grad_norm": 0.7727608680725098, + "learning_rate": 9.299542720965554e-05, + "loss": 0.0879, + "step": 38160 + }, + { + "epoch": 2.497219496238142, + "grad_norm": 1.2870181798934937, + "learning_rate": 9.299073753641096e-05, + "loss": 0.0907, + "step": 38170 + }, + { + "epoch": 2.4978737324174025, + "grad_norm": 1.0817999839782715, + "learning_rate": 9.298604641211624e-05, + "loss": 0.1045, + "step": 38180 + }, + { + "epoch": 2.4985279685966635, + "grad_norm": 0.7632635831832886, + "learning_rate": 9.298135383692972e-05, + "loss": 0.0793, + "step": 38190 + }, + { + "epoch": 2.499182204775924, + "grad_norm": 0.9093852639198303, + "learning_rate": 9.297665981100978e-05, + "loss": 0.0774, + "step": 38200 + }, + { + "epoch": 2.499836440955185, + "grad_norm": 1.0639636516571045, + "learning_rate": 9.297196433451487e-05, + "loss": 0.0728, + "step": 38210 + }, + { + "epoch": 2.5004906771344455, + "grad_norm": 0.9197503924369812, + "learning_rate": 9.296726740760346e-05, + "loss": 0.0849, + "step": 38220 + }, + { + "epoch": 2.5011449133137065, + "grad_norm": 0.8370291590690613, + "learning_rate": 9.296256903043408e-05, + "loss": 0.0764, + "step": 38230 + }, + { + "epoch": 2.501799149492967, + "grad_norm": 0.7800946831703186, + "learning_rate": 9.295786920316533e-05, + "loss": 0.0802, + "step": 38240 + }, + { + "epoch": 2.5024533856722275, + "grad_norm": 1.0258721113204956, + "learning_rate": 9.295316792595586e-05, + "loss": 0.0874, + "step": 38250 + }, + { + "epoch": 2.5031076218514885, + "grad_norm": 0.7372403144836426, + "learning_rate": 9.294846519896429e-05, + "loss": 0.0774, + "step": 38260 + }, + { + "epoch": 2.503761858030749, + "grad_norm": 0.8242309093475342, + "learning_rate": 9.294376102234938e-05, + "loss": 0.0814, + "step": 38270 + }, + { + "epoch": 2.5044160942100095, + "grad_norm": 0.8311209678649902, + "learning_rate": 9.293905539626993e-05, + "loss": 0.079, + "step": 38280 + }, + { + "epoch": 2.5050703303892705, + "grad_norm": 0.7952736616134644, + "learning_rate": 9.293434832088475e-05, + "loss": 0.0779, + "step": 38290 + }, + { + "epoch": 2.5057245665685315, + "grad_norm": 0.9289050102233887, + "learning_rate": 9.29296397963527e-05, + "loss": 0.0824, + "step": 38300 + }, + { + "epoch": 2.506378802747792, + "grad_norm": 0.9673059582710266, + "learning_rate": 9.292492982283272e-05, + "loss": 0.0874, + "step": 38310 + }, + { + "epoch": 2.5070330389270525, + "grad_norm": 0.8958445191383362, + "learning_rate": 9.29202184004838e-05, + "loss": 0.0877, + "step": 38320 + }, + { + "epoch": 2.5076872751063135, + "grad_norm": 0.8611701726913452, + "learning_rate": 9.291550552946493e-05, + "loss": 0.0834, + "step": 38330 + }, + { + "epoch": 2.508341511285574, + "grad_norm": 0.7747957706451416, + "learning_rate": 9.29107912099352e-05, + "loss": 0.077, + "step": 38340 + }, + { + "epoch": 2.5089957474648346, + "grad_norm": 1.0127016305923462, + "learning_rate": 9.290607544205374e-05, + "loss": 0.0961, + "step": 38350 + }, + { + "epoch": 2.5096499836440955, + "grad_norm": 0.829646110534668, + "learning_rate": 9.290135822597969e-05, + "loss": 0.0863, + "step": 38360 + }, + { + "epoch": 2.5103042198233565, + "grad_norm": 0.9763094782829285, + "learning_rate": 9.28966395618723e-05, + "loss": 0.08, + "step": 38370 + }, + { + "epoch": 2.510958456002617, + "grad_norm": 0.8551107048988342, + "learning_rate": 9.289191944989083e-05, + "loss": 0.079, + "step": 38380 + }, + { + "epoch": 2.5116126921818775, + "grad_norm": 0.9324933886528015, + "learning_rate": 9.288719789019458e-05, + "loss": 0.0947, + "step": 38390 + }, + { + "epoch": 2.5122669283611385, + "grad_norm": 0.8589342832565308, + "learning_rate": 9.288247488294293e-05, + "loss": 0.0805, + "step": 38400 + }, + { + "epoch": 2.512921164540399, + "grad_norm": 0.7855199575424194, + "learning_rate": 9.28777504282953e-05, + "loss": 0.0841, + "step": 38410 + }, + { + "epoch": 2.5135754007196596, + "grad_norm": 0.9341410994529724, + "learning_rate": 9.287302452641112e-05, + "loss": 0.0938, + "step": 38420 + }, + { + "epoch": 2.5142296368989205, + "grad_norm": 1.012837290763855, + "learning_rate": 9.286829717744993e-05, + "loss": 0.076, + "step": 38430 + }, + { + "epoch": 2.514883873078181, + "grad_norm": 0.8277202844619751, + "learning_rate": 9.286356838157128e-05, + "loss": 0.0847, + "step": 38440 + }, + { + "epoch": 2.515538109257442, + "grad_norm": 0.7765419483184814, + "learning_rate": 9.28588381389348e-05, + "loss": 0.0896, + "step": 38450 + }, + { + "epoch": 2.5161923454367026, + "grad_norm": 1.0105395317077637, + "learning_rate": 9.285410644970013e-05, + "loss": 0.0918, + "step": 38460 + }, + { + "epoch": 2.5168465816159635, + "grad_norm": 0.7408658266067505, + "learning_rate": 9.284937331402697e-05, + "loss": 0.0808, + "step": 38470 + }, + { + "epoch": 2.517500817795224, + "grad_norm": 0.782345712184906, + "learning_rate": 9.284463873207508e-05, + "loss": 0.0748, + "step": 38480 + }, + { + "epoch": 2.5181550539744846, + "grad_norm": 0.7079125046730042, + "learning_rate": 9.283990270400428e-05, + "loss": 0.0831, + "step": 38490 + }, + { + "epoch": 2.5188092901537456, + "grad_norm": 0.920768678188324, + "learning_rate": 9.28351652299744e-05, + "loss": 0.0907, + "step": 38500 + }, + { + "epoch": 2.519463526333006, + "grad_norm": 0.9361595511436462, + "learning_rate": 9.283042631014535e-05, + "loss": 0.0848, + "step": 38510 + }, + { + "epoch": 2.520117762512267, + "grad_norm": 0.9975919723510742, + "learning_rate": 9.282568594467711e-05, + "loss": 0.0841, + "step": 38520 + }, + { + "epoch": 2.5207719986915276, + "grad_norm": 0.7760083675384521, + "learning_rate": 9.282094413372963e-05, + "loss": 0.0785, + "step": 38530 + }, + { + "epoch": 2.5214262348707885, + "grad_norm": 0.8321374654769897, + "learning_rate": 9.2816200877463e-05, + "loss": 0.0842, + "step": 38540 + }, + { + "epoch": 2.522080471050049, + "grad_norm": 1.0121804475784302, + "learning_rate": 9.28114561760373e-05, + "loss": 0.08, + "step": 38550 + }, + { + "epoch": 2.5227347072293096, + "grad_norm": 0.8037693500518799, + "learning_rate": 9.280671002961267e-05, + "loss": 0.0767, + "step": 38560 + }, + { + "epoch": 2.5233889434085706, + "grad_norm": 0.9284687042236328, + "learning_rate": 9.280196243834931e-05, + "loss": 0.0862, + "step": 38570 + }, + { + "epoch": 2.524043179587831, + "grad_norm": 0.7956443428993225, + "learning_rate": 9.279721340240745e-05, + "loss": 0.0783, + "step": 38580 + }, + { + "epoch": 2.524697415767092, + "grad_norm": 0.7840268015861511, + "learning_rate": 9.279246292194743e-05, + "loss": 0.0842, + "step": 38590 + }, + { + "epoch": 2.5253516519463526, + "grad_norm": 0.9286373257637024, + "learning_rate": 9.278771099712956e-05, + "loss": 0.0921, + "step": 38600 + }, + { + "epoch": 2.5260058881256136, + "grad_norm": 0.8080535531044006, + "learning_rate": 9.27829576281142e-05, + "loss": 0.0879, + "step": 38610 + }, + { + "epoch": 2.526660124304874, + "grad_norm": 0.9005104899406433, + "learning_rate": 9.277820281506184e-05, + "loss": 0.0819, + "step": 38620 + }, + { + "epoch": 2.5273143604841346, + "grad_norm": 0.7505617141723633, + "learning_rate": 9.277344655813292e-05, + "loss": 0.0816, + "step": 38630 + }, + { + "epoch": 2.5279685966633956, + "grad_norm": 0.6954260468482971, + "learning_rate": 9.276868885748802e-05, + "loss": 0.0822, + "step": 38640 + }, + { + "epoch": 2.528622832842656, + "grad_norm": 0.7404685616493225, + "learning_rate": 9.276392971328771e-05, + "loss": 0.071, + "step": 38650 + }, + { + "epoch": 2.529277069021917, + "grad_norm": 0.9218646287918091, + "learning_rate": 9.27591691256926e-05, + "loss": 0.0841, + "step": 38660 + }, + { + "epoch": 2.5299313052011776, + "grad_norm": 0.894137978553772, + "learning_rate": 9.275440709486342e-05, + "loss": 0.0914, + "step": 38670 + }, + { + "epoch": 2.5305855413804386, + "grad_norm": 0.7454158067703247, + "learning_rate": 9.274964362096085e-05, + "loss": 0.0858, + "step": 38680 + }, + { + "epoch": 2.531239777559699, + "grad_norm": 0.8861522674560547, + "learning_rate": 9.274487870414569e-05, + "loss": 0.0809, + "step": 38690 + }, + { + "epoch": 2.5318940137389596, + "grad_norm": 0.653781533241272, + "learning_rate": 9.27401123445788e-05, + "loss": 0.0796, + "step": 38700 + }, + { + "epoch": 2.5325482499182206, + "grad_norm": 0.8634945154190063, + "learning_rate": 9.273534454242101e-05, + "loss": 0.0744, + "step": 38710 + }, + { + "epoch": 2.533202486097481, + "grad_norm": 0.7516806125640869, + "learning_rate": 9.273057529783327e-05, + "loss": 0.0827, + "step": 38720 + }, + { + "epoch": 2.5338567222767416, + "grad_norm": 0.7837287783622742, + "learning_rate": 9.272580461097654e-05, + "loss": 0.0785, + "step": 38730 + }, + { + "epoch": 2.5345109584560026, + "grad_norm": 0.8319099545478821, + "learning_rate": 9.272103248201185e-05, + "loss": 0.0801, + "step": 38740 + }, + { + "epoch": 2.5351651946352636, + "grad_norm": 0.9561896324157715, + "learning_rate": 9.271625891110028e-05, + "loss": 0.0834, + "step": 38750 + }, + { + "epoch": 2.535819430814524, + "grad_norm": 0.8369835615158081, + "learning_rate": 9.271148389840294e-05, + "loss": 0.0829, + "step": 38760 + }, + { + "epoch": 2.5364736669937846, + "grad_norm": 0.8404773473739624, + "learning_rate": 9.270670744408101e-05, + "loss": 0.0795, + "step": 38770 + }, + { + "epoch": 2.5371279031730456, + "grad_norm": 0.7207024693489075, + "learning_rate": 9.270192954829571e-05, + "loss": 0.0775, + "step": 38780 + }, + { + "epoch": 2.537782139352306, + "grad_norm": 0.7661088109016418, + "learning_rate": 9.269715021120827e-05, + "loss": 0.0861, + "step": 38790 + }, + { + "epoch": 2.5384363755315666, + "grad_norm": 1.0687098503112793, + "learning_rate": 9.269236943298006e-05, + "loss": 0.0865, + "step": 38800 + }, + { + "epoch": 2.5390906117108276, + "grad_norm": 0.8831005096435547, + "learning_rate": 9.26875872137724e-05, + "loss": 0.0743, + "step": 38810 + }, + { + "epoch": 2.5397448478900886, + "grad_norm": 0.9108031988143921, + "learning_rate": 9.268280355374673e-05, + "loss": 0.0824, + "step": 38820 + }, + { + "epoch": 2.540399084069349, + "grad_norm": 0.8143110871315002, + "learning_rate": 9.26780184530645e-05, + "loss": 0.0764, + "step": 38830 + }, + { + "epoch": 2.5410533202486096, + "grad_norm": 0.7931360006332397, + "learning_rate": 9.267323191188721e-05, + "loss": 0.0811, + "step": 38840 + }, + { + "epoch": 2.5417075564278706, + "grad_norm": 0.9138084053993225, + "learning_rate": 9.266844393037644e-05, + "loss": 0.0804, + "step": 38850 + }, + { + "epoch": 2.542361792607131, + "grad_norm": 0.8782002329826355, + "learning_rate": 9.266365450869376e-05, + "loss": 0.081, + "step": 38860 + }, + { + "epoch": 2.5430160287863917, + "grad_norm": 0.8105145692825317, + "learning_rate": 9.265886364700089e-05, + "loss": 0.0772, + "step": 38870 + }, + { + "epoch": 2.5436702649656526, + "grad_norm": 1.0995359420776367, + "learning_rate": 9.265407134545947e-05, + "loss": 0.0805, + "step": 38880 + }, + { + "epoch": 2.544324501144913, + "grad_norm": 0.8194625377655029, + "learning_rate": 9.264927760423128e-05, + "loss": 0.0827, + "step": 38890 + }, + { + "epoch": 2.544978737324174, + "grad_norm": 0.9476547837257385, + "learning_rate": 9.264448242347812e-05, + "loss": 0.0885, + "step": 38900 + }, + { + "epoch": 2.5456329735034346, + "grad_norm": 0.9267840385437012, + "learning_rate": 9.263968580336185e-05, + "loss": 0.0826, + "step": 38910 + }, + { + "epoch": 2.5462872096826956, + "grad_norm": 0.8388037085533142, + "learning_rate": 9.263488774404434e-05, + "loss": 0.0834, + "step": 38920 + }, + { + "epoch": 2.546941445861956, + "grad_norm": 1.0958263874053955, + "learning_rate": 9.263008824568756e-05, + "loss": 0.0903, + "step": 38930 + }, + { + "epoch": 2.5475956820412167, + "grad_norm": 0.7269095778465271, + "learning_rate": 9.26252873084535e-05, + "loss": 0.0842, + "step": 38940 + }, + { + "epoch": 2.5482499182204776, + "grad_norm": 0.782738447189331, + "learning_rate": 9.262048493250422e-05, + "loss": 0.0848, + "step": 38950 + }, + { + "epoch": 2.548904154399738, + "grad_norm": 0.8937769532203674, + "learning_rate": 9.261568111800177e-05, + "loss": 0.0772, + "step": 38960 + }, + { + "epoch": 2.549558390578999, + "grad_norm": 0.7501261234283447, + "learning_rate": 9.261087586510834e-05, + "loss": 0.0836, + "step": 38970 + }, + { + "epoch": 2.5502126267582597, + "grad_norm": 0.8389232754707336, + "learning_rate": 9.260606917398609e-05, + "loss": 0.0939, + "step": 38980 + }, + { + "epoch": 2.5508668629375206, + "grad_norm": 0.7912153005599976, + "learning_rate": 9.260126104479727e-05, + "loss": 0.093, + "step": 38990 + }, + { + "epoch": 2.551521099116781, + "grad_norm": 0.8847309947013855, + "learning_rate": 9.259645147770415e-05, + "loss": 0.0745, + "step": 39000 + }, + { + "epoch": 2.5521753352960417, + "grad_norm": 0.7888216376304626, + "learning_rate": 9.25916404728691e-05, + "loss": 0.0829, + "step": 39010 + }, + { + "epoch": 2.5528295714753027, + "grad_norm": 1.0451523065567017, + "learning_rate": 9.258682803045448e-05, + "loss": 0.0973, + "step": 39020 + }, + { + "epoch": 2.553483807654563, + "grad_norm": 0.8907498717308044, + "learning_rate": 9.25820141506227e-05, + "loss": 0.0958, + "step": 39030 + }, + { + "epoch": 2.554138043833824, + "grad_norm": 0.9830831289291382, + "learning_rate": 9.257719883353631e-05, + "loss": 0.0944, + "step": 39040 + }, + { + "epoch": 2.5547922800130847, + "grad_norm": 0.8762632608413696, + "learning_rate": 9.257238207935777e-05, + "loss": 0.0822, + "step": 39050 + }, + { + "epoch": 2.5554465161923456, + "grad_norm": 0.8764397501945496, + "learning_rate": 9.256756388824968e-05, + "loss": 0.0811, + "step": 39060 + }, + { + "epoch": 2.556100752371606, + "grad_norm": 0.8016869425773621, + "learning_rate": 9.256274426037468e-05, + "loss": 0.0797, + "step": 39070 + }, + { + "epoch": 2.5567549885508667, + "grad_norm": 0.9790674448013306, + "learning_rate": 9.255792319589544e-05, + "loss": 0.0853, + "step": 39080 + }, + { + "epoch": 2.5574092247301277, + "grad_norm": 0.8641859889030457, + "learning_rate": 9.255310069497468e-05, + "loss": 0.0833, + "step": 39090 + }, + { + "epoch": 2.558063460909388, + "grad_norm": 0.8329319357872009, + "learning_rate": 9.254827675777517e-05, + "loss": 0.0812, + "step": 39100 + }, + { + "epoch": 2.558717697088649, + "grad_norm": 0.9464491009712219, + "learning_rate": 9.254345138445973e-05, + "loss": 0.0826, + "step": 39110 + }, + { + "epoch": 2.5593719332679097, + "grad_norm": 0.8477292656898499, + "learning_rate": 9.253862457519122e-05, + "loss": 0.0755, + "step": 39120 + }, + { + "epoch": 2.5600261694471707, + "grad_norm": 1.1739881038665771, + "learning_rate": 9.253379633013259e-05, + "loss": 0.0791, + "step": 39130 + }, + { + "epoch": 2.560680405626431, + "grad_norm": 0.8935783505439758, + "learning_rate": 9.252896664944677e-05, + "loss": 0.0834, + "step": 39140 + }, + { + "epoch": 2.5613346418056917, + "grad_norm": 0.8971335291862488, + "learning_rate": 9.25241355332968e-05, + "loss": 0.0781, + "step": 39150 + }, + { + "epoch": 2.5619888779849527, + "grad_norm": 0.8568238019943237, + "learning_rate": 9.251930298184572e-05, + "loss": 0.0795, + "step": 39160 + }, + { + "epoch": 2.562643114164213, + "grad_norm": 0.8689826726913452, + "learning_rate": 9.251446899525667e-05, + "loss": 0.0755, + "step": 39170 + }, + { + "epoch": 2.5632973503434737, + "grad_norm": 0.8828352093696594, + "learning_rate": 9.250963357369278e-05, + "loss": 0.0883, + "step": 39180 + }, + { + "epoch": 2.5639515865227347, + "grad_norm": 0.7668461799621582, + "learning_rate": 9.250479671731726e-05, + "loss": 0.0745, + "step": 39190 + }, + { + "epoch": 2.5646058227019957, + "grad_norm": 0.7046298980712891, + "learning_rate": 9.24999584262934e-05, + "loss": 0.0733, + "step": 39200 + }, + { + "epoch": 2.565260058881256, + "grad_norm": 0.77901691198349, + "learning_rate": 9.249511870078449e-05, + "loss": 0.0894, + "step": 39210 + }, + { + "epoch": 2.5659142950605167, + "grad_norm": 0.7388069033622742, + "learning_rate": 9.249027754095385e-05, + "loss": 0.0842, + "step": 39220 + }, + { + "epoch": 2.5665685312397777, + "grad_norm": 0.841890275478363, + "learning_rate": 9.248543494696494e-05, + "loss": 0.0827, + "step": 39230 + }, + { + "epoch": 2.567222767419038, + "grad_norm": 0.9349285364151001, + "learning_rate": 9.248059091898114e-05, + "loss": 0.0818, + "step": 39240 + }, + { + "epoch": 2.5678770035982987, + "grad_norm": 0.9265575408935547, + "learning_rate": 9.2475745457166e-05, + "loss": 0.0782, + "step": 39250 + }, + { + "epoch": 2.5685312397775597, + "grad_norm": 0.9001551270484924, + "learning_rate": 9.247089856168307e-05, + "loss": 0.0785, + "step": 39260 + }, + { + "epoch": 2.5691854759568207, + "grad_norm": 0.8538965582847595, + "learning_rate": 9.246605023269592e-05, + "loss": 0.0813, + "step": 39270 + }, + { + "epoch": 2.569839712136081, + "grad_norm": 0.9209931492805481, + "learning_rate": 9.24612004703682e-05, + "loss": 0.0824, + "step": 39280 + }, + { + "epoch": 2.5704939483153417, + "grad_norm": 0.6929624676704407, + "learning_rate": 9.245634927486361e-05, + "loss": 0.0781, + "step": 39290 + }, + { + "epoch": 2.5711481844946027, + "grad_norm": 0.7202306985855103, + "learning_rate": 9.245149664634589e-05, + "loss": 0.0899, + "step": 39300 + }, + { + "epoch": 2.571802420673863, + "grad_norm": 1.034266710281372, + "learning_rate": 9.244664258497881e-05, + "loss": 0.0793, + "step": 39310 + }, + { + "epoch": 2.5724566568531237, + "grad_norm": 0.7991123795509338, + "learning_rate": 9.244178709092624e-05, + "loss": 0.0764, + "step": 39320 + }, + { + "epoch": 2.5731108930323847, + "grad_norm": 0.7285871505737305, + "learning_rate": 9.243693016435204e-05, + "loss": 0.0873, + "step": 39330 + }, + { + "epoch": 2.5737651292116452, + "grad_norm": 0.7833579182624817, + "learning_rate": 9.243207180542016e-05, + "loss": 0.0819, + "step": 39340 + }, + { + "epoch": 2.574419365390906, + "grad_norm": 1.0560925006866455, + "learning_rate": 9.242721201429456e-05, + "loss": 0.0778, + "step": 39350 + }, + { + "epoch": 2.5750736015701667, + "grad_norm": 0.9079683423042297, + "learning_rate": 9.242235079113928e-05, + "loss": 0.0715, + "step": 39360 + }, + { + "epoch": 2.5757278377494277, + "grad_norm": 1.1182979345321655, + "learning_rate": 9.241748813611842e-05, + "loss": 0.0802, + "step": 39370 + }, + { + "epoch": 2.5763820739286882, + "grad_norm": 0.8554732799530029, + "learning_rate": 9.241262404939608e-05, + "loss": 0.0885, + "step": 39380 + }, + { + "epoch": 2.5770363101079488, + "grad_norm": 0.9766889214515686, + "learning_rate": 9.240775853113646e-05, + "loss": 0.0796, + "step": 39390 + }, + { + "epoch": 2.5776905462872097, + "grad_norm": 0.7499867081642151, + "learning_rate": 9.240289158150378e-05, + "loss": 0.0782, + "step": 39400 + }, + { + "epoch": 2.5783447824664703, + "grad_norm": 0.8955393433570862, + "learning_rate": 9.239802320066228e-05, + "loss": 0.0791, + "step": 39410 + }, + { + "epoch": 2.5789990186457312, + "grad_norm": 0.8181598782539368, + "learning_rate": 9.239315338877631e-05, + "loss": 0.0776, + "step": 39420 + }, + { + "epoch": 2.5796532548249917, + "grad_norm": 1.1022766828536987, + "learning_rate": 9.238828214601023e-05, + "loss": 0.0855, + "step": 39430 + }, + { + "epoch": 2.5803074910042527, + "grad_norm": 0.7724325656890869, + "learning_rate": 9.238340947252847e-05, + "loss": 0.0685, + "step": 39440 + }, + { + "epoch": 2.5809617271835132, + "grad_norm": 0.8263006806373596, + "learning_rate": 9.237853536849548e-05, + "loss": 0.0796, + "step": 39450 + }, + { + "epoch": 2.5816159633627738, + "grad_norm": 0.7736819386482239, + "learning_rate": 9.237365983407578e-05, + "loss": 0.0894, + "step": 39460 + }, + { + "epoch": 2.5822701995420347, + "grad_norm": 0.9027025699615479, + "learning_rate": 9.236878286943393e-05, + "loss": 0.0799, + "step": 39470 + }, + { + "epoch": 2.5829244357212953, + "grad_norm": 0.7817458510398865, + "learning_rate": 9.236390447473455e-05, + "loss": 0.073, + "step": 39480 + }, + { + "epoch": 2.5835786719005562, + "grad_norm": 1.0722339153289795, + "learning_rate": 9.23590246501423e-05, + "loss": 0.0872, + "step": 39490 + }, + { + "epoch": 2.5842329080798168, + "grad_norm": 1.1376055479049683, + "learning_rate": 9.235414339582185e-05, + "loss": 0.0818, + "step": 39500 + }, + { + "epoch": 2.5848871442590777, + "grad_norm": 0.907859206199646, + "learning_rate": 9.234926071193799e-05, + "loss": 0.0813, + "step": 39510 + }, + { + "epoch": 2.5855413804383383, + "grad_norm": 0.9271683096885681, + "learning_rate": 9.234437659865554e-05, + "loss": 0.0776, + "step": 39520 + }, + { + "epoch": 2.586195616617599, + "grad_norm": 1.029358148574829, + "learning_rate": 9.23394910561393e-05, + "loss": 0.0809, + "step": 39530 + }, + { + "epoch": 2.5868498527968598, + "grad_norm": 0.8963791131973267, + "learning_rate": 9.233460408455422e-05, + "loss": 0.0724, + "step": 39540 + }, + { + "epoch": 2.5875040889761203, + "grad_norm": 0.7677753567695618, + "learning_rate": 9.232971568406518e-05, + "loss": 0.0768, + "step": 39550 + }, + { + "epoch": 2.5881583251553812, + "grad_norm": 0.8181271553039551, + "learning_rate": 9.232482585483725e-05, + "loss": 0.0746, + "step": 39560 + }, + { + "epoch": 2.5888125613346418, + "grad_norm": 0.853274405002594, + "learning_rate": 9.231993459703547e-05, + "loss": 0.0705, + "step": 39570 + }, + { + "epoch": 2.5894667975139027, + "grad_norm": 0.9550169110298157, + "learning_rate": 9.23150419108249e-05, + "loss": 0.0884, + "step": 39580 + }, + { + "epoch": 2.5901210336931633, + "grad_norm": 0.9100688099861145, + "learning_rate": 9.231014779637067e-05, + "loss": 0.0702, + "step": 39590 + }, + { + "epoch": 2.590775269872424, + "grad_norm": 1.1120498180389404, + "learning_rate": 9.2305252253838e-05, + "loss": 0.0797, + "step": 39600 + }, + { + "epoch": 2.5914295060516848, + "grad_norm": 0.8531731963157654, + "learning_rate": 9.230035528339211e-05, + "loss": 0.0931, + "step": 39610 + }, + { + "epoch": 2.5920837422309453, + "grad_norm": 0.7823407649993896, + "learning_rate": 9.229545688519829e-05, + "loss": 0.0795, + "step": 39620 + }, + { + "epoch": 2.592737978410206, + "grad_norm": 0.9250206351280212, + "learning_rate": 9.229055705942189e-05, + "loss": 0.0858, + "step": 39630 + }, + { + "epoch": 2.593392214589467, + "grad_norm": 0.7410492897033691, + "learning_rate": 9.228565580622828e-05, + "loss": 0.0803, + "step": 39640 + }, + { + "epoch": 2.5940464507687278, + "grad_norm": 0.6830748915672302, + "learning_rate": 9.228075312578288e-05, + "loss": 0.0849, + "step": 39650 + }, + { + "epoch": 2.5947006869479883, + "grad_norm": 1.0575082302093506, + "learning_rate": 9.227584901825116e-05, + "loss": 0.0926, + "step": 39660 + }, + { + "epoch": 2.595354923127249, + "grad_norm": 0.950671911239624, + "learning_rate": 9.22709434837987e-05, + "loss": 0.079, + "step": 39670 + }, + { + "epoch": 2.5960091593065098, + "grad_norm": 0.7701423168182373, + "learning_rate": 9.226603652259102e-05, + "loss": 0.0928, + "step": 39680 + }, + { + "epoch": 2.5966633954857703, + "grad_norm": 0.9989359378814697, + "learning_rate": 9.226112813479377e-05, + "loss": 0.0818, + "step": 39690 + }, + { + "epoch": 2.597317631665031, + "grad_norm": 0.8732580542564392, + "learning_rate": 9.225621832057259e-05, + "loss": 0.0892, + "step": 39700 + }, + { + "epoch": 2.597971867844292, + "grad_norm": 0.894294261932373, + "learning_rate": 9.225130708009323e-05, + "loss": 0.0831, + "step": 39710 + }, + { + "epoch": 2.5986261040235528, + "grad_norm": 0.771438479423523, + "learning_rate": 9.224639441352145e-05, + "loss": 0.0812, + "step": 39720 + }, + { + "epoch": 2.5992803402028133, + "grad_norm": 1.0812944173812866, + "learning_rate": 9.224148032102307e-05, + "loss": 0.0876, + "step": 39730 + }, + { + "epoch": 2.599934576382074, + "grad_norm": 0.8493326902389526, + "learning_rate": 9.223656480276394e-05, + "loss": 0.0823, + "step": 39740 + }, + { + "epoch": 2.600588812561335, + "grad_norm": 0.8835618495941162, + "learning_rate": 9.223164785890997e-05, + "loss": 0.0766, + "step": 39750 + }, + { + "epoch": 2.6012430487405953, + "grad_norm": 0.8064636588096619, + "learning_rate": 9.222672948962713e-05, + "loss": 0.0913, + "step": 39760 + }, + { + "epoch": 2.601897284919856, + "grad_norm": 1.0927833318710327, + "learning_rate": 9.222180969508145e-05, + "loss": 0.0847, + "step": 39770 + }, + { + "epoch": 2.602551521099117, + "grad_norm": 0.9413180351257324, + "learning_rate": 9.221688847543894e-05, + "loss": 0.0852, + "step": 39780 + }, + { + "epoch": 2.6032057572783773, + "grad_norm": 0.8473165035247803, + "learning_rate": 9.221196583086573e-05, + "loss": 0.0854, + "step": 39790 + }, + { + "epoch": 2.6038599934576383, + "grad_norm": 0.8541494011878967, + "learning_rate": 9.220704176152797e-05, + "loss": 0.0803, + "step": 39800 + }, + { + "epoch": 2.604514229636899, + "grad_norm": 0.7438517808914185, + "learning_rate": 9.220211626759185e-05, + "loss": 0.0776, + "step": 39810 + }, + { + "epoch": 2.60516846581616, + "grad_norm": 0.6814586520195007, + "learning_rate": 9.219718934922364e-05, + "loss": 0.0854, + "step": 39820 + }, + { + "epoch": 2.6058227019954203, + "grad_norm": 0.7921211123466492, + "learning_rate": 9.219226100658962e-05, + "loss": 0.0748, + "step": 39830 + }, + { + "epoch": 2.606476938174681, + "grad_norm": 0.9955710768699646, + "learning_rate": 9.218733123985613e-05, + "loss": 0.0926, + "step": 39840 + }, + { + "epoch": 2.607131174353942, + "grad_norm": 0.7756005525588989, + "learning_rate": 9.218240004918958e-05, + "loss": 0.0792, + "step": 39850 + }, + { + "epoch": 2.6077854105332023, + "grad_norm": 0.7657015323638916, + "learning_rate": 9.21774674347564e-05, + "loss": 0.077, + "step": 39860 + }, + { + "epoch": 2.6084396467124633, + "grad_norm": 0.8866888880729675, + "learning_rate": 9.217253339672307e-05, + "loss": 0.0916, + "step": 39870 + }, + { + "epoch": 2.609093882891724, + "grad_norm": 1.0825079679489136, + "learning_rate": 9.216759793525615e-05, + "loss": 0.0777, + "step": 39880 + }, + { + "epoch": 2.609748119070985, + "grad_norm": 0.8313194513320923, + "learning_rate": 9.21626610505222e-05, + "loss": 0.0753, + "step": 39890 + }, + { + "epoch": 2.6104023552502453, + "grad_norm": 0.8642783164978027, + "learning_rate": 9.215772274268787e-05, + "loss": 0.0903, + "step": 39900 + }, + { + "epoch": 2.611056591429506, + "grad_norm": 0.8965114951133728, + "learning_rate": 9.215278301191982e-05, + "loss": 0.078, + "step": 39910 + }, + { + "epoch": 2.611710827608767, + "grad_norm": 0.7599003911018372, + "learning_rate": 9.214784185838483e-05, + "loss": 0.0928, + "step": 39920 + }, + { + "epoch": 2.6123650637880274, + "grad_norm": 1.0140457153320312, + "learning_rate": 9.21428992822496e-05, + "loss": 0.0797, + "step": 39930 + }, + { + "epoch": 2.6130192999672883, + "grad_norm": 0.9935689568519592, + "learning_rate": 9.213795528368102e-05, + "loss": 0.0964, + "step": 39940 + }, + { + "epoch": 2.613673536146549, + "grad_norm": 0.7364537715911865, + "learning_rate": 9.213300986284593e-05, + "loss": 0.0837, + "step": 39950 + }, + { + "epoch": 2.61432777232581, + "grad_norm": 0.7998794317245483, + "learning_rate": 9.212806301991125e-05, + "loss": 0.0843, + "step": 39960 + }, + { + "epoch": 2.6149820085050703, + "grad_norm": 0.8715366125106812, + "learning_rate": 9.212311475504398e-05, + "loss": 0.0864, + "step": 39970 + }, + { + "epoch": 2.615636244684331, + "grad_norm": 0.9843313694000244, + "learning_rate": 9.21181650684111e-05, + "loss": 0.0816, + "step": 39980 + }, + { + "epoch": 2.616290480863592, + "grad_norm": 0.8258795738220215, + "learning_rate": 9.21132139601797e-05, + "loss": 0.0795, + "step": 39990 + }, + { + "epoch": 2.6169447170428524, + "grad_norm": 0.8979169130325317, + "learning_rate": 9.210826143051688e-05, + "loss": 0.0815, + "step": 40000 + }, + { + "epoch": 2.6175989532221133, + "grad_norm": 0.8749635815620422, + "learning_rate": 9.210330747958979e-05, + "loss": 0.0771, + "step": 40010 + }, + { + "epoch": 2.618253189401374, + "grad_norm": 0.8296936750411987, + "learning_rate": 9.209835210756565e-05, + "loss": 0.0793, + "step": 40020 + }, + { + "epoch": 2.618907425580635, + "grad_norm": 0.7305333614349365, + "learning_rate": 9.209339531461173e-05, + "loss": 0.0747, + "step": 40030 + }, + { + "epoch": 2.6195616617598954, + "grad_norm": 0.888248860836029, + "learning_rate": 9.208843710089534e-05, + "loss": 0.0855, + "step": 40040 + }, + { + "epoch": 2.620215897939156, + "grad_norm": 0.7976614236831665, + "learning_rate": 9.20834774665838e-05, + "loss": 0.0876, + "step": 40050 + }, + { + "epoch": 2.620870134118417, + "grad_norm": 1.2806037664413452, + "learning_rate": 9.207851641184453e-05, + "loss": 0.0805, + "step": 40060 + }, + { + "epoch": 2.6215243702976774, + "grad_norm": 0.7971034049987793, + "learning_rate": 9.207355393684499e-05, + "loss": 0.0706, + "step": 40070 + }, + { + "epoch": 2.622178606476938, + "grad_norm": 0.7905651330947876, + "learning_rate": 9.206859004175264e-05, + "loss": 0.0784, + "step": 40080 + }, + { + "epoch": 2.622832842656199, + "grad_norm": 0.7352070808410645, + "learning_rate": 9.206362472673505e-05, + "loss": 0.0723, + "step": 40090 + }, + { + "epoch": 2.62348707883546, + "grad_norm": 0.9722936153411865, + "learning_rate": 9.205865799195982e-05, + "loss": 0.081, + "step": 40100 + }, + { + "epoch": 2.6241413150147204, + "grad_norm": 0.849582850933075, + "learning_rate": 9.205368983759457e-05, + "loss": 0.0933, + "step": 40110 + }, + { + "epoch": 2.624795551193981, + "grad_norm": 0.7181478142738342, + "learning_rate": 9.2048720263807e-05, + "loss": 0.0824, + "step": 40120 + }, + { + "epoch": 2.625449787373242, + "grad_norm": 0.8331068158149719, + "learning_rate": 9.204374927076486e-05, + "loss": 0.0774, + "step": 40130 + }, + { + "epoch": 2.6261040235525024, + "grad_norm": 0.7740342617034912, + "learning_rate": 9.20387768586359e-05, + "loss": 0.0791, + "step": 40140 + }, + { + "epoch": 2.626758259731763, + "grad_norm": 0.7920116186141968, + "learning_rate": 9.203380302758797e-05, + "loss": 0.0765, + "step": 40150 + }, + { + "epoch": 2.627412495911024, + "grad_norm": 0.8771019577980042, + "learning_rate": 9.202882777778896e-05, + "loss": 0.08, + "step": 40160 + }, + { + "epoch": 2.628066732090285, + "grad_norm": 0.9749835729598999, + "learning_rate": 9.202385110940678e-05, + "loss": 0.0779, + "step": 40170 + }, + { + "epoch": 2.6287209682695454, + "grad_norm": 0.8514975309371948, + "learning_rate": 9.201887302260943e-05, + "loss": 0.0887, + "step": 40180 + }, + { + "epoch": 2.629375204448806, + "grad_norm": 0.820841372013092, + "learning_rate": 9.201389351756491e-05, + "loss": 0.0747, + "step": 40190 + }, + { + "epoch": 2.630029440628067, + "grad_norm": 0.9230198860168457, + "learning_rate": 9.200891259444129e-05, + "loss": 0.0867, + "step": 40200 + }, + { + "epoch": 2.6306836768073274, + "grad_norm": 0.7532345056533813, + "learning_rate": 9.20039302534067e-05, + "loss": 0.0861, + "step": 40210 + }, + { + "epoch": 2.631337912986588, + "grad_norm": 0.7638508677482605, + "learning_rate": 9.19989464946293e-05, + "loss": 0.087, + "step": 40220 + }, + { + "epoch": 2.631992149165849, + "grad_norm": 0.7697664499282837, + "learning_rate": 9.199396131827731e-05, + "loss": 0.0725, + "step": 40230 + }, + { + "epoch": 2.6326463853451094, + "grad_norm": 0.7672895193099976, + "learning_rate": 9.198897472451901e-05, + "loss": 0.086, + "step": 40240 + }, + { + "epoch": 2.6333006215243704, + "grad_norm": 0.830986499786377, + "learning_rate": 9.198398671352267e-05, + "loss": 0.0793, + "step": 40250 + }, + { + "epoch": 2.633954857703631, + "grad_norm": 0.8693823218345642, + "learning_rate": 9.19789972854567e-05, + "loss": 0.0868, + "step": 40260 + }, + { + "epoch": 2.634609093882892, + "grad_norm": 0.8019348978996277, + "learning_rate": 9.197400644048944e-05, + "loss": 0.0741, + "step": 40270 + }, + { + "epoch": 2.6352633300621524, + "grad_norm": 0.9302101731300354, + "learning_rate": 9.196901417878941e-05, + "loss": 0.0779, + "step": 40280 + }, + { + "epoch": 2.635917566241413, + "grad_norm": 0.9065577387809753, + "learning_rate": 9.196402050052507e-05, + "loss": 0.0985, + "step": 40290 + }, + { + "epoch": 2.636571802420674, + "grad_norm": 0.8283711671829224, + "learning_rate": 9.195902540586498e-05, + "loss": 0.0797, + "step": 40300 + }, + { + "epoch": 2.6372260385999344, + "grad_norm": 0.9816059470176697, + "learning_rate": 9.195402889497777e-05, + "loss": 0.0925, + "step": 40310 + }, + { + "epoch": 2.6378802747791954, + "grad_norm": 0.7747599482536316, + "learning_rate": 9.194903096803202e-05, + "loss": 0.0841, + "step": 40320 + }, + { + "epoch": 2.638534510958456, + "grad_norm": 0.9950234889984131, + "learning_rate": 9.194403162519648e-05, + "loss": 0.0904, + "step": 40330 + }, + { + "epoch": 2.639188747137717, + "grad_norm": 1.1040619611740112, + "learning_rate": 9.193903086663987e-05, + "loss": 0.0822, + "step": 40340 + }, + { + "epoch": 2.6398429833169774, + "grad_norm": 1.090773582458496, + "learning_rate": 9.193402869253097e-05, + "loss": 0.0944, + "step": 40350 + }, + { + "epoch": 2.640497219496238, + "grad_norm": 1.1542222499847412, + "learning_rate": 9.192902510303862e-05, + "loss": 0.0864, + "step": 40360 + }, + { + "epoch": 2.641151455675499, + "grad_norm": 0.9892890453338623, + "learning_rate": 9.192402009833173e-05, + "loss": 0.0861, + "step": 40370 + }, + { + "epoch": 2.6418056918547594, + "grad_norm": 1.2127673625946045, + "learning_rate": 9.19190136785792e-05, + "loss": 0.0837, + "step": 40380 + }, + { + "epoch": 2.6424599280340204, + "grad_norm": 0.8235146999359131, + "learning_rate": 9.191400584395003e-05, + "loss": 0.0825, + "step": 40390 + }, + { + "epoch": 2.643114164213281, + "grad_norm": 0.7639740109443665, + "learning_rate": 9.190899659461323e-05, + "loss": 0.0767, + "step": 40400 + }, + { + "epoch": 2.643768400392542, + "grad_norm": 0.8715619444847107, + "learning_rate": 9.19039859307379e-05, + "loss": 0.0795, + "step": 40410 + }, + { + "epoch": 2.6444226365718024, + "grad_norm": 0.8103946447372437, + "learning_rate": 9.189897385249313e-05, + "loss": 0.0898, + "step": 40420 + }, + { + "epoch": 2.645076872751063, + "grad_norm": 0.9815798997879028, + "learning_rate": 9.189396036004811e-05, + "loss": 0.0782, + "step": 40430 + }, + { + "epoch": 2.645731108930324, + "grad_norm": 0.884543776512146, + "learning_rate": 9.188894545357207e-05, + "loss": 0.0775, + "step": 40440 + }, + { + "epoch": 2.6463853451095845, + "grad_norm": 0.8589003682136536, + "learning_rate": 9.188392913323423e-05, + "loss": 0.0867, + "step": 40450 + }, + { + "epoch": 2.6470395812888454, + "grad_norm": 0.8187326192855835, + "learning_rate": 9.187891139920397e-05, + "loss": 0.081, + "step": 40460 + }, + { + "epoch": 2.647693817468106, + "grad_norm": 0.8445239663124084, + "learning_rate": 9.187389225165062e-05, + "loss": 0.0812, + "step": 40470 + }, + { + "epoch": 2.648348053647367, + "grad_norm": 0.7465826869010925, + "learning_rate": 9.186887169074356e-05, + "loss": 0.08, + "step": 40480 + }, + { + "epoch": 2.6490022898266274, + "grad_norm": 0.8137046098709106, + "learning_rate": 9.186384971665229e-05, + "loss": 0.0792, + "step": 40490 + }, + { + "epoch": 2.649656526005888, + "grad_norm": 0.912079393863678, + "learning_rate": 9.185882632954632e-05, + "loss": 0.0809, + "step": 40500 + }, + { + "epoch": 2.650310762185149, + "grad_norm": 0.9054073691368103, + "learning_rate": 9.185380152959515e-05, + "loss": 0.0777, + "step": 40510 + }, + { + "epoch": 2.6509649983644095, + "grad_norm": 0.9537298083305359, + "learning_rate": 9.184877531696844e-05, + "loss": 0.0822, + "step": 40520 + }, + { + "epoch": 2.65161923454367, + "grad_norm": 0.8448399901390076, + "learning_rate": 9.18437476918358e-05, + "loss": 0.0788, + "step": 40530 + }, + { + "epoch": 2.652273470722931, + "grad_norm": 0.8022094368934631, + "learning_rate": 9.183871865436693e-05, + "loss": 0.0927, + "step": 40540 + }, + { + "epoch": 2.652927706902192, + "grad_norm": 0.7613011598587036, + "learning_rate": 9.183368820473159e-05, + "loss": 0.0743, + "step": 40550 + }, + { + "epoch": 2.6535819430814525, + "grad_norm": 0.8020244836807251, + "learning_rate": 9.182865634309956e-05, + "loss": 0.0729, + "step": 40560 + }, + { + "epoch": 2.654236179260713, + "grad_norm": 0.7869201898574829, + "learning_rate": 9.182362306964067e-05, + "loss": 0.0818, + "step": 40570 + }, + { + "epoch": 2.654890415439974, + "grad_norm": 0.859542965888977, + "learning_rate": 9.181858838452481e-05, + "loss": 0.0909, + "step": 40580 + }, + { + "epoch": 2.6555446516192345, + "grad_norm": 0.8157853484153748, + "learning_rate": 9.181355228792194e-05, + "loss": 0.0727, + "step": 40590 + }, + { + "epoch": 2.656198887798495, + "grad_norm": 0.8959032297134399, + "learning_rate": 9.180851478000199e-05, + "loss": 0.0762, + "step": 40600 + }, + { + "epoch": 2.656853123977756, + "grad_norm": 0.759432315826416, + "learning_rate": 9.180347586093505e-05, + "loss": 0.0805, + "step": 40610 + }, + { + "epoch": 2.657507360157017, + "grad_norm": 0.842648983001709, + "learning_rate": 9.179843553089114e-05, + "loss": 0.0766, + "step": 40620 + }, + { + "epoch": 2.6581615963362775, + "grad_norm": 0.8938133120536804, + "learning_rate": 9.179339379004043e-05, + "loss": 0.0839, + "step": 40630 + }, + { + "epoch": 2.658815832515538, + "grad_norm": 1.022493600845337, + "learning_rate": 9.178835063855306e-05, + "loss": 0.0812, + "step": 40640 + }, + { + "epoch": 2.659470068694799, + "grad_norm": 0.8210968375205994, + "learning_rate": 9.178330607659927e-05, + "loss": 0.0804, + "step": 40650 + }, + { + "epoch": 2.6601243048740595, + "grad_norm": 0.7599284648895264, + "learning_rate": 9.177826010434931e-05, + "loss": 0.0731, + "step": 40660 + }, + { + "epoch": 2.66077854105332, + "grad_norm": 0.8126060962677002, + "learning_rate": 9.177321272197352e-05, + "loss": 0.079, + "step": 40670 + }, + { + "epoch": 2.661432777232581, + "grad_norm": 0.9725340604782104, + "learning_rate": 9.176816392964223e-05, + "loss": 0.0808, + "step": 40680 + }, + { + "epoch": 2.662087013411842, + "grad_norm": 0.7665135264396667, + "learning_rate": 9.176311372752589e-05, + "loss": 0.0781, + "step": 40690 + }, + { + "epoch": 2.6627412495911025, + "grad_norm": 0.9257283806800842, + "learning_rate": 9.175806211579491e-05, + "loss": 0.088, + "step": 40700 + }, + { + "epoch": 2.663395485770363, + "grad_norm": 0.9089275598526001, + "learning_rate": 9.175300909461982e-05, + "loss": 0.0823, + "step": 40710 + }, + { + "epoch": 2.664049721949624, + "grad_norm": 0.8126506209373474, + "learning_rate": 9.174795466417119e-05, + "loss": 0.0721, + "step": 40720 + }, + { + "epoch": 2.6647039581288845, + "grad_norm": 1.0364658832550049, + "learning_rate": 9.17428988246196e-05, + "loss": 0.0827, + "step": 40730 + }, + { + "epoch": 2.665358194308145, + "grad_norm": 0.8109714984893799, + "learning_rate": 9.173784157613568e-05, + "loss": 0.0815, + "step": 40740 + }, + { + "epoch": 2.666012430487406, + "grad_norm": 0.8420194387435913, + "learning_rate": 9.173278291889015e-05, + "loss": 0.0824, + "step": 40750 + }, + { + "epoch": 2.6666666666666665, + "grad_norm": 0.7918026447296143, + "learning_rate": 9.172772285305375e-05, + "loss": 0.0894, + "step": 40760 + }, + { + "epoch": 2.6673209028459275, + "grad_norm": 0.9323230385780334, + "learning_rate": 9.172266137879728e-05, + "loss": 0.0864, + "step": 40770 + }, + { + "epoch": 2.667975139025188, + "grad_norm": 0.8225651383399963, + "learning_rate": 9.171759849629155e-05, + "loss": 0.0806, + "step": 40780 + }, + { + "epoch": 2.668629375204449, + "grad_norm": 0.9506831765174866, + "learning_rate": 9.171253420570748e-05, + "loss": 0.0861, + "step": 40790 + }, + { + "epoch": 2.6692836113837095, + "grad_norm": 0.8589584827423096, + "learning_rate": 9.170746850721598e-05, + "loss": 0.0791, + "step": 40800 + }, + { + "epoch": 2.66993784756297, + "grad_norm": 0.9063448309898376, + "learning_rate": 9.170240140098802e-05, + "loss": 0.0776, + "step": 40810 + }, + { + "epoch": 2.670592083742231, + "grad_norm": 0.7764676809310913, + "learning_rate": 9.169733288719467e-05, + "loss": 0.0784, + "step": 40820 + }, + { + "epoch": 2.6712463199214915, + "grad_norm": 0.8123670816421509, + "learning_rate": 9.169226296600696e-05, + "loss": 0.0658, + "step": 40830 + }, + { + "epoch": 2.6719005561007525, + "grad_norm": 0.8324966430664062, + "learning_rate": 9.168719163759604e-05, + "loss": 0.0741, + "step": 40840 + }, + { + "epoch": 2.672554792280013, + "grad_norm": 0.925599217414856, + "learning_rate": 9.168211890213307e-05, + "loss": 0.0765, + "step": 40850 + }, + { + "epoch": 2.673209028459274, + "grad_norm": 1.0423486232757568, + "learning_rate": 9.167704475978928e-05, + "loss": 0.0788, + "step": 40860 + }, + { + "epoch": 2.6738632646385345, + "grad_norm": 0.8119267821311951, + "learning_rate": 9.167196921073593e-05, + "loss": 0.0855, + "step": 40870 + }, + { + "epoch": 2.674517500817795, + "grad_norm": 0.9077923893928528, + "learning_rate": 9.166689225514431e-05, + "loss": 0.0715, + "step": 40880 + }, + { + "epoch": 2.675171736997056, + "grad_norm": 0.9296697378158569, + "learning_rate": 9.166181389318583e-05, + "loss": 0.0768, + "step": 40890 + }, + { + "epoch": 2.6758259731763165, + "grad_norm": 0.7002086043357849, + "learning_rate": 9.165673412503183e-05, + "loss": 0.0896, + "step": 40900 + }, + { + "epoch": 2.6764802093555775, + "grad_norm": 0.7667686939239502, + "learning_rate": 9.165165295085385e-05, + "loss": 0.0858, + "step": 40910 + }, + { + "epoch": 2.677134445534838, + "grad_norm": 1.128864049911499, + "learning_rate": 9.164657037082331e-05, + "loss": 0.0809, + "step": 40920 + }, + { + "epoch": 2.677788681714099, + "grad_norm": 0.8613152503967285, + "learning_rate": 9.164148638511182e-05, + "loss": 0.0882, + "step": 40930 + }, + { + "epoch": 2.6784429178933595, + "grad_norm": 0.7017923593521118, + "learning_rate": 9.163640099389095e-05, + "loss": 0.0807, + "step": 40940 + }, + { + "epoch": 2.67909715407262, + "grad_norm": 0.8696222305297852, + "learning_rate": 9.163131419733235e-05, + "loss": 0.0817, + "step": 40950 + }, + { + "epoch": 2.679751390251881, + "grad_norm": 0.950098991394043, + "learning_rate": 9.16262259956077e-05, + "loss": 0.0775, + "step": 40960 + }, + { + "epoch": 2.6804056264311416, + "grad_norm": 1.0551055669784546, + "learning_rate": 9.162113638888879e-05, + "loss": 0.0838, + "step": 40970 + }, + { + "epoch": 2.681059862610402, + "grad_norm": 0.8011052012443542, + "learning_rate": 9.161604537734733e-05, + "loss": 0.0891, + "step": 40980 + }, + { + "epoch": 2.681714098789663, + "grad_norm": 0.9275655746459961, + "learning_rate": 9.161095296115523e-05, + "loss": 0.0828, + "step": 40990 + }, + { + "epoch": 2.682368334968924, + "grad_norm": 1.1314018964767456, + "learning_rate": 9.160585914048432e-05, + "loss": 0.0815, + "step": 41000 + }, + { + "epoch": 2.6830225711481845, + "grad_norm": 0.9840704202651978, + "learning_rate": 9.160076391550654e-05, + "loss": 0.0804, + "step": 41010 + }, + { + "epoch": 2.683676807327445, + "grad_norm": 0.8175294399261475, + "learning_rate": 9.15956672863939e-05, + "loss": 0.0729, + "step": 41020 + }, + { + "epoch": 2.684331043506706, + "grad_norm": 0.7896432280540466, + "learning_rate": 9.159056925331837e-05, + "loss": 0.079, + "step": 41030 + }, + { + "epoch": 2.6849852796859666, + "grad_norm": 0.7831984758377075, + "learning_rate": 9.158546981645208e-05, + "loss": 0.0845, + "step": 41040 + }, + { + "epoch": 2.685639515865227, + "grad_norm": 0.8683128952980042, + "learning_rate": 9.158036897596712e-05, + "loss": 0.0869, + "step": 41050 + }, + { + "epoch": 2.686293752044488, + "grad_norm": 0.8791904449462891, + "learning_rate": 9.157526673203565e-05, + "loss": 0.0835, + "step": 41060 + }, + { + "epoch": 2.686947988223749, + "grad_norm": 0.7771376371383667, + "learning_rate": 9.15701630848299e-05, + "loss": 0.0825, + "step": 41070 + }, + { + "epoch": 2.6876022244030096, + "grad_norm": 0.6314225792884827, + "learning_rate": 9.156505803452212e-05, + "loss": 0.0699, + "step": 41080 + }, + { + "epoch": 2.68825646058227, + "grad_norm": 1.073372483253479, + "learning_rate": 9.15599515812846e-05, + "loss": 0.0892, + "step": 41090 + }, + { + "epoch": 2.688910696761531, + "grad_norm": 0.9210318922996521, + "learning_rate": 9.155484372528975e-05, + "loss": 0.0779, + "step": 41100 + }, + { + "epoch": 2.6895649329407916, + "grad_norm": 0.8791244626045227, + "learning_rate": 9.154973446670993e-05, + "loss": 0.0981, + "step": 41110 + }, + { + "epoch": 2.690219169120052, + "grad_norm": 0.8100093007087708, + "learning_rate": 9.154462380571761e-05, + "loss": 0.0867, + "step": 41120 + }, + { + "epoch": 2.690873405299313, + "grad_norm": 1.020846962928772, + "learning_rate": 9.153951174248528e-05, + "loss": 0.0945, + "step": 41130 + }, + { + "epoch": 2.691527641478574, + "grad_norm": 0.929047167301178, + "learning_rate": 9.153439827718549e-05, + "loss": 0.0766, + "step": 41140 + }, + { + "epoch": 2.6921818776578346, + "grad_norm": 0.8004492521286011, + "learning_rate": 9.152928340999083e-05, + "loss": 0.0754, + "step": 41150 + }, + { + "epoch": 2.692836113837095, + "grad_norm": 0.7606692910194397, + "learning_rate": 9.152416714107393e-05, + "loss": 0.0761, + "step": 41160 + }, + { + "epoch": 2.693490350016356, + "grad_norm": 0.9273959994316101, + "learning_rate": 9.15190494706075e-05, + "loss": 0.0838, + "step": 41170 + }, + { + "epoch": 2.6941445861956166, + "grad_norm": 0.8332855701446533, + "learning_rate": 9.151393039876425e-05, + "loss": 0.0781, + "step": 41180 + }, + { + "epoch": 2.694798822374877, + "grad_norm": 0.6218408942222595, + "learning_rate": 9.150880992571698e-05, + "loss": 0.0824, + "step": 41190 + }, + { + "epoch": 2.695453058554138, + "grad_norm": 0.8714901804924011, + "learning_rate": 9.150368805163851e-05, + "loss": 0.0875, + "step": 41200 + }, + { + "epoch": 2.6961072947333986, + "grad_norm": 0.7702926397323608, + "learning_rate": 9.149856477670173e-05, + "loss": 0.0842, + "step": 41210 + }, + { + "epoch": 2.6967615309126596, + "grad_norm": 0.8732930421829224, + "learning_rate": 9.149344010107955e-05, + "loss": 0.086, + "step": 41220 + }, + { + "epoch": 2.69741576709192, + "grad_norm": 0.8141332268714905, + "learning_rate": 9.148831402494495e-05, + "loss": 0.0917, + "step": 41230 + }, + { + "epoch": 2.698070003271181, + "grad_norm": 0.9626291990280151, + "learning_rate": 9.148318654847094e-05, + "loss": 0.0798, + "step": 41240 + }, + { + "epoch": 2.6987242394504416, + "grad_norm": 0.8662136793136597, + "learning_rate": 9.14780576718306e-05, + "loss": 0.0837, + "step": 41250 + }, + { + "epoch": 2.699378475629702, + "grad_norm": 0.842068076133728, + "learning_rate": 9.147292739519702e-05, + "loss": 0.0848, + "step": 41260 + }, + { + "epoch": 2.700032711808963, + "grad_norm": 0.8036767244338989, + "learning_rate": 9.146779571874337e-05, + "loss": 0.071, + "step": 41270 + }, + { + "epoch": 2.7006869479882236, + "grad_norm": 0.8152961134910583, + "learning_rate": 9.146266264264288e-05, + "loss": 0.0795, + "step": 41280 + }, + { + "epoch": 2.7013411841674846, + "grad_norm": 0.9094104766845703, + "learning_rate": 9.145752816706878e-05, + "loss": 0.0715, + "step": 41290 + }, + { + "epoch": 2.701995420346745, + "grad_norm": 0.8534713983535767, + "learning_rate": 9.145239229219438e-05, + "loss": 0.0788, + "step": 41300 + }, + { + "epoch": 2.702649656526006, + "grad_norm": 0.947786271572113, + "learning_rate": 9.144725501819303e-05, + "loss": 0.0888, + "step": 41310 + }, + { + "epoch": 2.7033038927052666, + "grad_norm": 0.9250187277793884, + "learning_rate": 9.14421163452381e-05, + "loss": 0.0849, + "step": 41320 + }, + { + "epoch": 2.703958128884527, + "grad_norm": 0.9689621925354004, + "learning_rate": 9.143697627350308e-05, + "loss": 0.0792, + "step": 41330 + }, + { + "epoch": 2.704612365063788, + "grad_norm": 0.9268077611923218, + "learning_rate": 9.143183480316143e-05, + "loss": 0.079, + "step": 41340 + }, + { + "epoch": 2.7052666012430486, + "grad_norm": 0.7102203369140625, + "learning_rate": 9.142669193438669e-05, + "loss": 0.0812, + "step": 41350 + }, + { + "epoch": 2.7059208374223096, + "grad_norm": 0.8151041269302368, + "learning_rate": 9.142154766735247e-05, + "loss": 0.0776, + "step": 41360 + }, + { + "epoch": 2.70657507360157, + "grad_norm": 1.0687074661254883, + "learning_rate": 9.141640200223236e-05, + "loss": 0.0804, + "step": 41370 + }, + { + "epoch": 2.707229309780831, + "grad_norm": 0.9339028596878052, + "learning_rate": 9.141125493920009e-05, + "loss": 0.0873, + "step": 41380 + }, + { + "epoch": 2.7078835459600916, + "grad_norm": 0.8078384399414062, + "learning_rate": 9.140610647842934e-05, + "loss": 0.0742, + "step": 41390 + }, + { + "epoch": 2.708537782139352, + "grad_norm": 1.0456902980804443, + "learning_rate": 9.140095662009392e-05, + "loss": 0.0764, + "step": 41400 + }, + { + "epoch": 2.709192018318613, + "grad_norm": 0.9626041650772095, + "learning_rate": 9.139580536436763e-05, + "loss": 0.075, + "step": 41410 + }, + { + "epoch": 2.7098462544978736, + "grad_norm": 1.0236674547195435, + "learning_rate": 9.139065271142434e-05, + "loss": 0.0861, + "step": 41420 + }, + { + "epoch": 2.7105004906771346, + "grad_norm": 0.8825036287307739, + "learning_rate": 9.138549866143797e-05, + "loss": 0.0693, + "step": 41430 + }, + { + "epoch": 2.711154726856395, + "grad_norm": 1.041279911994934, + "learning_rate": 9.138034321458248e-05, + "loss": 0.0874, + "step": 41440 + }, + { + "epoch": 2.711808963035656, + "grad_norm": 0.7983617782592773, + "learning_rate": 9.13751863710319e-05, + "loss": 0.082, + "step": 41450 + }, + { + "epoch": 2.7124631992149166, + "grad_norm": 0.7135721445083618, + "learning_rate": 9.137002813096026e-05, + "loss": 0.0747, + "step": 41460 + }, + { + "epoch": 2.713117435394177, + "grad_norm": 0.9826575517654419, + "learning_rate": 9.136486849454167e-05, + "loss": 0.0825, + "step": 41470 + }, + { + "epoch": 2.713771671573438, + "grad_norm": 0.9839840531349182, + "learning_rate": 9.135970746195029e-05, + "loss": 0.0923, + "step": 41480 + }, + { + "epoch": 2.7144259077526987, + "grad_norm": 0.9663382172584534, + "learning_rate": 9.135454503336032e-05, + "loss": 0.0676, + "step": 41490 + }, + { + "epoch": 2.715080143931959, + "grad_norm": 0.8061355352401733, + "learning_rate": 9.134938120894598e-05, + "loss": 0.0809, + "step": 41500 + }, + { + "epoch": 2.71573438011122, + "grad_norm": 0.8093533515930176, + "learning_rate": 9.134421598888159e-05, + "loss": 0.0772, + "step": 41510 + }, + { + "epoch": 2.716388616290481, + "grad_norm": 0.7529310584068298, + "learning_rate": 9.133904937334148e-05, + "loss": 0.0766, + "step": 41520 + }, + { + "epoch": 2.7170428524697416, + "grad_norm": 0.7155537605285645, + "learning_rate": 9.133388136250005e-05, + "loss": 0.0707, + "step": 41530 + }, + { + "epoch": 2.717697088649002, + "grad_norm": 0.8347744345664978, + "learning_rate": 9.13287119565317e-05, + "loss": 0.0737, + "step": 41540 + }, + { + "epoch": 2.718351324828263, + "grad_norm": 0.7836735844612122, + "learning_rate": 9.132354115561094e-05, + "loss": 0.0874, + "step": 41550 + }, + { + "epoch": 2.7190055610075237, + "grad_norm": 0.8764668107032776, + "learning_rate": 9.13183689599123e-05, + "loss": 0.0857, + "step": 41560 + }, + { + "epoch": 2.719659797186784, + "grad_norm": 0.7778729200363159, + "learning_rate": 9.131319536961035e-05, + "loss": 0.0865, + "step": 41570 + }, + { + "epoch": 2.720314033366045, + "grad_norm": 0.8253190517425537, + "learning_rate": 9.13080203848797e-05, + "loss": 0.0805, + "step": 41580 + }, + { + "epoch": 2.720968269545306, + "grad_norm": 0.7873772382736206, + "learning_rate": 9.130284400589503e-05, + "loss": 0.0815, + "step": 41590 + }, + { + "epoch": 2.7216225057245667, + "grad_norm": 0.7975731492042542, + "learning_rate": 9.129766623283105e-05, + "loss": 0.0859, + "step": 41600 + }, + { + "epoch": 2.722276741903827, + "grad_norm": 0.8026714324951172, + "learning_rate": 9.129248706586253e-05, + "loss": 0.0773, + "step": 41610 + }, + { + "epoch": 2.722930978083088, + "grad_norm": 0.8112697005271912, + "learning_rate": 9.128730650516429e-05, + "loss": 0.0743, + "step": 41620 + }, + { + "epoch": 2.7235852142623487, + "grad_norm": 0.8520511388778687, + "learning_rate": 9.128212455091115e-05, + "loss": 0.0847, + "step": 41630 + }, + { + "epoch": 2.724239450441609, + "grad_norm": 0.8587812185287476, + "learning_rate": 9.127694120327806e-05, + "loss": 0.0889, + "step": 41640 + }, + { + "epoch": 2.72489368662087, + "grad_norm": 1.0693318843841553, + "learning_rate": 9.127175646243994e-05, + "loss": 0.0834, + "step": 41650 + }, + { + "epoch": 2.7255479228001307, + "grad_norm": 0.8526062369346619, + "learning_rate": 9.126657032857181e-05, + "loss": 0.0772, + "step": 41660 + }, + { + "epoch": 2.7262021589793917, + "grad_norm": 1.0225043296813965, + "learning_rate": 9.126138280184871e-05, + "loss": 0.0982, + "step": 41670 + }, + { + "epoch": 2.726856395158652, + "grad_norm": 0.7854475378990173, + "learning_rate": 9.125619388244571e-05, + "loss": 0.0768, + "step": 41680 + }, + { + "epoch": 2.727510631337913, + "grad_norm": 0.929229736328125, + "learning_rate": 9.125100357053797e-05, + "loss": 0.0778, + "step": 41690 + }, + { + "epoch": 2.7281648675171737, + "grad_norm": 0.7836943864822388, + "learning_rate": 9.124581186630071e-05, + "loss": 0.0752, + "step": 41700 + }, + { + "epoch": 2.728819103696434, + "grad_norm": 0.9992619156837463, + "learning_rate": 9.124061876990908e-05, + "loss": 0.0795, + "step": 41710 + }, + { + "epoch": 2.729473339875695, + "grad_norm": 0.8057607412338257, + "learning_rate": 9.123542428153844e-05, + "loss": 0.072, + "step": 41720 + }, + { + "epoch": 2.7301275760549557, + "grad_norm": 0.8987215161323547, + "learning_rate": 9.123022840136407e-05, + "loss": 0.0805, + "step": 41730 + }, + { + "epoch": 2.7307818122342167, + "grad_norm": 0.8965569734573364, + "learning_rate": 9.122503112956138e-05, + "loss": 0.0797, + "step": 41740 + }, + { + "epoch": 2.731436048413477, + "grad_norm": 0.7639127969741821, + "learning_rate": 9.121983246630575e-05, + "loss": 0.0801, + "step": 41750 + }, + { + "epoch": 2.732090284592738, + "grad_norm": 0.83674156665802, + "learning_rate": 9.121463241177269e-05, + "loss": 0.0802, + "step": 41760 + }, + { + "epoch": 2.7327445207719987, + "grad_norm": 1.0172474384307861, + "learning_rate": 9.120943096613768e-05, + "loss": 0.0752, + "step": 41770 + }, + { + "epoch": 2.7333987569512592, + "grad_norm": 0.9789382815361023, + "learning_rate": 9.120422812957632e-05, + "loss": 0.0932, + "step": 41780 + }, + { + "epoch": 2.73405299313052, + "grad_norm": 0.934238612651825, + "learning_rate": 9.119902390226418e-05, + "loss": 0.0801, + "step": 41790 + }, + { + "epoch": 2.7347072293097807, + "grad_norm": 0.7868991494178772, + "learning_rate": 9.119381828437694e-05, + "loss": 0.0683, + "step": 41800 + }, + { + "epoch": 2.7353614654890417, + "grad_norm": 0.7209054827690125, + "learning_rate": 9.118861127609031e-05, + "loss": 0.0724, + "step": 41810 + }, + { + "epoch": 2.736015701668302, + "grad_norm": 0.8778037428855896, + "learning_rate": 9.118340287758001e-05, + "loss": 0.0798, + "step": 41820 + }, + { + "epoch": 2.736669937847563, + "grad_norm": 0.9034239053726196, + "learning_rate": 9.117819308902186e-05, + "loss": 0.0832, + "step": 41830 + }, + { + "epoch": 2.7373241740268237, + "grad_norm": 0.9231058359146118, + "learning_rate": 9.117298191059171e-05, + "loss": 0.0784, + "step": 41840 + }, + { + "epoch": 2.7379784102060842, + "grad_norm": 0.8640102744102478, + "learning_rate": 9.116776934246543e-05, + "loss": 0.0845, + "step": 41850 + }, + { + "epoch": 2.738632646385345, + "grad_norm": 0.9153121709823608, + "learning_rate": 9.116255538481896e-05, + "loss": 0.0832, + "step": 41860 + }, + { + "epoch": 2.7392868825646057, + "grad_norm": 0.7354351282119751, + "learning_rate": 9.115734003782832e-05, + "loss": 0.0759, + "step": 41870 + }, + { + "epoch": 2.7399411187438667, + "grad_norm": 0.9227734804153442, + "learning_rate": 9.115212330166949e-05, + "loss": 0.0839, + "step": 41880 + }, + { + "epoch": 2.7405953549231272, + "grad_norm": 0.8186841011047363, + "learning_rate": 9.114690517651859e-05, + "loss": 0.0802, + "step": 41890 + }, + { + "epoch": 2.741249591102388, + "grad_norm": 0.8185954093933105, + "learning_rate": 9.114168566255172e-05, + "loss": 0.0824, + "step": 41900 + }, + { + "epoch": 2.7419038272816487, + "grad_norm": 1.1469035148620605, + "learning_rate": 9.113646475994506e-05, + "loss": 0.0783, + "step": 41910 + }, + { + "epoch": 2.7425580634609092, + "grad_norm": 0.8847156763076782, + "learning_rate": 9.113124246887483e-05, + "loss": 0.0723, + "step": 41920 + }, + { + "epoch": 2.74321229964017, + "grad_norm": 0.870272696018219, + "learning_rate": 9.11260187895173e-05, + "loss": 0.0812, + "step": 41930 + }, + { + "epoch": 2.7438665358194307, + "grad_norm": 1.0734666585922241, + "learning_rate": 9.112079372204877e-05, + "loss": 0.0863, + "step": 41940 + }, + { + "epoch": 2.7445207719986913, + "grad_norm": 0.6786089539527893, + "learning_rate": 9.111556726664563e-05, + "loss": 0.0796, + "step": 41950 + }, + { + "epoch": 2.7451750081779522, + "grad_norm": 0.8662351369857788, + "learning_rate": 9.111033942348426e-05, + "loss": 0.0884, + "step": 41960 + }, + { + "epoch": 2.745829244357213, + "grad_norm": 0.7827929854393005, + "learning_rate": 9.110511019274111e-05, + "loss": 0.0764, + "step": 41970 + }, + { + "epoch": 2.7464834805364737, + "grad_norm": 1.1550228595733643, + "learning_rate": 9.109987957459267e-05, + "loss": 0.0845, + "step": 41980 + }, + { + "epoch": 2.7471377167157343, + "grad_norm": 0.8338910937309265, + "learning_rate": 9.109464756921553e-05, + "loss": 0.0824, + "step": 41990 + }, + { + "epoch": 2.7477919528949952, + "grad_norm": 0.9103622436523438, + "learning_rate": 9.108941417678626e-05, + "loss": 0.0896, + "step": 42000 + }, + { + "epoch": 2.7484461890742558, + "grad_norm": 0.8045547008514404, + "learning_rate": 9.108417939748149e-05, + "loss": 0.0805, + "step": 42010 + }, + { + "epoch": 2.7491004252535163, + "grad_norm": 0.8297812342643738, + "learning_rate": 9.107894323147792e-05, + "loss": 0.0819, + "step": 42020 + }, + { + "epoch": 2.7497546614327772, + "grad_norm": 0.8854258060455322, + "learning_rate": 9.107370567895229e-05, + "loss": 0.0837, + "step": 42030 + }, + { + "epoch": 2.750408897612038, + "grad_norm": 0.8766602277755737, + "learning_rate": 9.106846674008137e-05, + "loss": 0.0776, + "step": 42040 + }, + { + "epoch": 2.7510631337912987, + "grad_norm": 1.0775314569473267, + "learning_rate": 9.1063226415042e-05, + "loss": 0.0909, + "step": 42050 + }, + { + "epoch": 2.7517173699705593, + "grad_norm": 0.7865322828292847, + "learning_rate": 9.105798470401103e-05, + "loss": 0.0914, + "step": 42060 + }, + { + "epoch": 2.7523716061498202, + "grad_norm": 0.9470481872558594, + "learning_rate": 9.10527416071654e-05, + "loss": 0.0841, + "step": 42070 + }, + { + "epoch": 2.7530258423290808, + "grad_norm": 0.6476520299911499, + "learning_rate": 9.104749712468207e-05, + "loss": 0.0813, + "step": 42080 + }, + { + "epoch": 2.7536800785083413, + "grad_norm": 0.8619006276130676, + "learning_rate": 9.104225125673809e-05, + "loss": 0.08, + "step": 42090 + }, + { + "epoch": 2.7543343146876023, + "grad_norm": 0.7804111242294312, + "learning_rate": 9.103700400351047e-05, + "loss": 0.0889, + "step": 42100 + }, + { + "epoch": 2.754988550866863, + "grad_norm": 0.8227022290229797, + "learning_rate": 9.103175536517634e-05, + "loss": 0.08, + "step": 42110 + }, + { + "epoch": 2.7556427870461238, + "grad_norm": 0.8172056674957275, + "learning_rate": 9.102650534191287e-05, + "loss": 0.0863, + "step": 42120 + }, + { + "epoch": 2.7562970232253843, + "grad_norm": 0.8061167001724243, + "learning_rate": 9.102125393389725e-05, + "loss": 0.0707, + "step": 42130 + }, + { + "epoch": 2.7569512594046452, + "grad_norm": 0.9391945004463196, + "learning_rate": 9.10160011413067e-05, + "loss": 0.0777, + "step": 42140 + }, + { + "epoch": 2.7576054955839058, + "grad_norm": 1.0496068000793457, + "learning_rate": 9.101074696431858e-05, + "loss": 0.0854, + "step": 42150 + }, + { + "epoch": 2.7582597317631663, + "grad_norm": 0.8995298743247986, + "learning_rate": 9.100549140311017e-05, + "loss": 0.0802, + "step": 42160 + }, + { + "epoch": 2.7589139679424273, + "grad_norm": 0.8396767973899841, + "learning_rate": 9.100023445785889e-05, + "loss": 0.0736, + "step": 42170 + }, + { + "epoch": 2.759568204121688, + "grad_norm": 1.035605549812317, + "learning_rate": 9.099497612874217e-05, + "loss": 0.079, + "step": 42180 + }, + { + "epoch": 2.7602224403009488, + "grad_norm": 0.8803637027740479, + "learning_rate": 9.09897164159375e-05, + "loss": 0.0805, + "step": 42190 + }, + { + "epoch": 2.7608766764802093, + "grad_norm": 0.6873325109481812, + "learning_rate": 9.09844553196224e-05, + "loss": 0.0776, + "step": 42200 + }, + { + "epoch": 2.7615309126594703, + "grad_norm": 0.7511876225471497, + "learning_rate": 9.097919283997444e-05, + "loss": 0.0746, + "step": 42210 + }, + { + "epoch": 2.762185148838731, + "grad_norm": 0.7745641469955444, + "learning_rate": 9.097392897717126e-05, + "loss": 0.0829, + "step": 42220 + }, + { + "epoch": 2.7628393850179913, + "grad_norm": 0.8654570579528809, + "learning_rate": 9.09686637313905e-05, + "loss": 0.0783, + "step": 42230 + }, + { + "epoch": 2.7634936211972523, + "grad_norm": 0.8436213731765747, + "learning_rate": 9.09633971028099e-05, + "loss": 0.0826, + "step": 42240 + }, + { + "epoch": 2.764147857376513, + "grad_norm": 0.8295747637748718, + "learning_rate": 9.095812909160724e-05, + "loss": 0.0761, + "step": 42250 + }, + { + "epoch": 2.7648020935557738, + "grad_norm": 0.8413954377174377, + "learning_rate": 9.095285969796027e-05, + "loss": 0.0794, + "step": 42260 + }, + { + "epoch": 2.7654563297350343, + "grad_norm": 0.8630293607711792, + "learning_rate": 9.094758892204691e-05, + "loss": 0.0864, + "step": 42270 + }, + { + "epoch": 2.7661105659142953, + "grad_norm": 1.0896077156066895, + "learning_rate": 9.094231676404503e-05, + "loss": 0.0755, + "step": 42280 + }, + { + "epoch": 2.766764802093556, + "grad_norm": 0.9205356240272522, + "learning_rate": 9.093704322413259e-05, + "loss": 0.0888, + "step": 42290 + }, + { + "epoch": 2.7674190382728163, + "grad_norm": 0.8817057013511658, + "learning_rate": 9.093176830248756e-05, + "loss": 0.0755, + "step": 42300 + }, + { + "epoch": 2.7680732744520773, + "grad_norm": 0.9554494619369507, + "learning_rate": 9.092649199928802e-05, + "loss": 0.0742, + "step": 42310 + }, + { + "epoch": 2.768727510631338, + "grad_norm": 1.0500036478042603, + "learning_rate": 9.092121431471203e-05, + "loss": 0.0775, + "step": 42320 + }, + { + "epoch": 2.769381746810599, + "grad_norm": 1.2276242971420288, + "learning_rate": 9.091593524893776e-05, + "loss": 0.0775, + "step": 42330 + }, + { + "epoch": 2.7700359829898593, + "grad_norm": 0.8922539353370667, + "learning_rate": 9.091065480214333e-05, + "loss": 0.0841, + "step": 42340 + }, + { + "epoch": 2.7706902191691203, + "grad_norm": 0.9390623569488525, + "learning_rate": 9.090537297450703e-05, + "loss": 0.0816, + "step": 42350 + }, + { + "epoch": 2.771344455348381, + "grad_norm": 0.7786974310874939, + "learning_rate": 9.090008976620712e-05, + "loss": 0.0739, + "step": 42360 + }, + { + "epoch": 2.7719986915276413, + "grad_norm": 0.7234275341033936, + "learning_rate": 9.08948051774219e-05, + "loss": 0.077, + "step": 42370 + }, + { + "epoch": 2.7726529277069023, + "grad_norm": 0.8942136764526367, + "learning_rate": 9.088951920832978e-05, + "loss": 0.0838, + "step": 42380 + }, + { + "epoch": 2.773307163886163, + "grad_norm": 0.9873566031455994, + "learning_rate": 9.088423185910912e-05, + "loss": 0.0831, + "step": 42390 + }, + { + "epoch": 2.7739614000654234, + "grad_norm": 0.9224919080734253, + "learning_rate": 9.087894312993844e-05, + "loss": 0.0798, + "step": 42400 + }, + { + "epoch": 2.7746156362446843, + "grad_norm": 0.9617442488670349, + "learning_rate": 9.08736530209962e-05, + "loss": 0.082, + "step": 42410 + }, + { + "epoch": 2.7752698724239453, + "grad_norm": 0.9834263324737549, + "learning_rate": 9.086836153246099e-05, + "loss": 0.0846, + "step": 42420 + }, + { + "epoch": 2.775924108603206, + "grad_norm": 0.8556174635887146, + "learning_rate": 9.086306866451139e-05, + "loss": 0.0863, + "step": 42430 + }, + { + "epoch": 2.7765783447824663, + "grad_norm": 0.8653731346130371, + "learning_rate": 9.085777441732606e-05, + "loss": 0.0901, + "step": 42440 + }, + { + "epoch": 2.7772325809617273, + "grad_norm": 1.0214253664016724, + "learning_rate": 9.085247879108367e-05, + "loss": 0.0748, + "step": 42450 + }, + { + "epoch": 2.777886817140988, + "grad_norm": 0.7947637438774109, + "learning_rate": 9.084718178596301e-05, + "loss": 0.0846, + "step": 42460 + }, + { + "epoch": 2.7785410533202484, + "grad_norm": 0.9689314961433411, + "learning_rate": 9.084188340214281e-05, + "loss": 0.0747, + "step": 42470 + }, + { + "epoch": 2.7791952894995093, + "grad_norm": 0.9164670705795288, + "learning_rate": 9.083658363980196e-05, + "loss": 0.0836, + "step": 42480 + }, + { + "epoch": 2.7798495256787703, + "grad_norm": 1.0349498987197876, + "learning_rate": 9.08312824991193e-05, + "loss": 0.0858, + "step": 42490 + }, + { + "epoch": 2.780503761858031, + "grad_norm": 0.8005316257476807, + "learning_rate": 9.082597998027377e-05, + "loss": 0.0791, + "step": 42500 + }, + { + "epoch": 2.7811579980372914, + "grad_norm": 0.7661714553833008, + "learning_rate": 9.082067608344436e-05, + "loss": 0.0844, + "step": 42510 + }, + { + "epoch": 2.7818122342165523, + "grad_norm": 0.9406249523162842, + "learning_rate": 9.081537080881007e-05, + "loss": 0.0822, + "step": 42520 + }, + { + "epoch": 2.782466470395813, + "grad_norm": 0.8644420504570007, + "learning_rate": 9.081006415654995e-05, + "loss": 0.0803, + "step": 42530 + }, + { + "epoch": 2.7831207065750734, + "grad_norm": 0.9211861491203308, + "learning_rate": 9.080475612684316e-05, + "loss": 0.0808, + "step": 42540 + }, + { + "epoch": 2.7837749427543343, + "grad_norm": 1.1344019174575806, + "learning_rate": 9.079944671986883e-05, + "loss": 0.0834, + "step": 42550 + }, + { + "epoch": 2.784429178933595, + "grad_norm": 0.8112305998802185, + "learning_rate": 9.079413593580616e-05, + "loss": 0.0936, + "step": 42560 + }, + { + "epoch": 2.785083415112856, + "grad_norm": 0.973592221736908, + "learning_rate": 9.078882377483444e-05, + "loss": 0.0733, + "step": 42570 + }, + { + "epoch": 2.7857376512921164, + "grad_norm": 0.7716719508171082, + "learning_rate": 9.078351023713294e-05, + "loss": 0.0764, + "step": 42580 + }, + { + "epoch": 2.7863918874713773, + "grad_norm": 0.7506532073020935, + "learning_rate": 9.077819532288102e-05, + "loss": 0.0697, + "step": 42590 + }, + { + "epoch": 2.787046123650638, + "grad_norm": 0.7960049510002136, + "learning_rate": 9.077287903225804e-05, + "loss": 0.0835, + "step": 42600 + }, + { + "epoch": 2.7877003598298984, + "grad_norm": 0.873766303062439, + "learning_rate": 9.076756136544346e-05, + "loss": 0.076, + "step": 42610 + }, + { + "epoch": 2.7883545960091594, + "grad_norm": 0.680327296257019, + "learning_rate": 9.076224232261679e-05, + "loss": 0.0727, + "step": 42620 + }, + { + "epoch": 2.78900883218842, + "grad_norm": 0.9833268523216248, + "learning_rate": 9.075692190395752e-05, + "loss": 0.0864, + "step": 42630 + }, + { + "epoch": 2.789663068367681, + "grad_norm": 0.9118520617485046, + "learning_rate": 9.075160010964526e-05, + "loss": 0.079, + "step": 42640 + }, + { + "epoch": 2.7903173045469414, + "grad_norm": 0.990631639957428, + "learning_rate": 9.074627693985961e-05, + "loss": 0.0777, + "step": 42650 + }, + { + "epoch": 2.7909715407262023, + "grad_norm": 0.91771399974823, + "learning_rate": 9.074095239478026e-05, + "loss": 0.0775, + "step": 42660 + }, + { + "epoch": 2.791625776905463, + "grad_norm": 0.832398533821106, + "learning_rate": 9.07356264745869e-05, + "loss": 0.0769, + "step": 42670 + }, + { + "epoch": 2.7922800130847234, + "grad_norm": 0.7836474180221558, + "learning_rate": 9.073029917945934e-05, + "loss": 0.087, + "step": 42680 + }, + { + "epoch": 2.7929342492639844, + "grad_norm": 0.8258154988288879, + "learning_rate": 9.072497050957736e-05, + "loss": 0.0893, + "step": 42690 + }, + { + "epoch": 2.793588485443245, + "grad_norm": 0.8014414310455322, + "learning_rate": 9.071964046512081e-05, + "loss": 0.0776, + "step": 42700 + }, + { + "epoch": 2.794242721622506, + "grad_norm": 0.9521371126174927, + "learning_rate": 9.071430904626961e-05, + "loss": 0.0834, + "step": 42710 + }, + { + "epoch": 2.7948969578017664, + "grad_norm": 1.008519172668457, + "learning_rate": 9.07089762532037e-05, + "loss": 0.0725, + "step": 42720 + }, + { + "epoch": 2.7955511939810274, + "grad_norm": 1.2083194255828857, + "learning_rate": 9.070364208610307e-05, + "loss": 0.0781, + "step": 42730 + }, + { + "epoch": 2.796205430160288, + "grad_norm": 1.0434963703155518, + "learning_rate": 9.069830654514778e-05, + "loss": 0.0841, + "step": 42740 + }, + { + "epoch": 2.7968596663395484, + "grad_norm": 1.085412859916687, + "learning_rate": 9.06929696305179e-05, + "loss": 0.0919, + "step": 42750 + }, + { + "epoch": 2.7975139025188094, + "grad_norm": 0.9131748676300049, + "learning_rate": 9.06876313423936e-05, + "loss": 0.0799, + "step": 42760 + }, + { + "epoch": 2.79816813869807, + "grad_norm": 0.977798581123352, + "learning_rate": 9.068229168095501e-05, + "loss": 0.0895, + "step": 42770 + }, + { + "epoch": 2.798822374877331, + "grad_norm": 0.891613245010376, + "learning_rate": 9.067695064638237e-05, + "loss": 0.0834, + "step": 42780 + }, + { + "epoch": 2.7994766110565914, + "grad_norm": 0.8886749148368835, + "learning_rate": 9.0671608238856e-05, + "loss": 0.0821, + "step": 42790 + }, + { + "epoch": 2.8001308472358524, + "grad_norm": 0.9983289241790771, + "learning_rate": 9.066626445855617e-05, + "loss": 0.0828, + "step": 42800 + }, + { + "epoch": 2.800785083415113, + "grad_norm": 0.7890607118606567, + "learning_rate": 9.066091930566327e-05, + "loss": 0.0857, + "step": 42810 + }, + { + "epoch": 2.8014393195943734, + "grad_norm": 0.7882013320922852, + "learning_rate": 9.06555727803577e-05, + "loss": 0.0775, + "step": 42820 + }, + { + "epoch": 2.8020935557736344, + "grad_norm": 0.8657622337341309, + "learning_rate": 9.065022488281992e-05, + "loss": 0.0844, + "step": 42830 + }, + { + "epoch": 2.802747791952895, + "grad_norm": 1.038025975227356, + "learning_rate": 9.064487561323045e-05, + "loss": 0.0796, + "step": 42840 + }, + { + "epoch": 2.8034020281321554, + "grad_norm": 0.8643264174461365, + "learning_rate": 9.063952497176983e-05, + "loss": 0.0823, + "step": 42850 + }, + { + "epoch": 2.8040562643114164, + "grad_norm": 0.7769449949264526, + "learning_rate": 9.063417295861866e-05, + "loss": 0.07, + "step": 42860 + }, + { + "epoch": 2.8047105004906774, + "grad_norm": 0.988023579120636, + "learning_rate": 9.06288195739576e-05, + "loss": 0.0752, + "step": 42870 + }, + { + "epoch": 2.805364736669938, + "grad_norm": 0.7536209225654602, + "learning_rate": 9.06234648179673e-05, + "loss": 0.0876, + "step": 42880 + }, + { + "epoch": 2.8060189728491984, + "grad_norm": 0.9088309407234192, + "learning_rate": 9.061810869082855e-05, + "loss": 0.0749, + "step": 42890 + }, + { + "epoch": 2.8066732090284594, + "grad_norm": 0.937651515007019, + "learning_rate": 9.061275119272207e-05, + "loss": 0.0868, + "step": 42900 + }, + { + "epoch": 2.80732744520772, + "grad_norm": 0.8864895701408386, + "learning_rate": 9.060739232382876e-05, + "loss": 0.0844, + "step": 42910 + }, + { + "epoch": 2.8079816813869805, + "grad_norm": 0.8680556416511536, + "learning_rate": 9.060203208432945e-05, + "loss": 0.0741, + "step": 42920 + }, + { + "epoch": 2.8086359175662414, + "grad_norm": 0.7065665125846863, + "learning_rate": 9.059667047440508e-05, + "loss": 0.0766, + "step": 42930 + }, + { + "epoch": 2.8092901537455024, + "grad_norm": 1.2820072174072266, + "learning_rate": 9.059130749423662e-05, + "loss": 0.0857, + "step": 42940 + }, + { + "epoch": 2.809944389924763, + "grad_norm": 0.8697900772094727, + "learning_rate": 9.058594314400506e-05, + "loss": 0.0755, + "step": 42950 + }, + { + "epoch": 2.8105986261040234, + "grad_norm": 0.9628492593765259, + "learning_rate": 9.058057742389147e-05, + "loss": 0.0819, + "step": 42960 + }, + { + "epoch": 2.8112528622832844, + "grad_norm": 1.09038507938385, + "learning_rate": 9.057521033407698e-05, + "loss": 0.0815, + "step": 42970 + }, + { + "epoch": 2.811907098462545, + "grad_norm": 0.9920393824577332, + "learning_rate": 9.056984187474275e-05, + "loss": 0.081, + "step": 42980 + }, + { + "epoch": 2.8125613346418055, + "grad_norm": 0.7963695526123047, + "learning_rate": 9.056447204606993e-05, + "loss": 0.0774, + "step": 42990 + }, + { + "epoch": 2.8132155708210664, + "grad_norm": 1.1177492141723633, + "learning_rate": 9.055910084823979e-05, + "loss": 0.0786, + "step": 43000 + }, + { + "epoch": 2.813869807000327, + "grad_norm": 0.9136730432510376, + "learning_rate": 9.055372828143365e-05, + "loss": 0.0744, + "step": 43010 + }, + { + "epoch": 2.814524043179588, + "grad_norm": 0.7779806852340698, + "learning_rate": 9.054835434583281e-05, + "loss": 0.0788, + "step": 43020 + }, + { + "epoch": 2.8151782793588485, + "grad_norm": 1.0529919862747192, + "learning_rate": 9.054297904161868e-05, + "loss": 0.0788, + "step": 43030 + }, + { + "epoch": 2.8158325155381094, + "grad_norm": 0.7755911946296692, + "learning_rate": 9.053760236897266e-05, + "loss": 0.08, + "step": 43040 + }, + { + "epoch": 2.81648675171737, + "grad_norm": 0.9372928738594055, + "learning_rate": 9.053222432807626e-05, + "loss": 0.0827, + "step": 43050 + }, + { + "epoch": 2.8171409878966305, + "grad_norm": 0.91545170545578, + "learning_rate": 9.052684491911099e-05, + "loss": 0.0858, + "step": 43060 + }, + { + "epoch": 2.8177952240758914, + "grad_norm": 0.7959693670272827, + "learning_rate": 9.052146414225841e-05, + "loss": 0.0906, + "step": 43070 + }, + { + "epoch": 2.818449460255152, + "grad_norm": 0.9491099119186401, + "learning_rate": 9.051608199770016e-05, + "loss": 0.0794, + "step": 43080 + }, + { + "epoch": 2.819103696434413, + "grad_norm": 0.7243504524230957, + "learning_rate": 9.051069848561787e-05, + "loss": 0.0793, + "step": 43090 + }, + { + "epoch": 2.8197579326136735, + "grad_norm": 1.0417028665542603, + "learning_rate": 9.050531360619328e-05, + "loss": 0.0791, + "step": 43100 + }, + { + "epoch": 2.8204121687929344, + "grad_norm": 0.7411209344863892, + "learning_rate": 9.04999273596081e-05, + "loss": 0.0815, + "step": 43110 + }, + { + "epoch": 2.821066404972195, + "grad_norm": 0.7973235249519348, + "learning_rate": 9.049453974604418e-05, + "loss": 0.0752, + "step": 43120 + }, + { + "epoch": 2.8217206411514555, + "grad_norm": 0.9462906718254089, + "learning_rate": 9.048915076568334e-05, + "loss": 0.0808, + "step": 43130 + }, + { + "epoch": 2.8223748773307165, + "grad_norm": 1.0456901788711548, + "learning_rate": 9.048376041870745e-05, + "loss": 0.0969, + "step": 43140 + }, + { + "epoch": 2.823029113509977, + "grad_norm": 1.11058509349823, + "learning_rate": 9.04783687052985e-05, + "loss": 0.0767, + "step": 43150 + }, + { + "epoch": 2.823683349689238, + "grad_norm": 0.7125511765480042, + "learning_rate": 9.047297562563843e-05, + "loss": 0.0781, + "step": 43160 + }, + { + "epoch": 2.8243375858684985, + "grad_norm": 0.7025366425514221, + "learning_rate": 9.04675811799093e-05, + "loss": 0.0664, + "step": 43170 + }, + { + "epoch": 2.8249918220477594, + "grad_norm": 0.9043560028076172, + "learning_rate": 9.046218536829319e-05, + "loss": 0.077, + "step": 43180 + }, + { + "epoch": 2.82564605822702, + "grad_norm": 0.7520603537559509, + "learning_rate": 9.045678819097218e-05, + "loss": 0.0734, + "step": 43190 + }, + { + "epoch": 2.8263002944062805, + "grad_norm": 0.903668224811554, + "learning_rate": 9.045138964812848e-05, + "loss": 0.0824, + "step": 43200 + }, + { + "epoch": 2.8269545305855415, + "grad_norm": 0.8586761355400085, + "learning_rate": 9.044598973994429e-05, + "loss": 0.0787, + "step": 43210 + }, + { + "epoch": 2.827608766764802, + "grad_norm": 0.8279673457145691, + "learning_rate": 9.044058846660187e-05, + "loss": 0.0903, + "step": 43220 + }, + { + "epoch": 2.828263002944063, + "grad_norm": 0.7114253640174866, + "learning_rate": 9.043518582828354e-05, + "loss": 0.0761, + "step": 43230 + }, + { + "epoch": 2.8289172391233235, + "grad_norm": 0.8194983601570129, + "learning_rate": 9.042978182517163e-05, + "loss": 0.0902, + "step": 43240 + }, + { + "epoch": 2.8295714753025845, + "grad_norm": 0.7969952821731567, + "learning_rate": 9.042437645744856e-05, + "loss": 0.0691, + "step": 43250 + }, + { + "epoch": 2.830225711481845, + "grad_norm": 0.7822580337524414, + "learning_rate": 9.041896972529677e-05, + "loss": 0.0752, + "step": 43260 + }, + { + "epoch": 2.8308799476611055, + "grad_norm": 0.7220231294631958, + "learning_rate": 9.041356162889873e-05, + "loss": 0.0728, + "step": 43270 + }, + { + "epoch": 2.8315341838403665, + "grad_norm": 0.7740190029144287, + "learning_rate": 9.040815216843702e-05, + "loss": 0.0816, + "step": 43280 + }, + { + "epoch": 2.832188420019627, + "grad_norm": 0.8419371247291565, + "learning_rate": 9.040274134409419e-05, + "loss": 0.0792, + "step": 43290 + }, + { + "epoch": 2.8328426561988875, + "grad_norm": 0.950294554233551, + "learning_rate": 9.039732915605287e-05, + "loss": 0.0856, + "step": 43300 + }, + { + "epoch": 2.8334968923781485, + "grad_norm": 0.9349603056907654, + "learning_rate": 9.039191560449575e-05, + "loss": 0.0815, + "step": 43310 + }, + { + "epoch": 2.8341511285574095, + "grad_norm": 0.8912672996520996, + "learning_rate": 9.038650068960556e-05, + "loss": 0.0821, + "step": 43320 + }, + { + "epoch": 2.83480536473667, + "grad_norm": 0.9765848517417908, + "learning_rate": 9.038108441156506e-05, + "loss": 0.0797, + "step": 43330 + }, + { + "epoch": 2.8354596009159305, + "grad_norm": 0.8395116329193115, + "learning_rate": 9.037566677055705e-05, + "loss": 0.0738, + "step": 43340 + }, + { + "epoch": 2.8361138370951915, + "grad_norm": 0.7863714098930359, + "learning_rate": 9.037024776676438e-05, + "loss": 0.087, + "step": 43350 + }, + { + "epoch": 2.836768073274452, + "grad_norm": 0.9483030438423157, + "learning_rate": 9.036482740036998e-05, + "loss": 0.0786, + "step": 43360 + }, + { + "epoch": 2.8374223094537125, + "grad_norm": 0.8250223994255066, + "learning_rate": 9.03594056715568e-05, + "loss": 0.0719, + "step": 43370 + }, + { + "epoch": 2.8380765456329735, + "grad_norm": 0.8315423727035522, + "learning_rate": 9.035398258050784e-05, + "loss": 0.0811, + "step": 43380 + }, + { + "epoch": 2.8387307818122345, + "grad_norm": 0.8926718235015869, + "learning_rate": 9.034855812740612e-05, + "loss": 0.0749, + "step": 43390 + }, + { + "epoch": 2.839385017991495, + "grad_norm": 0.8397383689880371, + "learning_rate": 9.034313231243477e-05, + "loss": 0.0738, + "step": 43400 + }, + { + "epoch": 2.8400392541707555, + "grad_norm": 0.900603711605072, + "learning_rate": 9.033770513577688e-05, + "loss": 0.0747, + "step": 43410 + }, + { + "epoch": 2.8406934903500165, + "grad_norm": 0.8045016527175903, + "learning_rate": 9.033227659761565e-05, + "loss": 0.0794, + "step": 43420 + }, + { + "epoch": 2.841347726529277, + "grad_norm": 0.861449122428894, + "learning_rate": 9.032684669813431e-05, + "loss": 0.0744, + "step": 43430 + }, + { + "epoch": 2.8420019627085376, + "grad_norm": 0.8809171319007874, + "learning_rate": 9.032141543751614e-05, + "loss": 0.0807, + "step": 43440 + }, + { + "epoch": 2.8426561988877985, + "grad_norm": 0.8424410820007324, + "learning_rate": 9.031598281594445e-05, + "loss": 0.0771, + "step": 43450 + }, + { + "epoch": 2.843310435067059, + "grad_norm": 0.788705587387085, + "learning_rate": 9.031054883360261e-05, + "loss": 0.0815, + "step": 43460 + }, + { + "epoch": 2.84396467124632, + "grad_norm": 0.7892941832542419, + "learning_rate": 9.030511349067404e-05, + "loss": 0.0707, + "step": 43470 + }, + { + "epoch": 2.8446189074255805, + "grad_norm": 0.8143117427825928, + "learning_rate": 9.029967678734216e-05, + "loss": 0.0738, + "step": 43480 + }, + { + "epoch": 2.8452731436048415, + "grad_norm": 0.9846142530441284, + "learning_rate": 9.02942387237905e-05, + "loss": 0.0776, + "step": 43490 + }, + { + "epoch": 2.845927379784102, + "grad_norm": 0.9837608933448792, + "learning_rate": 9.028879930020262e-05, + "loss": 0.0807, + "step": 43500 + }, + { + "epoch": 2.8465816159633626, + "grad_norm": 1.0797531604766846, + "learning_rate": 9.028335851676211e-05, + "loss": 0.0818, + "step": 43510 + }, + { + "epoch": 2.8472358521426235, + "grad_norm": 0.9254376292228699, + "learning_rate": 9.02779163736526e-05, + "loss": 0.0763, + "step": 43520 + }, + { + "epoch": 2.847890088321884, + "grad_norm": 0.9471592903137207, + "learning_rate": 9.027247287105776e-05, + "loss": 0.0708, + "step": 43530 + }, + { + "epoch": 2.848544324501145, + "grad_norm": 0.9006256461143494, + "learning_rate": 9.026702800916136e-05, + "loss": 0.0736, + "step": 43540 + }, + { + "epoch": 2.8491985606804056, + "grad_norm": 0.888878345489502, + "learning_rate": 9.026158178814715e-05, + "loss": 0.0706, + "step": 43550 + }, + { + "epoch": 2.8498527968596665, + "grad_norm": 0.9179434180259705, + "learning_rate": 9.025613420819897e-05, + "loss": 0.076, + "step": 43560 + }, + { + "epoch": 2.850507033038927, + "grad_norm": 0.9074091911315918, + "learning_rate": 9.025068526950069e-05, + "loss": 0.0775, + "step": 43570 + }, + { + "epoch": 2.8511612692181876, + "grad_norm": 0.7732144594192505, + "learning_rate": 9.024523497223622e-05, + "loss": 0.077, + "step": 43580 + }, + { + "epoch": 2.8518155053974485, + "grad_norm": 0.8621454834938049, + "learning_rate": 9.02397833165895e-05, + "loss": 0.0852, + "step": 43590 + }, + { + "epoch": 2.852469741576709, + "grad_norm": 0.9191235303878784, + "learning_rate": 9.023433030274459e-05, + "loss": 0.0815, + "step": 43600 + }, + { + "epoch": 2.85312397775597, + "grad_norm": 0.7941545248031616, + "learning_rate": 9.022887593088551e-05, + "loss": 0.0837, + "step": 43610 + }, + { + "epoch": 2.8537782139352306, + "grad_norm": 1.0019004344940186, + "learning_rate": 9.022342020119637e-05, + "loss": 0.0839, + "step": 43620 + }, + { + "epoch": 2.8544324501144915, + "grad_norm": 1.2812579870224, + "learning_rate": 9.021796311386128e-05, + "loss": 0.0911, + "step": 43630 + }, + { + "epoch": 2.855086686293752, + "grad_norm": 0.9368396997451782, + "learning_rate": 9.021250466906448e-05, + "loss": 0.0748, + "step": 43640 + }, + { + "epoch": 2.8557409224730126, + "grad_norm": 0.8556075096130371, + "learning_rate": 9.020704486699018e-05, + "loss": 0.0798, + "step": 43650 + }, + { + "epoch": 2.8563951586522736, + "grad_norm": 0.6791936755180359, + "learning_rate": 9.020158370782266e-05, + "loss": 0.0834, + "step": 43660 + }, + { + "epoch": 2.857049394831534, + "grad_norm": 0.7594597339630127, + "learning_rate": 9.019612119174627e-05, + "loss": 0.0826, + "step": 43670 + }, + { + "epoch": 2.857703631010795, + "grad_norm": 0.8290628790855408, + "learning_rate": 9.019065731894537e-05, + "loss": 0.0792, + "step": 43680 + }, + { + "epoch": 2.8583578671900556, + "grad_norm": 0.8027207255363464, + "learning_rate": 9.018519208960439e-05, + "loss": 0.0695, + "step": 43690 + }, + { + "epoch": 2.8590121033693165, + "grad_norm": 0.8073681592941284, + "learning_rate": 9.017972550390777e-05, + "loss": 0.0823, + "step": 43700 + }, + { + "epoch": 2.859666339548577, + "grad_norm": 0.971709668636322, + "learning_rate": 9.017425756204005e-05, + "loss": 0.0919, + "step": 43710 + }, + { + "epoch": 2.8603205757278376, + "grad_norm": 0.8581392765045166, + "learning_rate": 9.016878826418577e-05, + "loss": 0.0817, + "step": 43720 + }, + { + "epoch": 2.8609748119070986, + "grad_norm": 0.8417885303497314, + "learning_rate": 9.016331761052954e-05, + "loss": 0.0865, + "step": 43730 + }, + { + "epoch": 2.861629048086359, + "grad_norm": 1.0083153247833252, + "learning_rate": 9.015784560125602e-05, + "loss": 0.077, + "step": 43740 + }, + { + "epoch": 2.8622832842656196, + "grad_norm": 0.8471361398696899, + "learning_rate": 9.015237223654988e-05, + "loss": 0.0737, + "step": 43750 + }, + { + "epoch": 2.8629375204448806, + "grad_norm": 0.8287156224250793, + "learning_rate": 9.014689751659589e-05, + "loss": 0.0779, + "step": 43760 + }, + { + "epoch": 2.8635917566241416, + "grad_norm": 0.8068413734436035, + "learning_rate": 9.01414214415788e-05, + "loss": 0.0885, + "step": 43770 + }, + { + "epoch": 2.864245992803402, + "grad_norm": 0.8625628352165222, + "learning_rate": 9.013594401168346e-05, + "loss": 0.0933, + "step": 43780 + }, + { + "epoch": 2.8649002289826626, + "grad_norm": 0.9650261998176575, + "learning_rate": 9.013046522709477e-05, + "loss": 0.0869, + "step": 43790 + }, + { + "epoch": 2.8655544651619236, + "grad_norm": 0.7974777817726135, + "learning_rate": 9.01249850879976e-05, + "loss": 0.0733, + "step": 43800 + }, + { + "epoch": 2.866208701341184, + "grad_norm": 1.0560264587402344, + "learning_rate": 9.011950359457699e-05, + "loss": 0.0885, + "step": 43810 + }, + { + "epoch": 2.8668629375204446, + "grad_norm": 0.8955875635147095, + "learning_rate": 9.011402074701788e-05, + "loss": 0.0829, + "step": 43820 + }, + { + "epoch": 2.8675171736997056, + "grad_norm": 0.9224720001220703, + "learning_rate": 9.01085365455054e-05, + "loss": 0.077, + "step": 43830 + }, + { + "epoch": 2.8681714098789666, + "grad_norm": 0.9114225506782532, + "learning_rate": 9.010305099022462e-05, + "loss": 0.0843, + "step": 43840 + }, + { + "epoch": 2.868825646058227, + "grad_norm": 0.8845502138137817, + "learning_rate": 9.009756408136067e-05, + "loss": 0.0749, + "step": 43850 + }, + { + "epoch": 2.8694798822374876, + "grad_norm": 0.8369585275650024, + "learning_rate": 9.009207581909879e-05, + "loss": 0.0707, + "step": 43860 + }, + { + "epoch": 2.8701341184167486, + "grad_norm": 0.8531777262687683, + "learning_rate": 9.008658620362421e-05, + "loss": 0.0816, + "step": 43870 + }, + { + "epoch": 2.870788354596009, + "grad_norm": 0.9684780836105347, + "learning_rate": 9.00810952351222e-05, + "loss": 0.0852, + "step": 43880 + }, + { + "epoch": 2.8714425907752696, + "grad_norm": 1.1896883249282837, + "learning_rate": 9.007560291377813e-05, + "loss": 0.0782, + "step": 43890 + }, + { + "epoch": 2.8720968269545306, + "grad_norm": 0.8664726614952087, + "learning_rate": 9.007010923977732e-05, + "loss": 0.0761, + "step": 43900 + }, + { + "epoch": 2.872751063133791, + "grad_norm": 0.7793802618980408, + "learning_rate": 9.006461421330528e-05, + "loss": 0.0801, + "step": 43910 + }, + { + "epoch": 2.873405299313052, + "grad_norm": 0.8414815664291382, + "learning_rate": 9.005911783454742e-05, + "loss": 0.0774, + "step": 43920 + }, + { + "epoch": 2.8740595354923126, + "grad_norm": 0.9556751251220703, + "learning_rate": 9.005362010368926e-05, + "loss": 0.0771, + "step": 43930 + }, + { + "epoch": 2.8747137716715736, + "grad_norm": 0.720085620880127, + "learning_rate": 9.00481210209164e-05, + "loss": 0.077, + "step": 43940 + }, + { + "epoch": 2.875368007850834, + "grad_norm": 0.7043220400810242, + "learning_rate": 9.004262058641441e-05, + "loss": 0.0704, + "step": 43950 + }, + { + "epoch": 2.8760222440300947, + "grad_norm": 0.9286543726921082, + "learning_rate": 9.003711880036896e-05, + "loss": 0.0778, + "step": 43960 + }, + { + "epoch": 2.8766764802093556, + "grad_norm": 0.8943794369697571, + "learning_rate": 9.003161566296576e-05, + "loss": 0.0684, + "step": 43970 + }, + { + "epoch": 2.877330716388616, + "grad_norm": 0.7501460313796997, + "learning_rate": 9.002611117439054e-05, + "loss": 0.0767, + "step": 43980 + }, + { + "epoch": 2.877984952567877, + "grad_norm": 0.9430428743362427, + "learning_rate": 9.002060533482909e-05, + "loss": 0.0751, + "step": 43990 + }, + { + "epoch": 2.8786391887471376, + "grad_norm": 0.986933171749115, + "learning_rate": 9.001509814446726e-05, + "loss": 0.0842, + "step": 44000 + }, + { + "epoch": 2.8792934249263986, + "grad_norm": 0.7722887992858887, + "learning_rate": 9.000958960349092e-05, + "loss": 0.0782, + "step": 44010 + }, + { + "epoch": 2.879947661105659, + "grad_norm": 0.8584244251251221, + "learning_rate": 9.0004079712086e-05, + "loss": 0.0796, + "step": 44020 + }, + { + "epoch": 2.8806018972849197, + "grad_norm": 0.8732749819755554, + "learning_rate": 8.999856847043849e-05, + "loss": 0.0848, + "step": 44030 + }, + { + "epoch": 2.8812561334641806, + "grad_norm": 0.8751747012138367, + "learning_rate": 8.999305587873437e-05, + "loss": 0.0776, + "step": 44040 + }, + { + "epoch": 2.881910369643441, + "grad_norm": 0.9108805060386658, + "learning_rate": 8.998754193715974e-05, + "loss": 0.0745, + "step": 44050 + }, + { + "epoch": 2.882564605822702, + "grad_norm": 0.9251412749290466, + "learning_rate": 8.99820266459007e-05, + "loss": 0.0819, + "step": 44060 + }, + { + "epoch": 2.8832188420019627, + "grad_norm": 1.0415140390396118, + "learning_rate": 8.99765100051434e-05, + "loss": 0.072, + "step": 44070 + }, + { + "epoch": 2.8838730781812236, + "grad_norm": 0.8129945397377014, + "learning_rate": 8.997099201507406e-05, + "loss": 0.0788, + "step": 44080 + }, + { + "epoch": 2.884527314360484, + "grad_norm": 0.760057270526886, + "learning_rate": 8.996547267587889e-05, + "loss": 0.0876, + "step": 44090 + }, + { + "epoch": 2.8851815505397447, + "grad_norm": 0.9552372694015503, + "learning_rate": 8.995995198774421e-05, + "loss": 0.0968, + "step": 44100 + }, + { + "epoch": 2.8858357867190056, + "grad_norm": 0.7973808646202087, + "learning_rate": 8.995442995085636e-05, + "loss": 0.0824, + "step": 44110 + }, + { + "epoch": 2.886490022898266, + "grad_norm": 0.7297688722610474, + "learning_rate": 8.99489065654017e-05, + "loss": 0.0783, + "step": 44120 + }, + { + "epoch": 2.887144259077527, + "grad_norm": 0.9278412461280823, + "learning_rate": 8.994338183156669e-05, + "loss": 0.083, + "step": 44130 + }, + { + "epoch": 2.8877984952567877, + "grad_norm": 0.9200074672698975, + "learning_rate": 8.993785574953778e-05, + "loss": 0.0781, + "step": 44140 + }, + { + "epoch": 2.8884527314360486, + "grad_norm": 0.8482991456985474, + "learning_rate": 8.99323283195015e-05, + "loss": 0.0826, + "step": 44150 + }, + { + "epoch": 2.889106967615309, + "grad_norm": 0.7914182543754578, + "learning_rate": 8.992679954164442e-05, + "loss": 0.0751, + "step": 44160 + }, + { + "epoch": 2.8897612037945697, + "grad_norm": 0.8093661069869995, + "learning_rate": 8.992126941615313e-05, + "loss": 0.0754, + "step": 44170 + }, + { + "epoch": 2.8904154399738307, + "grad_norm": 0.8084908127784729, + "learning_rate": 8.99157379432143e-05, + "loss": 0.0813, + "step": 44180 + }, + { + "epoch": 2.891069676153091, + "grad_norm": 0.7260185480117798, + "learning_rate": 8.991020512301464e-05, + "loss": 0.0744, + "step": 44190 + }, + { + "epoch": 2.8917239123323517, + "grad_norm": 0.7913504838943481, + "learning_rate": 8.990467095574089e-05, + "loss": 0.0689, + "step": 44200 + }, + { + "epoch": 2.8923781485116127, + "grad_norm": 0.6776607632637024, + "learning_rate": 8.989913544157983e-05, + "loss": 0.0707, + "step": 44210 + }, + { + "epoch": 2.8930323846908736, + "grad_norm": 0.9057397246360779, + "learning_rate": 8.98935985807183e-05, + "loss": 0.0873, + "step": 44220 + }, + { + "epoch": 2.893686620870134, + "grad_norm": 0.9263217449188232, + "learning_rate": 8.988806037334322e-05, + "loss": 0.0689, + "step": 44230 + }, + { + "epoch": 2.8943408570493947, + "grad_norm": 0.9351694583892822, + "learning_rate": 8.988252081964147e-05, + "loss": 0.0806, + "step": 44240 + }, + { + "epoch": 2.8949950932286557, + "grad_norm": 0.75326007604599, + "learning_rate": 8.987697991980007e-05, + "loss": 0.0745, + "step": 44250 + }, + { + "epoch": 2.895649329407916, + "grad_norm": 1.0243571996688843, + "learning_rate": 8.987143767400601e-05, + "loss": 0.0782, + "step": 44260 + }, + { + "epoch": 2.8963035655871767, + "grad_norm": 0.8010449409484863, + "learning_rate": 8.986589408244634e-05, + "loss": 0.0745, + "step": 44270 + }, + { + "epoch": 2.8969578017664377, + "grad_norm": 0.9779613614082336, + "learning_rate": 8.986034914530823e-05, + "loss": 0.0723, + "step": 44280 + }, + { + "epoch": 2.8976120379456987, + "grad_norm": 0.8159413933753967, + "learning_rate": 8.985480286277877e-05, + "loss": 0.0781, + "step": 44290 + }, + { + "epoch": 2.898266274124959, + "grad_norm": 0.8815252184867859, + "learning_rate": 8.984925523504519e-05, + "loss": 0.0839, + "step": 44300 + }, + { + "epoch": 2.8989205103042197, + "grad_norm": 0.7953386306762695, + "learning_rate": 8.984370626229474e-05, + "loss": 0.0898, + "step": 44310 + }, + { + "epoch": 2.8995747464834807, + "grad_norm": 0.8342095017433167, + "learning_rate": 8.983815594471472e-05, + "loss": 0.0854, + "step": 44320 + }, + { + "epoch": 2.900228982662741, + "grad_norm": 0.864380955696106, + "learning_rate": 8.983260428249246e-05, + "loss": 0.0869, + "step": 44330 + }, + { + "epoch": 2.9008832188420017, + "grad_norm": 0.9503182172775269, + "learning_rate": 8.982705127581533e-05, + "loss": 0.0722, + "step": 44340 + }, + { + "epoch": 2.9015374550212627, + "grad_norm": 0.8214649558067322, + "learning_rate": 8.982149692487078e-05, + "loss": 0.0772, + "step": 44350 + }, + { + "epoch": 2.9021916912005232, + "grad_norm": 0.8064943552017212, + "learning_rate": 8.981594122984627e-05, + "loss": 0.0757, + "step": 44360 + }, + { + "epoch": 2.902845927379784, + "grad_norm": 0.7172738909721375, + "learning_rate": 8.981038419092931e-05, + "loss": 0.0714, + "step": 44370 + }, + { + "epoch": 2.9035001635590447, + "grad_norm": 0.9183410406112671, + "learning_rate": 8.980482580830747e-05, + "loss": 0.0785, + "step": 44380 + }, + { + "epoch": 2.9041543997383057, + "grad_norm": 0.8381460905075073, + "learning_rate": 8.97992660821684e-05, + "loss": 0.0714, + "step": 44390 + }, + { + "epoch": 2.904808635917566, + "grad_norm": 0.8850300908088684, + "learning_rate": 8.979370501269971e-05, + "loss": 0.0808, + "step": 44400 + }, + { + "epoch": 2.9054628720968267, + "grad_norm": 0.848743736743927, + "learning_rate": 8.97881426000891e-05, + "loss": 0.0867, + "step": 44410 + }, + { + "epoch": 2.9061171082760877, + "grad_norm": 0.7968313097953796, + "learning_rate": 8.978257884452433e-05, + "loss": 0.0775, + "step": 44420 + }, + { + "epoch": 2.9067713444553482, + "grad_norm": 0.9209117293357849, + "learning_rate": 8.977701374619321e-05, + "loss": 0.0702, + "step": 44430 + }, + { + "epoch": 2.907425580634609, + "grad_norm": 0.7473788857460022, + "learning_rate": 8.977144730528353e-05, + "loss": 0.0786, + "step": 44440 + }, + { + "epoch": 2.9080798168138697, + "grad_norm": 0.7625455856323242, + "learning_rate": 8.97658795219832e-05, + "loss": 0.0802, + "step": 44450 + }, + { + "epoch": 2.9087340529931307, + "grad_norm": 1.0252678394317627, + "learning_rate": 8.976031039648017e-05, + "loss": 0.0686, + "step": 44460 + }, + { + "epoch": 2.9093882891723912, + "grad_norm": 0.7554779052734375, + "learning_rate": 8.975473992896239e-05, + "loss": 0.0744, + "step": 44470 + }, + { + "epoch": 2.9100425253516518, + "grad_norm": 0.8992921710014343, + "learning_rate": 8.974916811961786e-05, + "loss": 0.0847, + "step": 44480 + }, + { + "epoch": 2.9106967615309127, + "grad_norm": 0.7527303695678711, + "learning_rate": 8.974359496863466e-05, + "loss": 0.0777, + "step": 44490 + }, + { + "epoch": 2.9113509977101732, + "grad_norm": 0.8266311883926392, + "learning_rate": 8.97380204762009e-05, + "loss": 0.076, + "step": 44500 + }, + { + "epoch": 2.912005233889434, + "grad_norm": 0.8097569942474365, + "learning_rate": 8.973244464250474e-05, + "loss": 0.0814, + "step": 44510 + }, + { + "epoch": 2.9126594700686947, + "grad_norm": 0.8458988070487976, + "learning_rate": 8.972686746773436e-05, + "loss": 0.0836, + "step": 44520 + }, + { + "epoch": 2.9133137062479557, + "grad_norm": 0.8754256963729858, + "learning_rate": 8.972128895207803e-05, + "loss": 0.0954, + "step": 44530 + }, + { + "epoch": 2.9139679424272162, + "grad_norm": 0.7934485673904419, + "learning_rate": 8.971570909572401e-05, + "loss": 0.0784, + "step": 44540 + }, + { + "epoch": 2.9146221786064768, + "grad_norm": 0.9816693067550659, + "learning_rate": 8.971012789886066e-05, + "loss": 0.0724, + "step": 44550 + }, + { + "epoch": 2.9152764147857377, + "grad_norm": 0.8291686773300171, + "learning_rate": 8.970454536167634e-05, + "loss": 0.0774, + "step": 44560 + }, + { + "epoch": 2.9159306509649983, + "grad_norm": 0.7562156915664673, + "learning_rate": 8.96989614843595e-05, + "loss": 0.0802, + "step": 44570 + }, + { + "epoch": 2.9165848871442592, + "grad_norm": 0.680461049079895, + "learning_rate": 8.969337626709858e-05, + "loss": 0.0795, + "step": 44580 + }, + { + "epoch": 2.9172391233235198, + "grad_norm": 0.8741086721420288, + "learning_rate": 8.968778971008211e-05, + "loss": 0.0826, + "step": 44590 + }, + { + "epoch": 2.9178933595027807, + "grad_norm": 0.79926598072052, + "learning_rate": 8.968220181349866e-05, + "loss": 0.0677, + "step": 44600 + }, + { + "epoch": 2.9185475956820413, + "grad_norm": 0.7387502193450928, + "learning_rate": 8.967661257753683e-05, + "loss": 0.0756, + "step": 44610 + }, + { + "epoch": 2.9192018318613018, + "grad_norm": 0.8162443041801453, + "learning_rate": 8.967102200238527e-05, + "loss": 0.0716, + "step": 44620 + }, + { + "epoch": 2.9198560680405627, + "grad_norm": 0.8832371830940247, + "learning_rate": 8.966543008823268e-05, + "loss": 0.0711, + "step": 44630 + }, + { + "epoch": 2.9205103042198233, + "grad_norm": 0.8890617489814758, + "learning_rate": 8.965983683526779e-05, + "loss": 0.0754, + "step": 44640 + }, + { + "epoch": 2.921164540399084, + "grad_norm": 0.8303614854812622, + "learning_rate": 8.96542422436794e-05, + "loss": 0.0728, + "step": 44650 + }, + { + "epoch": 2.9218187765783448, + "grad_norm": 0.7883252501487732, + "learning_rate": 8.964864631365634e-05, + "loss": 0.0793, + "step": 44660 + }, + { + "epoch": 2.9224730127576057, + "grad_norm": 1.1344659328460693, + "learning_rate": 8.964304904538747e-05, + "loss": 0.0856, + "step": 44670 + }, + { + "epoch": 2.9231272489368663, + "grad_norm": 0.7178243398666382, + "learning_rate": 8.963745043906174e-05, + "loss": 0.0784, + "step": 44680 + }, + { + "epoch": 2.923781485116127, + "grad_norm": 0.9424094557762146, + "learning_rate": 8.96318504948681e-05, + "loss": 0.0805, + "step": 44690 + }, + { + "epoch": 2.9244357212953878, + "grad_norm": 0.6671366691589355, + "learning_rate": 8.962624921299558e-05, + "loss": 0.0822, + "step": 44700 + }, + { + "epoch": 2.9250899574746483, + "grad_norm": 0.7286959290504456, + "learning_rate": 8.962064659363321e-05, + "loss": 0.0875, + "step": 44710 + }, + { + "epoch": 2.925744193653909, + "grad_norm": 0.8117363452911377, + "learning_rate": 8.961504263697011e-05, + "loss": 0.0746, + "step": 44720 + }, + { + "epoch": 2.92639842983317, + "grad_norm": 0.8143247365951538, + "learning_rate": 8.960943734319542e-05, + "loss": 0.083, + "step": 44730 + }, + { + "epoch": 2.9270526660124307, + "grad_norm": 0.8705511689186096, + "learning_rate": 8.960383071249836e-05, + "loss": 0.0858, + "step": 44740 + }, + { + "epoch": 2.9277069021916913, + "grad_norm": 1.2389694452285767, + "learning_rate": 8.959822274506812e-05, + "loss": 0.0911, + "step": 44750 + }, + { + "epoch": 2.928361138370952, + "grad_norm": 0.7589144110679626, + "learning_rate": 8.959261344109404e-05, + "loss": 0.0772, + "step": 44760 + }, + { + "epoch": 2.9290153745502128, + "grad_norm": 0.8595485687255859, + "learning_rate": 8.958700280076542e-05, + "loss": 0.0727, + "step": 44770 + }, + { + "epoch": 2.9296696107294733, + "grad_norm": 1.0086075067520142, + "learning_rate": 8.958139082427162e-05, + "loss": 0.0796, + "step": 44780 + }, + { + "epoch": 2.930323846908734, + "grad_norm": 0.928106963634491, + "learning_rate": 8.957577751180209e-05, + "loss": 0.0692, + "step": 44790 + }, + { + "epoch": 2.930978083087995, + "grad_norm": 0.8739972114562988, + "learning_rate": 8.957016286354626e-05, + "loss": 0.0811, + "step": 44800 + }, + { + "epoch": 2.9316323192672553, + "grad_norm": 0.870585024356842, + "learning_rate": 8.956454687969367e-05, + "loss": 0.0709, + "step": 44810 + }, + { + "epoch": 2.9322865554465163, + "grad_norm": 1.0133564472198486, + "learning_rate": 8.955892956043385e-05, + "loss": 0.0771, + "step": 44820 + }, + { + "epoch": 2.932940791625777, + "grad_norm": 0.8007000684738159, + "learning_rate": 8.955331090595642e-05, + "loss": 0.0804, + "step": 44830 + }, + { + "epoch": 2.933595027805038, + "grad_norm": 0.8323869109153748, + "learning_rate": 8.9547690916451e-05, + "loss": 0.0738, + "step": 44840 + }, + { + "epoch": 2.9342492639842983, + "grad_norm": 0.6912589073181152, + "learning_rate": 8.954206959210731e-05, + "loss": 0.0738, + "step": 44850 + }, + { + "epoch": 2.934903500163559, + "grad_norm": 0.9906479716300964, + "learning_rate": 8.953644693311506e-05, + "loss": 0.0778, + "step": 44860 + }, + { + "epoch": 2.93555773634282, + "grad_norm": 0.6855130791664124, + "learning_rate": 8.953082293966404e-05, + "loss": 0.0713, + "step": 44870 + }, + { + "epoch": 2.9362119725220803, + "grad_norm": 0.7666285634040833, + "learning_rate": 8.952519761194407e-05, + "loss": 0.0767, + "step": 44880 + }, + { + "epoch": 2.9368662087013413, + "grad_norm": 0.8128482103347778, + "learning_rate": 8.951957095014503e-05, + "loss": 0.0734, + "step": 44890 + }, + { + "epoch": 2.937520444880602, + "grad_norm": 1.090126872062683, + "learning_rate": 8.951394295445681e-05, + "loss": 0.0873, + "step": 44900 + }, + { + "epoch": 2.938174681059863, + "grad_norm": 0.8404224514961243, + "learning_rate": 8.950831362506941e-05, + "loss": 0.073, + "step": 44910 + }, + { + "epoch": 2.9388289172391233, + "grad_norm": 0.8833885192871094, + "learning_rate": 8.950268296217279e-05, + "loss": 0.0803, + "step": 44920 + }, + { + "epoch": 2.939483153418384, + "grad_norm": 0.8460028767585754, + "learning_rate": 8.949705096595703e-05, + "loss": 0.0759, + "step": 44930 + }, + { + "epoch": 2.940137389597645, + "grad_norm": 0.7936156988143921, + "learning_rate": 8.949141763661222e-05, + "loss": 0.0911, + "step": 44940 + }, + { + "epoch": 2.9407916257769053, + "grad_norm": 0.8652473092079163, + "learning_rate": 8.948578297432848e-05, + "loss": 0.0829, + "step": 44950 + }, + { + "epoch": 2.9414458619561663, + "grad_norm": 0.9275278449058533, + "learning_rate": 8.948014697929603e-05, + "loss": 0.0766, + "step": 44960 + }, + { + "epoch": 2.942100098135427, + "grad_norm": 0.6803890466690063, + "learning_rate": 8.947450965170505e-05, + "loss": 0.0729, + "step": 44970 + }, + { + "epoch": 2.942754334314688, + "grad_norm": 0.8319152593612671, + "learning_rate": 8.946887099174587e-05, + "loss": 0.0726, + "step": 44980 + }, + { + "epoch": 2.9434085704939483, + "grad_norm": 0.9895662069320679, + "learning_rate": 8.946323099960877e-05, + "loss": 0.0735, + "step": 44990 + }, + { + "epoch": 2.944062806673209, + "grad_norm": 0.899389386177063, + "learning_rate": 8.945758967548415e-05, + "loss": 0.0678, + "step": 45000 + }, + { + "epoch": 2.94471704285247, + "grad_norm": 0.9055632948875427, + "learning_rate": 8.945194701956236e-05, + "loss": 0.0743, + "step": 45010 + }, + { + "epoch": 2.9453712790317303, + "grad_norm": 0.8592407703399658, + "learning_rate": 8.944630303203391e-05, + "loss": 0.0785, + "step": 45020 + }, + { + "epoch": 2.9460255152109913, + "grad_norm": 0.8779101967811584, + "learning_rate": 8.944065771308928e-05, + "loss": 0.0785, + "step": 45030 + }, + { + "epoch": 2.946679751390252, + "grad_norm": 0.8470450639724731, + "learning_rate": 8.943501106291901e-05, + "loss": 0.0746, + "step": 45040 + }, + { + "epoch": 2.947333987569513, + "grad_norm": 0.8635161519050598, + "learning_rate": 8.94293630817137e-05, + "loss": 0.0732, + "step": 45050 + }, + { + "epoch": 2.9479882237487733, + "grad_norm": 0.892550528049469, + "learning_rate": 8.942371376966398e-05, + "loss": 0.0829, + "step": 45060 + }, + { + "epoch": 2.948642459928034, + "grad_norm": 0.7665706276893616, + "learning_rate": 8.941806312696054e-05, + "loss": 0.0738, + "step": 45070 + }, + { + "epoch": 2.949296696107295, + "grad_norm": 0.8420522212982178, + "learning_rate": 8.941241115379408e-05, + "loss": 0.0801, + "step": 45080 + }, + { + "epoch": 2.9499509322865554, + "grad_norm": 0.8662616610527039, + "learning_rate": 8.940675785035538e-05, + "loss": 0.0738, + "step": 45090 + }, + { + "epoch": 2.950605168465816, + "grad_norm": 0.7357196807861328, + "learning_rate": 8.940110321683525e-05, + "loss": 0.0687, + "step": 45100 + }, + { + "epoch": 2.951259404645077, + "grad_norm": 0.8650752305984497, + "learning_rate": 8.939544725342454e-05, + "loss": 0.0807, + "step": 45110 + }, + { + "epoch": 2.951913640824338, + "grad_norm": 1.0531340837478638, + "learning_rate": 8.93897899603142e-05, + "loss": 0.09, + "step": 45120 + }, + { + "epoch": 2.9525678770035984, + "grad_norm": 0.7418067455291748, + "learning_rate": 8.938413133769514e-05, + "loss": 0.0777, + "step": 45130 + }, + { + "epoch": 2.953222113182859, + "grad_norm": 1.0505800247192383, + "learning_rate": 8.937847138575833e-05, + "loss": 0.0821, + "step": 45140 + }, + { + "epoch": 2.95387634936212, + "grad_norm": 0.9159501194953918, + "learning_rate": 8.937281010469486e-05, + "loss": 0.0742, + "step": 45150 + }, + { + "epoch": 2.9545305855413804, + "grad_norm": 0.8090870976448059, + "learning_rate": 8.936714749469579e-05, + "loss": 0.0825, + "step": 45160 + }, + { + "epoch": 2.955184821720641, + "grad_norm": 0.8931045532226562, + "learning_rate": 8.936148355595224e-05, + "loss": 0.0765, + "step": 45170 + }, + { + "epoch": 2.955839057899902, + "grad_norm": 0.8412348628044128, + "learning_rate": 8.93558182886554e-05, + "loss": 0.0762, + "step": 45180 + }, + { + "epoch": 2.956493294079163, + "grad_norm": 0.832350492477417, + "learning_rate": 8.935015169299646e-05, + "loss": 0.086, + "step": 45190 + }, + { + "epoch": 2.9571475302584234, + "grad_norm": 0.8142176866531372, + "learning_rate": 8.934448376916672e-05, + "loss": 0.0752, + "step": 45200 + }, + { + "epoch": 2.957801766437684, + "grad_norm": 0.7122102379798889, + "learning_rate": 8.933881451735746e-05, + "loss": 0.0754, + "step": 45210 + }, + { + "epoch": 2.958456002616945, + "grad_norm": 0.9692149758338928, + "learning_rate": 8.933314393776005e-05, + "loss": 0.0744, + "step": 45220 + }, + { + "epoch": 2.9591102387962054, + "grad_norm": 0.8291851878166199, + "learning_rate": 8.932747203056586e-05, + "loss": 0.0773, + "step": 45230 + }, + { + "epoch": 2.959764474975466, + "grad_norm": 0.7037114500999451, + "learning_rate": 8.932179879596636e-05, + "loss": 0.0838, + "step": 45240 + }, + { + "epoch": 2.960418711154727, + "grad_norm": 0.7591250538825989, + "learning_rate": 8.9316124234153e-05, + "loss": 0.0729, + "step": 45250 + }, + { + "epoch": 2.9610729473339874, + "grad_norm": 0.8546152114868164, + "learning_rate": 8.931044834531737e-05, + "loss": 0.074, + "step": 45260 + }, + { + "epoch": 2.9617271835132484, + "grad_norm": 0.8192047476768494, + "learning_rate": 8.930477112965102e-05, + "loss": 0.0773, + "step": 45270 + }, + { + "epoch": 2.962381419692509, + "grad_norm": 0.7500526905059814, + "learning_rate": 8.929909258734553e-05, + "loss": 0.0756, + "step": 45280 + }, + { + "epoch": 2.96303565587177, + "grad_norm": 0.8506186008453369, + "learning_rate": 8.929341271859262e-05, + "loss": 0.0812, + "step": 45290 + }, + { + "epoch": 2.9636898920510304, + "grad_norm": 0.7039439678192139, + "learning_rate": 8.928773152358398e-05, + "loss": 0.0697, + "step": 45300 + }, + { + "epoch": 2.964344128230291, + "grad_norm": 1.1579927206039429, + "learning_rate": 8.928204900251136e-05, + "loss": 0.0707, + "step": 45310 + }, + { + "epoch": 2.964998364409552, + "grad_norm": 0.8826687932014465, + "learning_rate": 8.927636515556657e-05, + "loss": 0.0853, + "step": 45320 + }, + { + "epoch": 2.9656526005888124, + "grad_norm": 0.7980294823646545, + "learning_rate": 8.927067998294145e-05, + "loss": 0.0846, + "step": 45330 + }, + { + "epoch": 2.9663068367680734, + "grad_norm": 0.7435978651046753, + "learning_rate": 8.926499348482787e-05, + "loss": 0.067, + "step": 45340 + }, + { + "epoch": 2.966961072947334, + "grad_norm": 0.715040922164917, + "learning_rate": 8.92593056614178e-05, + "loss": 0.0712, + "step": 45350 + }, + { + "epoch": 2.967615309126595, + "grad_norm": 0.9521136283874512, + "learning_rate": 8.925361651290321e-05, + "loss": 0.0809, + "step": 45360 + }, + { + "epoch": 2.9682695453058554, + "grad_norm": 0.7266244888305664, + "learning_rate": 8.924792603947611e-05, + "loss": 0.0802, + "step": 45370 + }, + { + "epoch": 2.968923781485116, + "grad_norm": 0.9576558470726013, + "learning_rate": 8.924223424132856e-05, + "loss": 0.0811, + "step": 45380 + }, + { + "epoch": 2.969578017664377, + "grad_norm": 0.8989412784576416, + "learning_rate": 8.92365411186527e-05, + "loss": 0.0844, + "step": 45390 + }, + { + "epoch": 2.9702322538436374, + "grad_norm": 0.9169609546661377, + "learning_rate": 8.923084667164067e-05, + "loss": 0.0728, + "step": 45400 + }, + { + "epoch": 2.9708864900228984, + "grad_norm": 0.9715309739112854, + "learning_rate": 8.92251509004847e-05, + "loss": 0.0868, + "step": 45410 + }, + { + "epoch": 2.971540726202159, + "grad_norm": 0.8212486505508423, + "learning_rate": 8.9219453805377e-05, + "loss": 0.0794, + "step": 45420 + }, + { + "epoch": 2.97219496238142, + "grad_norm": 0.8119533658027649, + "learning_rate": 8.921375538650987e-05, + "loss": 0.0831, + "step": 45430 + }, + { + "epoch": 2.9728491985606804, + "grad_norm": 0.8991037011146545, + "learning_rate": 8.920805564407565e-05, + "loss": 0.081, + "step": 45440 + }, + { + "epoch": 2.973503434739941, + "grad_norm": 0.6757112145423889, + "learning_rate": 8.920235457826675e-05, + "loss": 0.0706, + "step": 45450 + }, + { + "epoch": 2.974157670919202, + "grad_norm": 0.8806977272033691, + "learning_rate": 8.919665218927556e-05, + "loss": 0.068, + "step": 45460 + }, + { + "epoch": 2.9748119070984624, + "grad_norm": 1.013301968574524, + "learning_rate": 8.919094847729455e-05, + "loss": 0.0812, + "step": 45470 + }, + { + "epoch": 2.9754661432777234, + "grad_norm": 0.9366437792778015, + "learning_rate": 8.918524344251626e-05, + "loss": 0.0773, + "step": 45480 + }, + { + "epoch": 2.976120379456984, + "grad_norm": 0.8117071986198425, + "learning_rate": 8.917953708513324e-05, + "loss": 0.0676, + "step": 45490 + }, + { + "epoch": 2.976774615636245, + "grad_norm": 1.0920495986938477, + "learning_rate": 8.917382940533808e-05, + "loss": 0.0787, + "step": 45500 + }, + { + "epoch": 2.9774288518155054, + "grad_norm": 0.7851189970970154, + "learning_rate": 8.916812040332344e-05, + "loss": 0.0745, + "step": 45510 + }, + { + "epoch": 2.978083087994766, + "grad_norm": 0.8046877384185791, + "learning_rate": 8.916241007928203e-05, + "loss": 0.0787, + "step": 45520 + }, + { + "epoch": 2.978737324174027, + "grad_norm": 0.8224988579750061, + "learning_rate": 8.915669843340655e-05, + "loss": 0.0895, + "step": 45530 + }, + { + "epoch": 2.9793915603532874, + "grad_norm": 0.9415143132209778, + "learning_rate": 8.915098546588983e-05, + "loss": 0.0723, + "step": 45540 + }, + { + "epoch": 2.980045796532548, + "grad_norm": 0.9764366149902344, + "learning_rate": 8.914527117692465e-05, + "loss": 0.084, + "step": 45550 + }, + { + "epoch": 2.980700032711809, + "grad_norm": 0.774044930934906, + "learning_rate": 8.913955556670392e-05, + "loss": 0.0786, + "step": 45560 + }, + { + "epoch": 2.98135426889107, + "grad_norm": 0.8664652705192566, + "learning_rate": 8.913383863542054e-05, + "loss": 0.0765, + "step": 45570 + }, + { + "epoch": 2.9820085050703304, + "grad_norm": 0.8627799153327942, + "learning_rate": 8.912812038326746e-05, + "loss": 0.0839, + "step": 45580 + }, + { + "epoch": 2.982662741249591, + "grad_norm": 0.8925349712371826, + "learning_rate": 8.912240081043773e-05, + "loss": 0.0813, + "step": 45590 + }, + { + "epoch": 2.983316977428852, + "grad_norm": 0.8013269305229187, + "learning_rate": 8.911667991712433e-05, + "loss": 0.0722, + "step": 45600 + }, + { + "epoch": 2.9839712136081125, + "grad_norm": 0.6747923493385315, + "learning_rate": 8.911095770352043e-05, + "loss": 0.0757, + "step": 45610 + }, + { + "epoch": 2.984625449787373, + "grad_norm": 0.8834814429283142, + "learning_rate": 8.910523416981911e-05, + "loss": 0.081, + "step": 45620 + }, + { + "epoch": 2.985279685966634, + "grad_norm": 0.8894004225730896, + "learning_rate": 8.90995093162136e-05, + "loss": 0.0748, + "step": 45630 + }, + { + "epoch": 2.985933922145895, + "grad_norm": 0.7748491168022156, + "learning_rate": 8.909378314289708e-05, + "loss": 0.0718, + "step": 45640 + }, + { + "epoch": 2.9865881583251555, + "grad_norm": 0.8032260537147522, + "learning_rate": 8.908805565006288e-05, + "loss": 0.0714, + "step": 45650 + }, + { + "epoch": 2.987242394504416, + "grad_norm": 0.7444846630096436, + "learning_rate": 8.90823268379043e-05, + "loss": 0.0705, + "step": 45660 + }, + { + "epoch": 2.987896630683677, + "grad_norm": 0.8824648261070251, + "learning_rate": 8.907659670661467e-05, + "loss": 0.087, + "step": 45670 + }, + { + "epoch": 2.9885508668629375, + "grad_norm": 0.7613914608955383, + "learning_rate": 8.907086525638741e-05, + "loss": 0.0791, + "step": 45680 + }, + { + "epoch": 2.989205103042198, + "grad_norm": 1.0592745542526245, + "learning_rate": 8.9065132487416e-05, + "loss": 0.0757, + "step": 45690 + }, + { + "epoch": 2.989859339221459, + "grad_norm": 0.8600308299064636, + "learning_rate": 8.905939839989391e-05, + "loss": 0.0767, + "step": 45700 + }, + { + "epoch": 2.99051357540072, + "grad_norm": 0.9262317419052124, + "learning_rate": 8.90536629940147e-05, + "loss": 0.0879, + "step": 45710 + }, + { + "epoch": 2.9911678115799805, + "grad_norm": 0.7141725420951843, + "learning_rate": 8.904792626997191e-05, + "loss": 0.0699, + "step": 45720 + }, + { + "epoch": 2.991822047759241, + "grad_norm": 0.8366238474845886, + "learning_rate": 8.904218822795923e-05, + "loss": 0.0717, + "step": 45730 + }, + { + "epoch": 2.992476283938502, + "grad_norm": 0.937675416469574, + "learning_rate": 8.903644886817029e-05, + "loss": 0.0782, + "step": 45740 + }, + { + "epoch": 2.9931305201177625, + "grad_norm": 0.7373892068862915, + "learning_rate": 8.903070819079884e-05, + "loss": 0.0793, + "step": 45750 + }, + { + "epoch": 2.993784756297023, + "grad_norm": 0.9344786405563354, + "learning_rate": 8.902496619603862e-05, + "loss": 0.0852, + "step": 45760 + }, + { + "epoch": 2.994438992476284, + "grad_norm": 0.7279993295669556, + "learning_rate": 8.901922288408343e-05, + "loss": 0.0699, + "step": 45770 + }, + { + "epoch": 2.9950932286555445, + "grad_norm": 1.427483081817627, + "learning_rate": 8.901347825512715e-05, + "loss": 0.0969, + "step": 45780 + }, + { + "epoch": 2.9957474648348055, + "grad_norm": 0.707801103591919, + "learning_rate": 8.900773230936366e-05, + "loss": 0.0845, + "step": 45790 + }, + { + "epoch": 2.996401701014066, + "grad_norm": 0.7957580089569092, + "learning_rate": 8.900198504698689e-05, + "loss": 0.0735, + "step": 45800 + }, + { + "epoch": 2.997055937193327, + "grad_norm": 1.1683952808380127, + "learning_rate": 8.899623646819087e-05, + "loss": 0.0686, + "step": 45810 + }, + { + "epoch": 2.9977101733725875, + "grad_norm": 0.8779153227806091, + "learning_rate": 8.899048657316956e-05, + "loss": 0.0689, + "step": 45820 + }, + { + "epoch": 2.998364409551848, + "grad_norm": 0.8528333306312561, + "learning_rate": 8.89847353621171e-05, + "loss": 0.0696, + "step": 45830 + }, + { + "epoch": 2.999018645731109, + "grad_norm": 0.9389132261276245, + "learning_rate": 8.897898283522756e-05, + "loss": 0.0808, + "step": 45840 + }, + { + "epoch": 2.9996728819103695, + "grad_norm": 0.8795191049575806, + "learning_rate": 8.897322899269513e-05, + "loss": 0.0823, + "step": 45850 + }, + { + "epoch": 3.0003271180896305, + "grad_norm": 0.8083428740501404, + "learning_rate": 8.896747383471402e-05, + "loss": 0.0703, + "step": 45860 + }, + { + "epoch": 3.000981354268891, + "grad_norm": 1.016609787940979, + "learning_rate": 8.896171736147846e-05, + "loss": 0.0926, + "step": 45870 + }, + { + "epoch": 3.001635590448152, + "grad_norm": 0.9670884609222412, + "learning_rate": 8.895595957318277e-05, + "loss": 0.0756, + "step": 45880 + }, + { + "epoch": 3.0022898266274125, + "grad_norm": 0.9883562922477722, + "learning_rate": 8.895020047002127e-05, + "loss": 0.0797, + "step": 45890 + }, + { + "epoch": 3.002944062806673, + "grad_norm": 0.7922661304473877, + "learning_rate": 8.894444005218835e-05, + "loss": 0.0877, + "step": 45900 + }, + { + "epoch": 3.003598298985934, + "grad_norm": 0.8021111488342285, + "learning_rate": 8.893867831987845e-05, + "loss": 0.0731, + "step": 45910 + }, + { + "epoch": 3.0042525351651945, + "grad_norm": 0.8403140902519226, + "learning_rate": 8.893291527328604e-05, + "loss": 0.0819, + "step": 45920 + }, + { + "epoch": 3.0049067713444555, + "grad_norm": 1.021638035774231, + "learning_rate": 8.892715091260564e-05, + "loss": 0.077, + "step": 45930 + }, + { + "epoch": 3.005561007523716, + "grad_norm": 0.723585307598114, + "learning_rate": 8.89213852380318e-05, + "loss": 0.0711, + "step": 45940 + }, + { + "epoch": 3.006215243702977, + "grad_norm": 0.807429313659668, + "learning_rate": 8.891561824975911e-05, + "loss": 0.0756, + "step": 45950 + }, + { + "epoch": 3.0068694798822375, + "grad_norm": 0.9832606911659241, + "learning_rate": 8.890984994798229e-05, + "loss": 0.0752, + "step": 45960 + }, + { + "epoch": 3.007523716061498, + "grad_norm": 0.9934665560722351, + "learning_rate": 8.890408033289595e-05, + "loss": 0.0826, + "step": 45970 + }, + { + "epoch": 3.008177952240759, + "grad_norm": 0.8675631880760193, + "learning_rate": 8.889830940469487e-05, + "loss": 0.0748, + "step": 45980 + }, + { + "epoch": 3.0088321884200195, + "grad_norm": 0.8953355550765991, + "learning_rate": 8.889253716357385e-05, + "loss": 0.0859, + "step": 45990 + }, + { + "epoch": 3.0094864245992805, + "grad_norm": 0.9766877889633179, + "learning_rate": 8.88867636097277e-05, + "loss": 0.0766, + "step": 46000 + }, + { + "epoch": 3.010140660778541, + "grad_norm": 0.8925821185112, + "learning_rate": 8.888098874335129e-05, + "loss": 0.0754, + "step": 46010 + }, + { + "epoch": 3.0107948969578016, + "grad_norm": 0.9773553013801575, + "learning_rate": 8.887521256463953e-05, + "loss": 0.0707, + "step": 46020 + }, + { + "epoch": 3.0114491331370625, + "grad_norm": 1.1067787408828735, + "learning_rate": 8.886943507378741e-05, + "loss": 0.0846, + "step": 46030 + }, + { + "epoch": 3.012103369316323, + "grad_norm": 0.7693238854408264, + "learning_rate": 8.886365627098991e-05, + "loss": 0.0787, + "step": 46040 + }, + { + "epoch": 3.012757605495584, + "grad_norm": 0.9626716375350952, + "learning_rate": 8.885787615644208e-05, + "loss": 0.0769, + "step": 46050 + }, + { + "epoch": 3.0134118416748445, + "grad_norm": 1.0409643650054932, + "learning_rate": 8.885209473033905e-05, + "loss": 0.076, + "step": 46060 + }, + { + "epoch": 3.0140660778541055, + "grad_norm": 0.8555409908294678, + "learning_rate": 8.88463119928759e-05, + "loss": 0.0866, + "step": 46070 + }, + { + "epoch": 3.014720314033366, + "grad_norm": 0.8579902052879333, + "learning_rate": 8.884052794424785e-05, + "loss": 0.0643, + "step": 46080 + }, + { + "epoch": 3.0153745502126266, + "grad_norm": 1.0672752857208252, + "learning_rate": 8.883474258465013e-05, + "loss": 0.084, + "step": 46090 + }, + { + "epoch": 3.0160287863918875, + "grad_norm": 0.9049206972122192, + "learning_rate": 8.882895591427797e-05, + "loss": 0.0775, + "step": 46100 + }, + { + "epoch": 3.016683022571148, + "grad_norm": 0.9947949647903442, + "learning_rate": 8.882316793332674e-05, + "loss": 0.0693, + "step": 46110 + }, + { + "epoch": 3.017337258750409, + "grad_norm": 0.6652380228042603, + "learning_rate": 8.881737864199177e-05, + "loss": 0.0727, + "step": 46120 + }, + { + "epoch": 3.0179914949296696, + "grad_norm": 0.7874847650527954, + "learning_rate": 8.881158804046847e-05, + "loss": 0.0773, + "step": 46130 + }, + { + "epoch": 3.0186457311089305, + "grad_norm": 0.8908420205116272, + "learning_rate": 8.880579612895227e-05, + "loss": 0.0756, + "step": 46140 + }, + { + "epoch": 3.019299967288191, + "grad_norm": 0.7234876751899719, + "learning_rate": 8.88000029076387e-05, + "loss": 0.0749, + "step": 46150 + }, + { + "epoch": 3.0199542034674516, + "grad_norm": 0.6146116256713867, + "learning_rate": 8.879420837672327e-05, + "loss": 0.073, + "step": 46160 + }, + { + "epoch": 3.0206084396467126, + "grad_norm": 0.8692243099212646, + "learning_rate": 8.878841253640156e-05, + "loss": 0.0831, + "step": 46170 + }, + { + "epoch": 3.021262675825973, + "grad_norm": 0.9530091881752014, + "learning_rate": 8.878261538686921e-05, + "loss": 0.0667, + "step": 46180 + }, + { + "epoch": 3.021916912005234, + "grad_norm": 0.8727622032165527, + "learning_rate": 8.877681692832187e-05, + "loss": 0.075, + "step": 46190 + }, + { + "epoch": 3.0225711481844946, + "grad_norm": 0.7857223749160767, + "learning_rate": 8.877101716095525e-05, + "loss": 0.0768, + "step": 46200 + }, + { + "epoch": 3.023225384363755, + "grad_norm": 0.929732084274292, + "learning_rate": 8.876521608496516e-05, + "loss": 0.0851, + "step": 46210 + }, + { + "epoch": 3.023879620543016, + "grad_norm": 0.8777803182601929, + "learning_rate": 8.875941370054733e-05, + "loss": 0.0718, + "step": 46220 + }, + { + "epoch": 3.0245338567222766, + "grad_norm": 0.99214106798172, + "learning_rate": 8.875361000789764e-05, + "loss": 0.0747, + "step": 46230 + }, + { + "epoch": 3.0251880929015376, + "grad_norm": 1.0649714469909668, + "learning_rate": 8.874780500721198e-05, + "loss": 0.0842, + "step": 46240 + }, + { + "epoch": 3.025842329080798, + "grad_norm": 0.7679323554039001, + "learning_rate": 8.874199869868629e-05, + "loss": 0.0746, + "step": 46250 + }, + { + "epoch": 3.026496565260059, + "grad_norm": 0.8521194458007812, + "learning_rate": 8.873619108251653e-05, + "loss": 0.0822, + "step": 46260 + }, + { + "epoch": 3.0271508014393196, + "grad_norm": 1.05009126663208, + "learning_rate": 8.873038215889872e-05, + "loss": 0.0786, + "step": 46270 + }, + { + "epoch": 3.02780503761858, + "grad_norm": 0.8912391662597656, + "learning_rate": 8.872457192802897e-05, + "loss": 0.0807, + "step": 46280 + }, + { + "epoch": 3.028459273797841, + "grad_norm": 0.7786313891410828, + "learning_rate": 8.871876039010334e-05, + "loss": 0.0881, + "step": 46290 + }, + { + "epoch": 3.0291135099771016, + "grad_norm": 0.6473716497421265, + "learning_rate": 8.8712947545318e-05, + "loss": 0.0724, + "step": 46300 + }, + { + "epoch": 3.0297677461563626, + "grad_norm": 0.8231253027915955, + "learning_rate": 8.870713339386916e-05, + "loss": 0.068, + "step": 46310 + }, + { + "epoch": 3.030421982335623, + "grad_norm": 0.8397748470306396, + "learning_rate": 8.870131793595304e-05, + "loss": 0.0842, + "step": 46320 + }, + { + "epoch": 3.031076218514884, + "grad_norm": 1.0040104389190674, + "learning_rate": 8.869550117176597e-05, + "loss": 0.0821, + "step": 46330 + }, + { + "epoch": 3.0317304546941446, + "grad_norm": 0.7324417233467102, + "learning_rate": 8.868968310150423e-05, + "loss": 0.073, + "step": 46340 + }, + { + "epoch": 3.032384690873405, + "grad_norm": 0.8197325468063354, + "learning_rate": 8.868386372536423e-05, + "loss": 0.0722, + "step": 46350 + }, + { + "epoch": 3.033038927052666, + "grad_norm": 0.8666032552719116, + "learning_rate": 8.867804304354237e-05, + "loss": 0.0721, + "step": 46360 + }, + { + "epoch": 3.0336931632319266, + "grad_norm": 0.9289610981941223, + "learning_rate": 8.867222105623512e-05, + "loss": 0.0715, + "step": 46370 + }, + { + "epoch": 3.0343473994111876, + "grad_norm": 0.851219892501831, + "learning_rate": 8.866639776363898e-05, + "loss": 0.078, + "step": 46380 + }, + { + "epoch": 3.035001635590448, + "grad_norm": 0.9999982118606567, + "learning_rate": 8.866057316595053e-05, + "loss": 0.081, + "step": 46390 + }, + { + "epoch": 3.035655871769709, + "grad_norm": 0.8318057656288147, + "learning_rate": 8.865474726336632e-05, + "loss": 0.0796, + "step": 46400 + }, + { + "epoch": 3.0363101079489696, + "grad_norm": 0.7948132157325745, + "learning_rate": 8.864892005608303e-05, + "loss": 0.0706, + "step": 46410 + }, + { + "epoch": 3.03696434412823, + "grad_norm": 0.9667806625366211, + "learning_rate": 8.864309154429733e-05, + "loss": 0.0714, + "step": 46420 + }, + { + "epoch": 3.037618580307491, + "grad_norm": 0.821867823600769, + "learning_rate": 8.863726172820593e-05, + "loss": 0.0669, + "step": 46430 + }, + { + "epoch": 3.0382728164867516, + "grad_norm": 0.8645114302635193, + "learning_rate": 8.863143060800563e-05, + "loss": 0.0738, + "step": 46440 + }, + { + "epoch": 3.0389270526660126, + "grad_norm": 1.0619080066680908, + "learning_rate": 8.862559818389322e-05, + "loss": 0.0883, + "step": 46450 + }, + { + "epoch": 3.039581288845273, + "grad_norm": 0.9746699333190918, + "learning_rate": 8.861976445606559e-05, + "loss": 0.0819, + "step": 46460 + }, + { + "epoch": 3.0402355250245336, + "grad_norm": 0.8296138644218445, + "learning_rate": 8.861392942471961e-05, + "loss": 0.073, + "step": 46470 + }, + { + "epoch": 3.0408897612037946, + "grad_norm": 0.7673683762550354, + "learning_rate": 8.860809309005224e-05, + "loss": 0.0815, + "step": 46480 + }, + { + "epoch": 3.041543997383055, + "grad_norm": 0.9790584444999695, + "learning_rate": 8.860225545226049e-05, + "loss": 0.084, + "step": 46490 + }, + { + "epoch": 3.042198233562316, + "grad_norm": 0.7433209419250488, + "learning_rate": 8.859641651154138e-05, + "loss": 0.0741, + "step": 46500 + }, + { + "epoch": 3.0428524697415766, + "grad_norm": 0.8458490371704102, + "learning_rate": 8.859057626809198e-05, + "loss": 0.0691, + "step": 46510 + }, + { + "epoch": 3.0435067059208376, + "grad_norm": 0.8308846950531006, + "learning_rate": 8.858473472210944e-05, + "loss": 0.0762, + "step": 46520 + }, + { + "epoch": 3.044160942100098, + "grad_norm": 0.8369269967079163, + "learning_rate": 8.857889187379089e-05, + "loss": 0.0701, + "step": 46530 + }, + { + "epoch": 3.0448151782793587, + "grad_norm": 0.8432846665382385, + "learning_rate": 8.857304772333357e-05, + "loss": 0.0691, + "step": 46540 + }, + { + "epoch": 3.0454694144586196, + "grad_norm": 0.7842530608177185, + "learning_rate": 8.856720227093474e-05, + "loss": 0.085, + "step": 46550 + }, + { + "epoch": 3.04612365063788, + "grad_norm": 0.7288837432861328, + "learning_rate": 8.856135551679166e-05, + "loss": 0.0784, + "step": 46560 + }, + { + "epoch": 3.046777886817141, + "grad_norm": 0.7969174981117249, + "learning_rate": 8.855550746110171e-05, + "loss": 0.0849, + "step": 46570 + }, + { + "epoch": 3.0474321229964016, + "grad_norm": 0.9206051826477051, + "learning_rate": 8.854965810406229e-05, + "loss": 0.083, + "step": 46580 + }, + { + "epoch": 3.0480863591756626, + "grad_norm": 0.8779579401016235, + "learning_rate": 8.854380744587078e-05, + "loss": 0.0749, + "step": 46590 + }, + { + "epoch": 3.048740595354923, + "grad_norm": 0.9052606225013733, + "learning_rate": 8.85379554867247e-05, + "loss": 0.0712, + "step": 46600 + }, + { + "epoch": 3.0493948315341837, + "grad_norm": 0.8449774980545044, + "learning_rate": 8.853210222682156e-05, + "loss": 0.0723, + "step": 46610 + }, + { + "epoch": 3.0500490677134446, + "grad_norm": 0.8391105532646179, + "learning_rate": 8.85262476663589e-05, + "loss": 0.0764, + "step": 46620 + }, + { + "epoch": 3.050703303892705, + "grad_norm": 0.8949602842330933, + "learning_rate": 8.852039180553436e-05, + "loss": 0.0765, + "step": 46630 + }, + { + "epoch": 3.051357540071966, + "grad_norm": 0.7765173316001892, + "learning_rate": 8.851453464454554e-05, + "loss": 0.081, + "step": 46640 + }, + { + "epoch": 3.0520117762512267, + "grad_norm": 0.7792592644691467, + "learning_rate": 8.85086761835902e-05, + "loss": 0.0815, + "step": 46650 + }, + { + "epoch": 3.052666012430487, + "grad_norm": 0.8847215175628662, + "learning_rate": 8.850281642286603e-05, + "loss": 0.071, + "step": 46660 + }, + { + "epoch": 3.053320248609748, + "grad_norm": 0.9311257004737854, + "learning_rate": 8.849695536257083e-05, + "loss": 0.0749, + "step": 46670 + }, + { + "epoch": 3.0539744847890087, + "grad_norm": 0.7684807181358337, + "learning_rate": 8.849109300290242e-05, + "loss": 0.0735, + "step": 46680 + }, + { + "epoch": 3.0546287209682697, + "grad_norm": 0.864181399345398, + "learning_rate": 8.84852293440587e-05, + "loss": 0.0766, + "step": 46690 + }, + { + "epoch": 3.05528295714753, + "grad_norm": 0.8090917468070984, + "learning_rate": 8.847936438623754e-05, + "loss": 0.0857, + "step": 46700 + }, + { + "epoch": 3.055937193326791, + "grad_norm": 0.8556300401687622, + "learning_rate": 8.84734981296369e-05, + "loss": 0.0884, + "step": 46710 + }, + { + "epoch": 3.0565914295060517, + "grad_norm": 0.860185444355011, + "learning_rate": 8.846763057445481e-05, + "loss": 0.0749, + "step": 46720 + }, + { + "epoch": 3.057245665685312, + "grad_norm": 0.983341634273529, + "learning_rate": 8.846176172088932e-05, + "loss": 0.078, + "step": 46730 + }, + { + "epoch": 3.057899901864573, + "grad_norm": 0.7510449886322021, + "learning_rate": 8.84558915691385e-05, + "loss": 0.0709, + "step": 46740 + }, + { + "epoch": 3.0585541380438337, + "grad_norm": 0.8257303237915039, + "learning_rate": 8.845002011940047e-05, + "loss": 0.0808, + "step": 46750 + }, + { + "epoch": 3.0592083742230947, + "grad_norm": 0.8801044821739197, + "learning_rate": 8.844414737187342e-05, + "loss": 0.0736, + "step": 46760 + }, + { + "epoch": 3.059862610402355, + "grad_norm": 0.8823607563972473, + "learning_rate": 8.843827332675558e-05, + "loss": 0.0812, + "step": 46770 + }, + { + "epoch": 3.060516846581616, + "grad_norm": 1.0760579109191895, + "learning_rate": 8.84323979842452e-05, + "loss": 0.0779, + "step": 46780 + }, + { + "epoch": 3.0611710827608767, + "grad_norm": 0.8162868022918701, + "learning_rate": 8.842652134454061e-05, + "loss": 0.0686, + "step": 46790 + }, + { + "epoch": 3.061825318940137, + "grad_norm": 0.8627833724021912, + "learning_rate": 8.842064340784013e-05, + "loss": 0.0785, + "step": 46800 + }, + { + "epoch": 3.062479555119398, + "grad_norm": 0.6602069735527039, + "learning_rate": 8.84147641743422e-05, + "loss": 0.0682, + "step": 46810 + }, + { + "epoch": 3.0631337912986587, + "grad_norm": 0.8161008954048157, + "learning_rate": 8.84088836442452e-05, + "loss": 0.0776, + "step": 46820 + }, + { + "epoch": 3.0637880274779197, + "grad_norm": 0.9480443000793457, + "learning_rate": 8.840300181774767e-05, + "loss": 0.0783, + "step": 46830 + }, + { + "epoch": 3.06444226365718, + "grad_norm": 0.92530757188797, + "learning_rate": 8.83971186950481e-05, + "loss": 0.0822, + "step": 46840 + }, + { + "epoch": 3.065096499836441, + "grad_norm": 1.08670973777771, + "learning_rate": 8.839123427634508e-05, + "loss": 0.0785, + "step": 46850 + }, + { + "epoch": 3.0657507360157017, + "grad_norm": 0.8924565315246582, + "learning_rate": 8.838534856183722e-05, + "loss": 0.0715, + "step": 46860 + }, + { + "epoch": 3.066404972194962, + "grad_norm": 0.6959300637245178, + "learning_rate": 8.837946155172318e-05, + "loss": 0.0748, + "step": 46870 + }, + { + "epoch": 3.067059208374223, + "grad_norm": 0.7405787110328674, + "learning_rate": 8.837357324620164e-05, + "loss": 0.0748, + "step": 46880 + }, + { + "epoch": 3.0677134445534837, + "grad_norm": 0.8853090405464172, + "learning_rate": 8.836768364547139e-05, + "loss": 0.0854, + "step": 46890 + }, + { + "epoch": 3.0683676807327447, + "grad_norm": 0.9329870939254761, + "learning_rate": 8.836179274973118e-05, + "loss": 0.0803, + "step": 46900 + }, + { + "epoch": 3.069021916912005, + "grad_norm": 0.9848145246505737, + "learning_rate": 8.835590055917984e-05, + "loss": 0.0866, + "step": 46910 + }, + { + "epoch": 3.0696761530912657, + "grad_norm": 0.8550765514373779, + "learning_rate": 8.835000707401628e-05, + "loss": 0.0753, + "step": 46920 + }, + { + "epoch": 3.0703303892705267, + "grad_norm": 0.7886782288551331, + "learning_rate": 8.834411229443942e-05, + "loss": 0.0732, + "step": 46930 + }, + { + "epoch": 3.0709846254497872, + "grad_norm": 0.7411202788352966, + "learning_rate": 8.833821622064819e-05, + "loss": 0.0774, + "step": 46940 + }, + { + "epoch": 3.071638861629048, + "grad_norm": 1.315242052078247, + "learning_rate": 8.833231885284164e-05, + "loss": 0.0744, + "step": 46950 + }, + { + "epoch": 3.0722930978083087, + "grad_norm": 0.9081234335899353, + "learning_rate": 8.832642019121877e-05, + "loss": 0.0826, + "step": 46960 + }, + { + "epoch": 3.0729473339875697, + "grad_norm": 0.8793578147888184, + "learning_rate": 8.832052023597872e-05, + "loss": 0.0802, + "step": 46970 + }, + { + "epoch": 3.07360157016683, + "grad_norm": 0.9718412756919861, + "learning_rate": 8.831461898732061e-05, + "loss": 0.0784, + "step": 46980 + }, + { + "epoch": 3.0742558063460907, + "grad_norm": 0.7683785557746887, + "learning_rate": 8.83087164454436e-05, + "loss": 0.0773, + "step": 46990 + }, + { + "epoch": 3.0749100425253517, + "grad_norm": 0.8692295551300049, + "learning_rate": 8.830281261054698e-05, + "loss": 0.0702, + "step": 47000 + }, + { + "epoch": 3.0755642787046122, + "grad_norm": 0.8362782597541809, + "learning_rate": 8.829690748282997e-05, + "loss": 0.0676, + "step": 47010 + }, + { + "epoch": 3.076218514883873, + "grad_norm": 1.0380651950836182, + "learning_rate": 8.829100106249189e-05, + "loss": 0.0862, + "step": 47020 + }, + { + "epoch": 3.0768727510631337, + "grad_norm": 0.9438252449035645, + "learning_rate": 8.828509334973209e-05, + "loss": 0.0737, + "step": 47030 + }, + { + "epoch": 3.0775269872423947, + "grad_norm": 0.8855924606323242, + "learning_rate": 8.827918434475001e-05, + "loss": 0.0872, + "step": 47040 + }, + { + "epoch": 3.0781812234216552, + "grad_norm": 0.8621238470077515, + "learning_rate": 8.827327404774505e-05, + "loss": 0.0671, + "step": 47050 + }, + { + "epoch": 3.0788354596009158, + "grad_norm": 0.7865177392959595, + "learning_rate": 8.826736245891672e-05, + "loss": 0.0722, + "step": 47060 + }, + { + "epoch": 3.0794896957801767, + "grad_norm": 1.0064588785171509, + "learning_rate": 8.826144957846455e-05, + "loss": 0.069, + "step": 47070 + }, + { + "epoch": 3.0801439319594373, + "grad_norm": 0.8774595856666565, + "learning_rate": 8.825553540658811e-05, + "loss": 0.0753, + "step": 47080 + }, + { + "epoch": 3.0807981681386982, + "grad_norm": 0.9509215950965881, + "learning_rate": 8.824961994348701e-05, + "loss": 0.0741, + "step": 47090 + }, + { + "epoch": 3.0814524043179587, + "grad_norm": 0.8457993865013123, + "learning_rate": 8.824370318936095e-05, + "loss": 0.0712, + "step": 47100 + }, + { + "epoch": 3.0821066404972193, + "grad_norm": 0.8608481884002686, + "learning_rate": 8.823778514440959e-05, + "loss": 0.0801, + "step": 47110 + }, + { + "epoch": 3.0827608766764802, + "grad_norm": 0.9374340772628784, + "learning_rate": 8.82318658088327e-05, + "loss": 0.0866, + "step": 47120 + }, + { + "epoch": 3.0834151128557408, + "grad_norm": 0.9799492955207825, + "learning_rate": 8.822594518283008e-05, + "loss": 0.0875, + "step": 47130 + }, + { + "epoch": 3.0840693490350017, + "grad_norm": 0.8801363706588745, + "learning_rate": 8.822002326660154e-05, + "loss": 0.0756, + "step": 47140 + }, + { + "epoch": 3.0847235852142623, + "grad_norm": 0.8965553641319275, + "learning_rate": 8.8214100060347e-05, + "loss": 0.0792, + "step": 47150 + }, + { + "epoch": 3.0853778213935232, + "grad_norm": 0.8476041555404663, + "learning_rate": 8.820817556426636e-05, + "loss": 0.075, + "step": 47160 + }, + { + "epoch": 3.0860320575727838, + "grad_norm": 0.9920099973678589, + "learning_rate": 8.82022497785596e-05, + "loss": 0.0683, + "step": 47170 + }, + { + "epoch": 3.0866862937520443, + "grad_norm": 0.8582835793495178, + "learning_rate": 8.81963227034267e-05, + "loss": 0.0713, + "step": 47180 + }, + { + "epoch": 3.0873405299313053, + "grad_norm": 1.1154992580413818, + "learning_rate": 8.819039433906774e-05, + "loss": 0.0937, + "step": 47190 + }, + { + "epoch": 3.087994766110566, + "grad_norm": 1.022351861000061, + "learning_rate": 8.818446468568282e-05, + "loss": 0.0752, + "step": 47200 + }, + { + "epoch": 3.0886490022898268, + "grad_norm": 0.8366675972938538, + "learning_rate": 8.817853374347206e-05, + "loss": 0.0817, + "step": 47210 + }, + { + "epoch": 3.0893032384690873, + "grad_norm": 0.9166560769081116, + "learning_rate": 8.817260151263568e-05, + "loss": 0.0757, + "step": 47220 + }, + { + "epoch": 3.0899574746483482, + "grad_norm": 1.0350792407989502, + "learning_rate": 8.816666799337388e-05, + "loss": 0.0726, + "step": 47230 + }, + { + "epoch": 3.0906117108276088, + "grad_norm": 1.0422755479812622, + "learning_rate": 8.816073318588693e-05, + "loss": 0.0991, + "step": 47240 + }, + { + "epoch": 3.0912659470068693, + "grad_norm": 1.030333161354065, + "learning_rate": 8.815479709037515e-05, + "loss": 0.0694, + "step": 47250 + }, + { + "epoch": 3.0919201831861303, + "grad_norm": 1.0390321016311646, + "learning_rate": 8.814885970703893e-05, + "loss": 0.0855, + "step": 47260 + }, + { + "epoch": 3.092574419365391, + "grad_norm": 0.8490506410598755, + "learning_rate": 8.814292103607862e-05, + "loss": 0.0812, + "step": 47270 + }, + { + "epoch": 3.0932286555446518, + "grad_norm": 0.8992865681648254, + "learning_rate": 8.813698107769471e-05, + "loss": 0.0766, + "step": 47280 + }, + { + "epoch": 3.0938828917239123, + "grad_norm": 0.9719147086143494, + "learning_rate": 8.813103983208766e-05, + "loss": 0.0738, + "step": 47290 + }, + { + "epoch": 3.0945371279031733, + "grad_norm": 0.7177313566207886, + "learning_rate": 8.812509729945802e-05, + "loss": 0.0762, + "step": 47300 + }, + { + "epoch": 3.095191364082434, + "grad_norm": 0.8135479092597961, + "learning_rate": 8.811915348000635e-05, + "loss": 0.0761, + "step": 47310 + }, + { + "epoch": 3.0958456002616943, + "grad_norm": 0.8107147216796875, + "learning_rate": 8.811320837393329e-05, + "loss": 0.069, + "step": 47320 + }, + { + "epoch": 3.0964998364409553, + "grad_norm": 0.881219744682312, + "learning_rate": 8.810726198143949e-05, + "loss": 0.0791, + "step": 47330 + }, + { + "epoch": 3.097154072620216, + "grad_norm": 0.9612583518028259, + "learning_rate": 8.810131430272564e-05, + "loss": 0.0726, + "step": 47340 + }, + { + "epoch": 3.0978083087994768, + "grad_norm": 1.0407850742340088, + "learning_rate": 8.809536533799253e-05, + "loss": 0.0722, + "step": 47350 + }, + { + "epoch": 3.0984625449787373, + "grad_norm": 0.861651599407196, + "learning_rate": 8.808941508744093e-05, + "loss": 0.0808, + "step": 47360 + }, + { + "epoch": 3.099116781157998, + "grad_norm": 0.8344012498855591, + "learning_rate": 8.808346355127166e-05, + "loss": 0.0697, + "step": 47370 + }, + { + "epoch": 3.099771017337259, + "grad_norm": 0.8500309586524963, + "learning_rate": 8.807751072968563e-05, + "loss": 0.0738, + "step": 47380 + }, + { + "epoch": 3.1004252535165193, + "grad_norm": 0.8990835547447205, + "learning_rate": 8.807155662288375e-05, + "loss": 0.0724, + "step": 47390 + }, + { + "epoch": 3.1010794896957803, + "grad_norm": 0.8191524744033813, + "learning_rate": 8.8065601231067e-05, + "loss": 0.0699, + "step": 47400 + }, + { + "epoch": 3.101733725875041, + "grad_norm": 0.9893473982810974, + "learning_rate": 8.805964455443636e-05, + "loss": 0.0728, + "step": 47410 + }, + { + "epoch": 3.102387962054302, + "grad_norm": 0.8191789984703064, + "learning_rate": 8.805368659319291e-05, + "loss": 0.0682, + "step": 47420 + }, + { + "epoch": 3.1030421982335623, + "grad_norm": 0.8927891850471497, + "learning_rate": 8.804772734753773e-05, + "loss": 0.0692, + "step": 47430 + }, + { + "epoch": 3.103696434412823, + "grad_norm": 0.8646766543388367, + "learning_rate": 8.804176681767196e-05, + "loss": 0.0781, + "step": 47440 + }, + { + "epoch": 3.104350670592084, + "grad_norm": 0.9096436500549316, + "learning_rate": 8.803580500379681e-05, + "loss": 0.0854, + "step": 47450 + }, + { + "epoch": 3.1050049067713443, + "grad_norm": 0.8713791370391846, + "learning_rate": 8.802984190611349e-05, + "loss": 0.0833, + "step": 47460 + }, + { + "epoch": 3.1056591429506053, + "grad_norm": 0.9313731789588928, + "learning_rate": 8.802387752482327e-05, + "loss": 0.0796, + "step": 47470 + }, + { + "epoch": 3.106313379129866, + "grad_norm": 0.8143359422683716, + "learning_rate": 8.801791186012744e-05, + "loss": 0.0747, + "step": 47480 + }, + { + "epoch": 3.106967615309127, + "grad_norm": 0.8441165685653687, + "learning_rate": 8.80119449122274e-05, + "loss": 0.0727, + "step": 47490 + }, + { + "epoch": 3.1076218514883873, + "grad_norm": 0.9759175777435303, + "learning_rate": 8.800597668132452e-05, + "loss": 0.0743, + "step": 47500 + }, + { + "epoch": 3.108276087667648, + "grad_norm": 0.8443938493728638, + "learning_rate": 8.800000716762024e-05, + "loss": 0.0739, + "step": 47510 + }, + { + "epoch": 3.108930323846909, + "grad_norm": 0.9399232864379883, + "learning_rate": 8.799403637131609e-05, + "loss": 0.0766, + "step": 47520 + }, + { + "epoch": 3.1095845600261693, + "grad_norm": 0.8907540440559387, + "learning_rate": 8.798806429261355e-05, + "loss": 0.0757, + "step": 47530 + }, + { + "epoch": 3.1102387962054303, + "grad_norm": 0.9284667372703552, + "learning_rate": 8.798209093171421e-05, + "loss": 0.0811, + "step": 47540 + }, + { + "epoch": 3.110893032384691, + "grad_norm": 0.9799740314483643, + "learning_rate": 8.79761162888197e-05, + "loss": 0.0741, + "step": 47550 + }, + { + "epoch": 3.1115472685639514, + "grad_norm": 0.9789896011352539, + "learning_rate": 8.797014036413167e-05, + "loss": 0.0732, + "step": 47560 + }, + { + "epoch": 3.1122015047432123, + "grad_norm": 0.7399867177009583, + "learning_rate": 8.796416315785181e-05, + "loss": 0.0726, + "step": 47570 + }, + { + "epoch": 3.112855740922473, + "grad_norm": 0.9549564719200134, + "learning_rate": 8.795818467018188e-05, + "loss": 0.0777, + "step": 47580 + }, + { + "epoch": 3.113509977101734, + "grad_norm": 0.9988911151885986, + "learning_rate": 8.795220490132369e-05, + "loss": 0.0775, + "step": 47590 + }, + { + "epoch": 3.1141642132809944, + "grad_norm": 0.9814069271087646, + "learning_rate": 8.794622385147903e-05, + "loss": 0.0853, + "step": 47600 + }, + { + "epoch": 3.1148184494602553, + "grad_norm": 0.7243561148643494, + "learning_rate": 8.79402415208498e-05, + "loss": 0.0778, + "step": 47610 + }, + { + "epoch": 3.115472685639516, + "grad_norm": 0.8936898708343506, + "learning_rate": 8.793425790963792e-05, + "loss": 0.0838, + "step": 47620 + }, + { + "epoch": 3.1161269218187764, + "grad_norm": 0.9839106202125549, + "learning_rate": 8.792827301804536e-05, + "loss": 0.0809, + "step": 47630 + }, + { + "epoch": 3.1167811579980373, + "grad_norm": 0.9321417212486267, + "learning_rate": 8.79222868462741e-05, + "loss": 0.0699, + "step": 47640 + }, + { + "epoch": 3.117435394177298, + "grad_norm": 0.9388688206672668, + "learning_rate": 8.791629939452621e-05, + "loss": 0.0872, + "step": 47650 + }, + { + "epoch": 3.118089630356559, + "grad_norm": 0.7993639707565308, + "learning_rate": 8.791031066300378e-05, + "loss": 0.0793, + "step": 47660 + }, + { + "epoch": 3.1187438665358194, + "grad_norm": 0.7926256060600281, + "learning_rate": 8.790432065190892e-05, + "loss": 0.0762, + "step": 47670 + }, + { + "epoch": 3.1193981027150803, + "grad_norm": 0.8904451131820679, + "learning_rate": 8.789832936144386e-05, + "loss": 0.0817, + "step": 47680 + }, + { + "epoch": 3.120052338894341, + "grad_norm": 1.026097059249878, + "learning_rate": 8.789233679181077e-05, + "loss": 0.0745, + "step": 47690 + }, + { + "epoch": 3.1207065750736014, + "grad_norm": 0.8326067328453064, + "learning_rate": 8.788634294321195e-05, + "loss": 0.0662, + "step": 47700 + }, + { + "epoch": 3.1213608112528624, + "grad_norm": 0.9273573160171509, + "learning_rate": 8.788034781584968e-05, + "loss": 0.0782, + "step": 47710 + }, + { + "epoch": 3.122015047432123, + "grad_norm": 0.7090617418289185, + "learning_rate": 8.787435140992635e-05, + "loss": 0.0721, + "step": 47720 + }, + { + "epoch": 3.122669283611384, + "grad_norm": 0.823361873626709, + "learning_rate": 8.786835372564431e-05, + "loss": 0.0743, + "step": 47730 + }, + { + "epoch": 3.1233235197906444, + "grad_norm": 0.8674502372741699, + "learning_rate": 8.786235476320603e-05, + "loss": 0.064, + "step": 47740 + }, + { + "epoch": 3.1239777559699053, + "grad_norm": 0.8977984189987183, + "learning_rate": 8.785635452281397e-05, + "loss": 0.0797, + "step": 47750 + }, + { + "epoch": 3.124631992149166, + "grad_norm": 0.7301965355873108, + "learning_rate": 8.785035300467068e-05, + "loss": 0.0726, + "step": 47760 + }, + { + "epoch": 3.1252862283284264, + "grad_norm": 0.8135896921157837, + "learning_rate": 8.78443502089787e-05, + "loss": 0.0799, + "step": 47770 + }, + { + "epoch": 3.1259404645076874, + "grad_norm": 0.8342558145523071, + "learning_rate": 8.783834613594064e-05, + "loss": 0.0738, + "step": 47780 + }, + { + "epoch": 3.126594700686948, + "grad_norm": 0.7653647065162659, + "learning_rate": 8.783234078575917e-05, + "loss": 0.0768, + "step": 47790 + }, + { + "epoch": 3.127248936866209, + "grad_norm": 0.7020546793937683, + "learning_rate": 8.7826334158637e-05, + "loss": 0.0703, + "step": 47800 + }, + { + "epoch": 3.1279031730454694, + "grad_norm": 0.9205434918403625, + "learning_rate": 8.782032625477681e-05, + "loss": 0.0709, + "step": 47810 + }, + { + "epoch": 3.1285574092247304, + "grad_norm": 0.830418586730957, + "learning_rate": 8.781431707438145e-05, + "loss": 0.0749, + "step": 47820 + }, + { + "epoch": 3.129211645403991, + "grad_norm": 1.1934880018234253, + "learning_rate": 8.780830661765371e-05, + "loss": 0.072, + "step": 47830 + }, + { + "epoch": 3.1298658815832514, + "grad_norm": 0.9115713834762573, + "learning_rate": 8.780229488479646e-05, + "loss": 0.0744, + "step": 47840 + }, + { + "epoch": 3.1305201177625124, + "grad_norm": 0.963766872882843, + "learning_rate": 8.779628187601261e-05, + "loss": 0.0832, + "step": 47850 + }, + { + "epoch": 3.131174353941773, + "grad_norm": 0.7308657765388489, + "learning_rate": 8.779026759150515e-05, + "loss": 0.0826, + "step": 47860 + }, + { + "epoch": 3.131828590121034, + "grad_norm": 0.8448659777641296, + "learning_rate": 8.778425203147703e-05, + "loss": 0.0819, + "step": 47870 + }, + { + "epoch": 3.1324828263002944, + "grad_norm": 1.0432631969451904, + "learning_rate": 8.777823519613131e-05, + "loss": 0.0763, + "step": 47880 + }, + { + "epoch": 3.133137062479555, + "grad_norm": 0.9087404608726501, + "learning_rate": 8.77722170856711e-05, + "loss": 0.0852, + "step": 47890 + }, + { + "epoch": 3.133791298658816, + "grad_norm": 0.8869801163673401, + "learning_rate": 8.776619770029946e-05, + "loss": 0.0716, + "step": 47900 + }, + { + "epoch": 3.1344455348380764, + "grad_norm": 0.7883999943733215, + "learning_rate": 8.776017704021964e-05, + "loss": 0.0694, + "step": 47910 + }, + { + "epoch": 3.1350997710173374, + "grad_norm": 1.1051850318908691, + "learning_rate": 8.77541551056348e-05, + "loss": 0.083, + "step": 47920 + }, + { + "epoch": 3.135754007196598, + "grad_norm": 0.7399940490722656, + "learning_rate": 8.77481318967482e-05, + "loss": 0.0688, + "step": 47930 + }, + { + "epoch": 3.136408243375859, + "grad_norm": 0.8819198608398438, + "learning_rate": 8.774210741376316e-05, + "loss": 0.0813, + "step": 47940 + }, + { + "epoch": 3.1370624795551194, + "grad_norm": 0.8618602156639099, + "learning_rate": 8.773608165688303e-05, + "loss": 0.081, + "step": 47950 + }, + { + "epoch": 3.13771671573438, + "grad_norm": 0.9459993243217468, + "learning_rate": 8.773005462631115e-05, + "loss": 0.072, + "step": 47960 + }, + { + "epoch": 3.138370951913641, + "grad_norm": 0.8384934067726135, + "learning_rate": 8.772402632225098e-05, + "loss": 0.0748, + "step": 47970 + }, + { + "epoch": 3.1390251880929014, + "grad_norm": 0.803860604763031, + "learning_rate": 8.7717996744906e-05, + "loss": 0.0765, + "step": 47980 + }, + { + "epoch": 3.1396794242721624, + "grad_norm": 1.1907633543014526, + "learning_rate": 8.77119658944797e-05, + "loss": 0.0909, + "step": 47990 + }, + { + "epoch": 3.140333660451423, + "grad_norm": 0.8750865459442139, + "learning_rate": 8.770593377117566e-05, + "loss": 0.067, + "step": 48000 + }, + { + "epoch": 3.1409878966306835, + "grad_norm": 0.7471033930778503, + "learning_rate": 8.769990037519747e-05, + "loss": 0.0719, + "step": 48010 + }, + { + "epoch": 3.1416421328099444, + "grad_norm": 1.0200992822647095, + "learning_rate": 8.769386570674876e-05, + "loss": 0.0804, + "step": 48020 + }, + { + "epoch": 3.142296368989205, + "grad_norm": 0.8135073184967041, + "learning_rate": 8.768782976603323e-05, + "loss": 0.0677, + "step": 48030 + }, + { + "epoch": 3.142950605168466, + "grad_norm": 0.8066560626029968, + "learning_rate": 8.76817925532546e-05, + "loss": 0.0816, + "step": 48040 + }, + { + "epoch": 3.1436048413477264, + "grad_norm": 0.9547781348228455, + "learning_rate": 8.767575406861665e-05, + "loss": 0.0732, + "step": 48050 + }, + { + "epoch": 3.1442590775269874, + "grad_norm": 0.9590566754341125, + "learning_rate": 8.766971431232318e-05, + "loss": 0.0797, + "step": 48060 + }, + { + "epoch": 3.144913313706248, + "grad_norm": 0.8913863897323608, + "learning_rate": 8.766367328457808e-05, + "loss": 0.08, + "step": 48070 + }, + { + "epoch": 3.1455675498855085, + "grad_norm": 0.9507361650466919, + "learning_rate": 8.765763098558521e-05, + "loss": 0.0722, + "step": 48080 + }, + { + "epoch": 3.1462217860647694, + "grad_norm": 1.0220569372177124, + "learning_rate": 8.765158741554855e-05, + "loss": 0.083, + "step": 48090 + }, + { + "epoch": 3.14687602224403, + "grad_norm": 0.9265881776809692, + "learning_rate": 8.764554257467207e-05, + "loss": 0.0781, + "step": 48100 + }, + { + "epoch": 3.147530258423291, + "grad_norm": 0.765208899974823, + "learning_rate": 8.763949646315979e-05, + "loss": 0.0691, + "step": 48110 + }, + { + "epoch": 3.1481844946025515, + "grad_norm": 0.7979570627212524, + "learning_rate": 8.76334490812158e-05, + "loss": 0.0834, + "step": 48120 + }, + { + "epoch": 3.1488387307818124, + "grad_norm": 0.6835886240005493, + "learning_rate": 8.76274004290442e-05, + "loss": 0.0679, + "step": 48130 + }, + { + "epoch": 3.149492966961073, + "grad_norm": 0.8190062642097473, + "learning_rate": 8.762135050684915e-05, + "loss": 0.0679, + "step": 48140 + }, + { + "epoch": 3.1501472031403335, + "grad_norm": 0.946603536605835, + "learning_rate": 8.761529931483487e-05, + "loss": 0.0689, + "step": 48150 + }, + { + "epoch": 3.1508014393195944, + "grad_norm": 0.8142273426055908, + "learning_rate": 8.760924685320557e-05, + "loss": 0.0722, + "step": 48160 + }, + { + "epoch": 3.151455675498855, + "grad_norm": 0.7121886610984802, + "learning_rate": 8.760319312216557e-05, + "loss": 0.0683, + "step": 48170 + }, + { + "epoch": 3.152109911678116, + "grad_norm": 0.9496175646781921, + "learning_rate": 8.759713812191917e-05, + "loss": 0.066, + "step": 48180 + }, + { + "epoch": 3.1527641478573765, + "grad_norm": 0.7960578799247742, + "learning_rate": 8.759108185267078e-05, + "loss": 0.0642, + "step": 48190 + }, + { + "epoch": 3.1534183840366374, + "grad_norm": 1.0279724597930908, + "learning_rate": 8.758502431462476e-05, + "loss": 0.0795, + "step": 48200 + }, + { + "epoch": 3.154072620215898, + "grad_norm": 0.8479052186012268, + "learning_rate": 8.757896550798562e-05, + "loss": 0.0718, + "step": 48210 + }, + { + "epoch": 3.1547268563951585, + "grad_norm": 0.7294306755065918, + "learning_rate": 8.757290543295784e-05, + "loss": 0.0758, + "step": 48220 + }, + { + "epoch": 3.1553810925744195, + "grad_norm": 0.7830544710159302, + "learning_rate": 8.756684408974596e-05, + "loss": 0.0777, + "step": 48230 + }, + { + "epoch": 3.15603532875368, + "grad_norm": 0.8489717841148376, + "learning_rate": 8.756078147855455e-05, + "loss": 0.0847, + "step": 48240 + }, + { + "epoch": 3.156689564932941, + "grad_norm": 0.8141524791717529, + "learning_rate": 8.755471759958828e-05, + "loss": 0.074, + "step": 48250 + }, + { + "epoch": 3.1573438011122015, + "grad_norm": 1.0648870468139648, + "learning_rate": 8.754865245305179e-05, + "loss": 0.085, + "step": 48260 + }, + { + "epoch": 3.1579980372914624, + "grad_norm": 0.9387083649635315, + "learning_rate": 8.754258603914982e-05, + "loss": 0.0737, + "step": 48270 + }, + { + "epoch": 3.158652273470723, + "grad_norm": 0.906022846698761, + "learning_rate": 8.75365183580871e-05, + "loss": 0.0797, + "step": 48280 + }, + { + "epoch": 3.1593065096499835, + "grad_norm": 0.9713001847267151, + "learning_rate": 8.753044941006846e-05, + "loss": 0.0766, + "step": 48290 + }, + { + "epoch": 3.1599607458292445, + "grad_norm": 1.0321050882339478, + "learning_rate": 8.75243791952987e-05, + "loss": 0.0814, + "step": 48300 + }, + { + "epoch": 3.160614982008505, + "grad_norm": 0.7994502186775208, + "learning_rate": 8.751830771398272e-05, + "loss": 0.0746, + "step": 48310 + }, + { + "epoch": 3.161269218187766, + "grad_norm": 1.0904173851013184, + "learning_rate": 8.75122349663255e-05, + "loss": 0.0742, + "step": 48320 + }, + { + "epoch": 3.1619234543670265, + "grad_norm": 1.043516755104065, + "learning_rate": 8.750616095253194e-05, + "loss": 0.085, + "step": 48330 + }, + { + "epoch": 3.162577690546287, + "grad_norm": 0.9570246934890747, + "learning_rate": 8.750008567280709e-05, + "loss": 0.0795, + "step": 48340 + }, + { + "epoch": 3.163231926725548, + "grad_norm": 0.9861446022987366, + "learning_rate": 8.749400912735602e-05, + "loss": 0.0799, + "step": 48350 + }, + { + "epoch": 3.1638861629048085, + "grad_norm": 1.004128336906433, + "learning_rate": 8.748793131638379e-05, + "loss": 0.0798, + "step": 48360 + }, + { + "epoch": 3.1645403990840695, + "grad_norm": 0.8075050115585327, + "learning_rate": 8.748185224009558e-05, + "loss": 0.0796, + "step": 48370 + }, + { + "epoch": 3.16519463526333, + "grad_norm": 0.7377058863639832, + "learning_rate": 8.747577189869653e-05, + "loss": 0.0672, + "step": 48380 + }, + { + "epoch": 3.165848871442591, + "grad_norm": 0.8605074882507324, + "learning_rate": 8.746969029239192e-05, + "loss": 0.071, + "step": 48390 + }, + { + "epoch": 3.1665031076218515, + "grad_norm": 0.8945094347000122, + "learning_rate": 8.746360742138698e-05, + "loss": 0.0756, + "step": 48400 + }, + { + "epoch": 3.167157343801112, + "grad_norm": 0.9711698293685913, + "learning_rate": 8.745752328588703e-05, + "loss": 0.0706, + "step": 48410 + }, + { + "epoch": 3.167811579980373, + "grad_norm": 1.2028074264526367, + "learning_rate": 8.745143788609744e-05, + "loss": 0.0807, + "step": 48420 + }, + { + "epoch": 3.1684658161596335, + "grad_norm": 0.8748884201049805, + "learning_rate": 8.744535122222361e-05, + "loss": 0.0808, + "step": 48430 + }, + { + "epoch": 3.1691200523388945, + "grad_norm": 0.8327896595001221, + "learning_rate": 8.743926329447097e-05, + "loss": 0.0734, + "step": 48440 + }, + { + "epoch": 3.169774288518155, + "grad_norm": 0.7776886224746704, + "learning_rate": 8.743317410304501e-05, + "loss": 0.0674, + "step": 48450 + }, + { + "epoch": 3.1704285246974155, + "grad_norm": 0.8404861688613892, + "learning_rate": 8.742708364815125e-05, + "loss": 0.0654, + "step": 48460 + }, + { + "epoch": 3.1710827608766765, + "grad_norm": 0.9241907596588135, + "learning_rate": 8.742099192999525e-05, + "loss": 0.075, + "step": 48470 + }, + { + "epoch": 3.171736997055937, + "grad_norm": 0.7482475638389587, + "learning_rate": 8.741489894878264e-05, + "loss": 0.0702, + "step": 48480 + }, + { + "epoch": 3.172391233235198, + "grad_norm": 0.9275398254394531, + "learning_rate": 8.740880470471907e-05, + "loss": 0.072, + "step": 48490 + }, + { + "epoch": 3.1730454694144585, + "grad_norm": 0.873641848564148, + "learning_rate": 8.740270919801023e-05, + "loss": 0.0783, + "step": 48500 + }, + { + "epoch": 3.1736997055937195, + "grad_norm": 0.8717873692512512, + "learning_rate": 8.739661242886186e-05, + "loss": 0.0775, + "step": 48510 + }, + { + "epoch": 3.17435394177298, + "grad_norm": 0.8635610938072205, + "learning_rate": 8.739051439747973e-05, + "loss": 0.0736, + "step": 48520 + }, + { + "epoch": 3.1750081779522406, + "grad_norm": 1.115180253982544, + "learning_rate": 8.73844151040697e-05, + "loss": 0.0772, + "step": 48530 + }, + { + "epoch": 3.1756624141315015, + "grad_norm": 0.6833996772766113, + "learning_rate": 8.737831454883761e-05, + "loss": 0.0643, + "step": 48540 + }, + { + "epoch": 3.176316650310762, + "grad_norm": 0.8258626461029053, + "learning_rate": 8.737221273198939e-05, + "loss": 0.0809, + "step": 48550 + }, + { + "epoch": 3.176970886490023, + "grad_norm": 0.9197008609771729, + "learning_rate": 8.736610965373095e-05, + "loss": 0.0705, + "step": 48560 + }, + { + "epoch": 3.1776251226692835, + "grad_norm": 0.9148569703102112, + "learning_rate": 8.736000531426833e-05, + "loss": 0.0733, + "step": 48570 + }, + { + "epoch": 3.1782793588485445, + "grad_norm": 0.9025515913963318, + "learning_rate": 8.735389971380755e-05, + "loss": 0.0835, + "step": 48580 + }, + { + "epoch": 3.178933595027805, + "grad_norm": 0.9588202238082886, + "learning_rate": 8.734779285255469e-05, + "loss": 0.0648, + "step": 48590 + }, + { + "epoch": 3.1795878312070656, + "grad_norm": 0.7988367676734924, + "learning_rate": 8.734168473071587e-05, + "loss": 0.083, + "step": 48600 + }, + { + "epoch": 3.1802420673863265, + "grad_norm": 0.7926768064498901, + "learning_rate": 8.733557534849726e-05, + "loss": 0.0718, + "step": 48610 + }, + { + "epoch": 3.180896303565587, + "grad_norm": 1.0557847023010254, + "learning_rate": 8.732946470610506e-05, + "loss": 0.0718, + "step": 48620 + }, + { + "epoch": 3.181550539744848, + "grad_norm": 0.996004045009613, + "learning_rate": 8.732335280374555e-05, + "loss": 0.081, + "step": 48630 + }, + { + "epoch": 3.1822047759241086, + "grad_norm": 0.9401144981384277, + "learning_rate": 8.731723964162498e-05, + "loss": 0.0697, + "step": 48640 + }, + { + "epoch": 3.1828590121033695, + "grad_norm": 0.7425965666770935, + "learning_rate": 8.731112521994969e-05, + "loss": 0.0752, + "step": 48650 + }, + { + "epoch": 3.18351324828263, + "grad_norm": 0.9412823915481567, + "learning_rate": 8.730500953892609e-05, + "loss": 0.07, + "step": 48660 + }, + { + "epoch": 3.1841674844618906, + "grad_norm": 0.7894400358200073, + "learning_rate": 8.729889259876057e-05, + "loss": 0.0686, + "step": 48670 + }, + { + "epoch": 3.1848217206411515, + "grad_norm": 0.8867034912109375, + "learning_rate": 8.729277439965962e-05, + "loss": 0.081, + "step": 48680 + }, + { + "epoch": 3.185475956820412, + "grad_norm": 0.8769906163215637, + "learning_rate": 8.728665494182971e-05, + "loss": 0.0688, + "step": 48690 + }, + { + "epoch": 3.186130192999673, + "grad_norm": 0.8141859173774719, + "learning_rate": 8.728053422547743e-05, + "loss": 0.0803, + "step": 48700 + }, + { + "epoch": 3.1867844291789336, + "grad_norm": 0.8036234974861145, + "learning_rate": 8.727441225080934e-05, + "loss": 0.0813, + "step": 48710 + }, + { + "epoch": 3.1874386653581945, + "grad_norm": 0.7681657075881958, + "learning_rate": 8.726828901803207e-05, + "loss": 0.0797, + "step": 48720 + }, + { + "epoch": 3.188092901537455, + "grad_norm": 0.9178977608680725, + "learning_rate": 8.726216452735232e-05, + "loss": 0.0743, + "step": 48730 + }, + { + "epoch": 3.1887471377167156, + "grad_norm": 0.8207674026489258, + "learning_rate": 8.72560387789768e-05, + "loss": 0.0692, + "step": 48740 + }, + { + "epoch": 3.1894013738959766, + "grad_norm": 0.7519068121910095, + "learning_rate": 8.724991177311224e-05, + "loss": 0.0789, + "step": 48750 + }, + { + "epoch": 3.190055610075237, + "grad_norm": 0.8407567143440247, + "learning_rate": 8.724378350996549e-05, + "loss": 0.0686, + "step": 48760 + }, + { + "epoch": 3.190709846254498, + "grad_norm": 0.6936128735542297, + "learning_rate": 8.723765398974335e-05, + "loss": 0.076, + "step": 48770 + }, + { + "epoch": 3.1913640824337586, + "grad_norm": 0.7411690354347229, + "learning_rate": 8.723152321265275e-05, + "loss": 0.077, + "step": 48780 + }, + { + "epoch": 3.192018318613019, + "grad_norm": 0.9644950032234192, + "learning_rate": 8.722539117890058e-05, + "loss": 0.0803, + "step": 48790 + }, + { + "epoch": 3.19267255479228, + "grad_norm": 0.9315986633300781, + "learning_rate": 8.721925788869383e-05, + "loss": 0.0792, + "step": 48800 + }, + { + "epoch": 3.1933267909715406, + "grad_norm": 0.8755950331687927, + "learning_rate": 8.721312334223952e-05, + "loss": 0.0743, + "step": 48810 + }, + { + "epoch": 3.1939810271508016, + "grad_norm": 0.7426033616065979, + "learning_rate": 8.720698753974473e-05, + "loss": 0.0711, + "step": 48820 + }, + { + "epoch": 3.194635263330062, + "grad_norm": 0.8731258511543274, + "learning_rate": 8.720085048141649e-05, + "loss": 0.0696, + "step": 48830 + }, + { + "epoch": 3.195289499509323, + "grad_norm": 0.9991781711578369, + "learning_rate": 8.7194712167462e-05, + "loss": 0.0847, + "step": 48840 + }, + { + "epoch": 3.1959437356885836, + "grad_norm": 0.78664630651474, + "learning_rate": 8.718857259808843e-05, + "loss": 0.0788, + "step": 48850 + }, + { + "epoch": 3.196597971867844, + "grad_norm": 0.8035576939582825, + "learning_rate": 8.7182431773503e-05, + "loss": 0.0652, + "step": 48860 + }, + { + "epoch": 3.197252208047105, + "grad_norm": 0.9748409390449524, + "learning_rate": 8.717628969391298e-05, + "loss": 0.0878, + "step": 48870 + }, + { + "epoch": 3.1979064442263656, + "grad_norm": 0.9272085428237915, + "learning_rate": 8.717014635952569e-05, + "loss": 0.0736, + "step": 48880 + }, + { + "epoch": 3.1985606804056266, + "grad_norm": 0.7096579670906067, + "learning_rate": 8.716400177054849e-05, + "loss": 0.0637, + "step": 48890 + }, + { + "epoch": 3.199214916584887, + "grad_norm": 0.9682911038398743, + "learning_rate": 8.715785592718875e-05, + "loss": 0.0681, + "step": 48900 + }, + { + "epoch": 3.1998691527641476, + "grad_norm": 0.7501932382583618, + "learning_rate": 8.715170882965391e-05, + "loss": 0.0706, + "step": 48910 + }, + { + "epoch": 3.2005233889434086, + "grad_norm": 0.8990116715431213, + "learning_rate": 8.714556047815147e-05, + "loss": 0.0719, + "step": 48920 + }, + { + "epoch": 3.201177625122669, + "grad_norm": 0.8213568925857544, + "learning_rate": 8.713941087288897e-05, + "loss": 0.0646, + "step": 48930 + }, + { + "epoch": 3.20183186130193, + "grad_norm": 0.9337528944015503, + "learning_rate": 8.713326001407393e-05, + "loss": 0.0844, + "step": 48940 + }, + { + "epoch": 3.2024860974811906, + "grad_norm": 0.7658262848854065, + "learning_rate": 8.712710790191399e-05, + "loss": 0.0723, + "step": 48950 + }, + { + "epoch": 3.2031403336604516, + "grad_norm": 0.803835391998291, + "learning_rate": 8.712095453661677e-05, + "loss": 0.0714, + "step": 48960 + }, + { + "epoch": 3.203794569839712, + "grad_norm": 0.8985826373100281, + "learning_rate": 8.711479991839e-05, + "loss": 0.0761, + "step": 48970 + }, + { + "epoch": 3.2044488060189726, + "grad_norm": 0.8591262698173523, + "learning_rate": 8.710864404744139e-05, + "loss": 0.0807, + "step": 48980 + }, + { + "epoch": 3.2051030421982336, + "grad_norm": 0.8583003878593445, + "learning_rate": 8.710248692397872e-05, + "loss": 0.0725, + "step": 48990 + }, + { + "epoch": 3.205757278377494, + "grad_norm": 0.8409457802772522, + "learning_rate": 8.709632854820982e-05, + "loss": 0.0805, + "step": 49000 + }, + { + "epoch": 3.206411514556755, + "grad_norm": 0.8898982405662537, + "learning_rate": 8.709016892034252e-05, + "loss": 0.0737, + "step": 49010 + }, + { + "epoch": 3.2070657507360156, + "grad_norm": 0.8796902298927307, + "learning_rate": 8.708400804058478e-05, + "loss": 0.0786, + "step": 49020 + }, + { + "epoch": 3.2077199869152766, + "grad_norm": 0.9222922325134277, + "learning_rate": 8.70778459091445e-05, + "loss": 0.0753, + "step": 49030 + }, + { + "epoch": 3.208374223094537, + "grad_norm": 0.9596317410469055, + "learning_rate": 8.707168252622966e-05, + "loss": 0.0713, + "step": 49040 + }, + { + "epoch": 3.2090284592737977, + "grad_norm": 0.9624839425086975, + "learning_rate": 8.706551789204833e-05, + "loss": 0.0772, + "step": 49050 + }, + { + "epoch": 3.2096826954530586, + "grad_norm": 0.9401121139526367, + "learning_rate": 8.705935200680854e-05, + "loss": 0.0724, + "step": 49060 + }, + { + "epoch": 3.210336931632319, + "grad_norm": 0.7766807675361633, + "learning_rate": 8.705318487071846e-05, + "loss": 0.0691, + "step": 49070 + }, + { + "epoch": 3.21099116781158, + "grad_norm": 0.8511127233505249, + "learning_rate": 8.704701648398621e-05, + "loss": 0.0736, + "step": 49080 + }, + { + "epoch": 3.2116454039908406, + "grad_norm": 0.9326490163803101, + "learning_rate": 8.704084684681998e-05, + "loss": 0.0704, + "step": 49090 + }, + { + "epoch": 3.2122996401701016, + "grad_norm": 1.0042059421539307, + "learning_rate": 8.703467595942803e-05, + "loss": 0.0755, + "step": 49100 + }, + { + "epoch": 3.212953876349362, + "grad_norm": 0.7557867169380188, + "learning_rate": 8.702850382201863e-05, + "loss": 0.0707, + "step": 49110 + }, + { + "epoch": 3.2136081125286227, + "grad_norm": 1.0037438869476318, + "learning_rate": 8.702233043480015e-05, + "loss": 0.0776, + "step": 49120 + }, + { + "epoch": 3.2142623487078836, + "grad_norm": 0.7470929026603699, + "learning_rate": 8.701615579798089e-05, + "loss": 0.0707, + "step": 49130 + }, + { + "epoch": 3.214916584887144, + "grad_norm": 0.8272511959075928, + "learning_rate": 8.70099799117693e-05, + "loss": 0.0767, + "step": 49140 + }, + { + "epoch": 3.215570821066405, + "grad_norm": 1.0610016584396362, + "learning_rate": 8.700380277637384e-05, + "loss": 0.0749, + "step": 49150 + }, + { + "epoch": 3.2162250572456657, + "grad_norm": 0.7501885890960693, + "learning_rate": 8.699762439200298e-05, + "loss": 0.0777, + "step": 49160 + }, + { + "epoch": 3.2168792934249266, + "grad_norm": 0.8301616311073303, + "learning_rate": 8.699144475886526e-05, + "loss": 0.0798, + "step": 49170 + }, + { + "epoch": 3.217533529604187, + "grad_norm": 0.8588626384735107, + "learning_rate": 8.698526387716928e-05, + "loss": 0.0745, + "step": 49180 + }, + { + "epoch": 3.2181877657834477, + "grad_norm": 1.1801152229309082, + "learning_rate": 8.697908174712363e-05, + "loss": 0.0882, + "step": 49190 + }, + { + "epoch": 3.2188420019627086, + "grad_norm": 0.9209977984428406, + "learning_rate": 8.6972898368937e-05, + "loss": 0.0757, + "step": 49200 + }, + { + "epoch": 3.219496238141969, + "grad_norm": 1.0483894348144531, + "learning_rate": 8.696671374281808e-05, + "loss": 0.0722, + "step": 49210 + }, + { + "epoch": 3.22015047432123, + "grad_norm": 0.9337593913078308, + "learning_rate": 8.696052786897563e-05, + "loss": 0.075, + "step": 49220 + }, + { + "epoch": 3.2208047105004907, + "grad_norm": 0.8857582807540894, + "learning_rate": 8.695434074761843e-05, + "loss": 0.0744, + "step": 49230 + }, + { + "epoch": 3.221458946679751, + "grad_norm": 0.8099195957183838, + "learning_rate": 8.69481523789553e-05, + "loss": 0.072, + "step": 49240 + }, + { + "epoch": 3.222113182859012, + "grad_norm": 0.8427746295928955, + "learning_rate": 8.694196276319514e-05, + "loss": 0.0747, + "step": 49250 + }, + { + "epoch": 3.2227674190382727, + "grad_norm": 0.8717663884162903, + "learning_rate": 8.693577190054685e-05, + "loss": 0.0792, + "step": 49260 + }, + { + "epoch": 3.2234216552175337, + "grad_norm": 0.8552377820014954, + "learning_rate": 8.69295797912194e-05, + "loss": 0.0777, + "step": 49270 + }, + { + "epoch": 3.224075891396794, + "grad_norm": 0.8015262484550476, + "learning_rate": 8.692338643542177e-05, + "loss": 0.0846, + "step": 49280 + }, + { + "epoch": 3.224730127576055, + "grad_norm": 0.7176341414451599, + "learning_rate": 8.691719183336302e-05, + "loss": 0.0658, + "step": 49290 + }, + { + "epoch": 3.2253843637553157, + "grad_norm": 0.8882294297218323, + "learning_rate": 8.69109959852522e-05, + "loss": 0.0737, + "step": 49300 + }, + { + "epoch": 3.226038599934576, + "grad_norm": 0.9010905623435974, + "learning_rate": 8.69047988912985e-05, + "loss": 0.0838, + "step": 49310 + }, + { + "epoch": 3.226692836113837, + "grad_norm": 0.8391245007514954, + "learning_rate": 8.689860055171104e-05, + "loss": 0.0727, + "step": 49320 + }, + { + "epoch": 3.2273470722930977, + "grad_norm": 1.213536024093628, + "learning_rate": 8.689240096669903e-05, + "loss": 0.0761, + "step": 49330 + }, + { + "epoch": 3.2280013084723587, + "grad_norm": 0.9631514549255371, + "learning_rate": 8.688620013647175e-05, + "loss": 0.0703, + "step": 49340 + }, + { + "epoch": 3.228655544651619, + "grad_norm": 0.8843837380409241, + "learning_rate": 8.687999806123847e-05, + "loss": 0.0657, + "step": 49350 + }, + { + "epoch": 3.2293097808308797, + "grad_norm": 0.8727587461471558, + "learning_rate": 8.687379474120852e-05, + "loss": 0.085, + "step": 49360 + }, + { + "epoch": 3.2299640170101407, + "grad_norm": 0.8236467838287354, + "learning_rate": 8.686759017659132e-05, + "loss": 0.0652, + "step": 49370 + }, + { + "epoch": 3.230618253189401, + "grad_norm": 0.876015841960907, + "learning_rate": 8.686138436759623e-05, + "loss": 0.0741, + "step": 49380 + }, + { + "epoch": 3.231272489368662, + "grad_norm": 0.8100466132164001, + "learning_rate": 8.685517731443278e-05, + "loss": 0.0748, + "step": 49390 + }, + { + "epoch": 3.2319267255479227, + "grad_norm": 0.8338639140129089, + "learning_rate": 8.684896901731041e-05, + "loss": 0.0731, + "step": 49400 + }, + { + "epoch": 3.2325809617271837, + "grad_norm": 0.8085967302322388, + "learning_rate": 8.684275947643872e-05, + "loss": 0.075, + "step": 49410 + }, + { + "epoch": 3.233235197906444, + "grad_norm": 0.8683192729949951, + "learning_rate": 8.683654869202726e-05, + "loss": 0.072, + "step": 49420 + }, + { + "epoch": 3.2338894340857047, + "grad_norm": 1.1093887090682983, + "learning_rate": 8.683033666428568e-05, + "loss": 0.0715, + "step": 49430 + }, + { + "epoch": 3.2345436702649657, + "grad_norm": 1.2019805908203125, + "learning_rate": 8.682412339342363e-05, + "loss": 0.0825, + "step": 49440 + }, + { + "epoch": 3.2351979064442262, + "grad_norm": 0.8940714001655579, + "learning_rate": 8.681790887965087e-05, + "loss": 0.0761, + "step": 49450 + }, + { + "epoch": 3.235852142623487, + "grad_norm": 0.9425767660140991, + "learning_rate": 8.681169312317709e-05, + "loss": 0.0733, + "step": 49460 + }, + { + "epoch": 3.2365063788027477, + "grad_norm": 0.9423227310180664, + "learning_rate": 8.680547612421215e-05, + "loss": 0.0888, + "step": 49470 + }, + { + "epoch": 3.2371606149820087, + "grad_norm": 1.0191290378570557, + "learning_rate": 8.679925788296586e-05, + "loss": 0.0749, + "step": 49480 + }, + { + "epoch": 3.237814851161269, + "grad_norm": 0.8777630925178528, + "learning_rate": 8.67930383996481e-05, + "loss": 0.083, + "step": 49490 + }, + { + "epoch": 3.2384690873405297, + "grad_norm": 1.0613380670547485, + "learning_rate": 8.678681767446882e-05, + "loss": 0.0801, + "step": 49500 + }, + { + "epoch": 3.2391233235197907, + "grad_norm": 0.8227213621139526, + "learning_rate": 8.678059570763794e-05, + "loss": 0.0692, + "step": 49510 + }, + { + "epoch": 3.2397775596990512, + "grad_norm": 0.8650736212730408, + "learning_rate": 8.677437249936552e-05, + "loss": 0.0787, + "step": 49520 + }, + { + "epoch": 3.240431795878312, + "grad_norm": 0.8299550414085388, + "learning_rate": 8.676814804986158e-05, + "loss": 0.0721, + "step": 49530 + }, + { + "epoch": 3.2410860320575727, + "grad_norm": 1.0536143779754639, + "learning_rate": 8.67619223593362e-05, + "loss": 0.074, + "step": 49540 + }, + { + "epoch": 3.2417402682368337, + "grad_norm": 0.9724963903427124, + "learning_rate": 8.675569542799953e-05, + "loss": 0.0775, + "step": 49550 + }, + { + "epoch": 3.2423945044160942, + "grad_norm": 1.0580730438232422, + "learning_rate": 8.674946725606176e-05, + "loss": 0.0856, + "step": 49560 + }, + { + "epoch": 3.2430487405953548, + "grad_norm": 0.9586568474769592, + "learning_rate": 8.674323784373308e-05, + "loss": 0.0783, + "step": 49570 + }, + { + "epoch": 3.2437029767746157, + "grad_norm": 0.8590267300605774, + "learning_rate": 8.673700719122375e-05, + "loss": 0.0748, + "step": 49580 + }, + { + "epoch": 3.2443572129538762, + "grad_norm": 0.9745254516601562, + "learning_rate": 8.673077529874409e-05, + "loss": 0.0857, + "step": 49590 + }, + { + "epoch": 3.245011449133137, + "grad_norm": 0.7989119291305542, + "learning_rate": 8.672454216650445e-05, + "loss": 0.0696, + "step": 49600 + }, + { + "epoch": 3.2456656853123977, + "grad_norm": 0.9528154730796814, + "learning_rate": 8.671830779471518e-05, + "loss": 0.082, + "step": 49610 + }, + { + "epoch": 3.2463199214916587, + "grad_norm": 0.8685365319252014, + "learning_rate": 8.671207218358672e-05, + "loss": 0.0709, + "step": 49620 + }, + { + "epoch": 3.2469741576709192, + "grad_norm": 0.7714129090309143, + "learning_rate": 8.670583533332957e-05, + "loss": 0.078, + "step": 49630 + }, + { + "epoch": 3.2476283938501798, + "grad_norm": 0.865516722202301, + "learning_rate": 8.669959724415419e-05, + "loss": 0.0676, + "step": 49640 + }, + { + "epoch": 3.2482826300294407, + "grad_norm": 0.8643177151679993, + "learning_rate": 8.669335791627116e-05, + "loss": 0.0699, + "step": 49650 + }, + { + "epoch": 3.2489368662087013, + "grad_norm": 0.9267588257789612, + "learning_rate": 8.668711734989105e-05, + "loss": 0.0746, + "step": 49660 + }, + { + "epoch": 3.2495911023879622, + "grad_norm": 1.2003788948059082, + "learning_rate": 8.668087554522455e-05, + "loss": 0.0673, + "step": 49670 + }, + { + "epoch": 3.2502453385672228, + "grad_norm": 0.869156002998352, + "learning_rate": 8.667463250248228e-05, + "loss": 0.0709, + "step": 49680 + }, + { + "epoch": 3.2508995747464837, + "grad_norm": 1.0042074918746948, + "learning_rate": 8.666838822187498e-05, + "loss": 0.0782, + "step": 49690 + }, + { + "epoch": 3.2515538109257442, + "grad_norm": 0.9066942930221558, + "learning_rate": 8.666214270361342e-05, + "loss": 0.0717, + "step": 49700 + }, + { + "epoch": 3.2522080471050048, + "grad_norm": 0.8262027502059937, + "learning_rate": 8.665589594790838e-05, + "loss": 0.0818, + "step": 49710 + }, + { + "epoch": 3.2528622832842657, + "grad_norm": 0.7925301194190979, + "learning_rate": 8.664964795497073e-05, + "loss": 0.0704, + "step": 49720 + }, + { + "epoch": 3.2535165194635263, + "grad_norm": 0.8457410335540771, + "learning_rate": 8.664339872501133e-05, + "loss": 0.0743, + "step": 49730 + }, + { + "epoch": 3.2541707556427872, + "grad_norm": 1.0192575454711914, + "learning_rate": 8.663714825824114e-05, + "loss": 0.0792, + "step": 49740 + }, + { + "epoch": 3.2548249918220478, + "grad_norm": 0.887332558631897, + "learning_rate": 8.66308965548711e-05, + "loss": 0.0744, + "step": 49750 + }, + { + "epoch": 3.2554792280013083, + "grad_norm": 0.9287620782852173, + "learning_rate": 8.662464361511224e-05, + "loss": 0.0757, + "step": 49760 + }, + { + "epoch": 3.2561334641805693, + "grad_norm": 0.9490354657173157, + "learning_rate": 8.661838943917561e-05, + "loss": 0.0802, + "step": 49770 + }, + { + "epoch": 3.25678770035983, + "grad_norm": 0.7195919752120972, + "learning_rate": 8.661213402727229e-05, + "loss": 0.0737, + "step": 49780 + }, + { + "epoch": 3.2574419365390908, + "grad_norm": 0.7042505145072937, + "learning_rate": 8.660587737961344e-05, + "loss": 0.0644, + "step": 49790 + }, + { + "epoch": 3.2580961727183513, + "grad_norm": 0.7407015562057495, + "learning_rate": 8.659961949641023e-05, + "loss": 0.067, + "step": 49800 + }, + { + "epoch": 3.258750408897612, + "grad_norm": 0.7478853464126587, + "learning_rate": 8.659336037787384e-05, + "loss": 0.0737, + "step": 49810 + }, + { + "epoch": 3.2594046450768728, + "grad_norm": 0.8127663135528564, + "learning_rate": 8.658710002421561e-05, + "loss": 0.0697, + "step": 49820 + }, + { + "epoch": 3.2600588812561333, + "grad_norm": 1.1231528520584106, + "learning_rate": 8.658083843564677e-05, + "loss": 0.0831, + "step": 49830 + }, + { + "epoch": 3.2607131174353943, + "grad_norm": 0.6892096400260925, + "learning_rate": 8.657457561237871e-05, + "loss": 0.0696, + "step": 49840 + }, + { + "epoch": 3.261367353614655, + "grad_norm": 0.8850003480911255, + "learning_rate": 8.656831155462281e-05, + "loss": 0.0721, + "step": 49850 + }, + { + "epoch": 3.2620215897939158, + "grad_norm": 0.7798917889595032, + "learning_rate": 8.656204626259048e-05, + "loss": 0.0768, + "step": 49860 + }, + { + "epoch": 3.2626758259731763, + "grad_norm": 0.934593915939331, + "learning_rate": 8.655577973649321e-05, + "loss": 0.0803, + "step": 49870 + }, + { + "epoch": 3.263330062152437, + "grad_norm": 0.9441463947296143, + "learning_rate": 8.65495119765425e-05, + "loss": 0.0735, + "step": 49880 + }, + { + "epoch": 3.263984298331698, + "grad_norm": 0.6787394881248474, + "learning_rate": 8.65432429829499e-05, + "loss": 0.0688, + "step": 49890 + }, + { + "epoch": 3.2646385345109583, + "grad_norm": 0.8261919021606445, + "learning_rate": 8.653697275592702e-05, + "loss": 0.0669, + "step": 49900 + }, + { + "epoch": 3.2652927706902193, + "grad_norm": 1.0204743146896362, + "learning_rate": 8.653070129568548e-05, + "loss": 0.0691, + "step": 49910 + }, + { + "epoch": 3.26594700686948, + "grad_norm": 1.0907118320465088, + "learning_rate": 8.652442860243698e-05, + "loss": 0.0808, + "step": 49920 + }, + { + "epoch": 3.2666012430487408, + "grad_norm": 1.026333212852478, + "learning_rate": 8.651815467639321e-05, + "loss": 0.0761, + "step": 49930 + }, + { + "epoch": 3.2672554792280013, + "grad_norm": 0.9486099481582642, + "learning_rate": 8.651187951776593e-05, + "loss": 0.0708, + "step": 49940 + }, + { + "epoch": 3.267909715407262, + "grad_norm": 0.9680077433586121, + "learning_rate": 8.6505603126767e-05, + "loss": 0.0742, + "step": 49950 + }, + { + "epoch": 3.268563951586523, + "grad_norm": 0.789247989654541, + "learning_rate": 8.649932550360821e-05, + "loss": 0.0803, + "step": 49960 + }, + { + "epoch": 3.2692181877657833, + "grad_norm": 0.725689709186554, + "learning_rate": 8.649304664850145e-05, + "loss": 0.0762, + "step": 49970 + }, + { + "epoch": 3.2698724239450443, + "grad_norm": 0.7912468314170837, + "learning_rate": 8.648676656165867e-05, + "loss": 0.0754, + "step": 49980 + }, + { + "epoch": 3.270526660124305, + "grad_norm": 0.6831502318382263, + "learning_rate": 8.648048524329182e-05, + "loss": 0.0736, + "step": 49990 + }, + { + "epoch": 3.271180896303566, + "grad_norm": 0.8818649053573608, + "learning_rate": 8.647420269361294e-05, + "loss": 0.0841, + "step": 50000 + }, + { + "epoch": 3.2718351324828263, + "grad_norm": 0.8740116953849792, + "learning_rate": 8.646791891283403e-05, + "loss": 0.07, + "step": 50010 + }, + { + "epoch": 3.272489368662087, + "grad_norm": 0.8219292759895325, + "learning_rate": 8.646163390116723e-05, + "loss": 0.0731, + "step": 50020 + }, + { + "epoch": 3.273143604841348, + "grad_norm": 0.7342281341552734, + "learning_rate": 8.645534765882469e-05, + "loss": 0.0729, + "step": 50030 + }, + { + "epoch": 3.2737978410206083, + "grad_norm": 0.6889681220054626, + "learning_rate": 8.644906018601852e-05, + "loss": 0.0697, + "step": 50040 + }, + { + "epoch": 3.2744520771998693, + "grad_norm": 0.7851772904396057, + "learning_rate": 8.6442771482961e-05, + "loss": 0.0777, + "step": 50050 + }, + { + "epoch": 3.27510631337913, + "grad_norm": 0.9841508865356445, + "learning_rate": 8.643648154986435e-05, + "loss": 0.0747, + "step": 50060 + }, + { + "epoch": 3.275760549558391, + "grad_norm": 0.8496854305267334, + "learning_rate": 8.64301903869409e-05, + "loss": 0.0708, + "step": 50070 + }, + { + "epoch": 3.2764147857376513, + "grad_norm": 0.8300930261611938, + "learning_rate": 8.642389799440298e-05, + "loss": 0.0692, + "step": 50080 + }, + { + "epoch": 3.277069021916912, + "grad_norm": 0.7653340697288513, + "learning_rate": 8.641760437246297e-05, + "loss": 0.0751, + "step": 50090 + }, + { + "epoch": 3.277723258096173, + "grad_norm": 0.7286269068717957, + "learning_rate": 8.641130952133332e-05, + "loss": 0.0665, + "step": 50100 + }, + { + "epoch": 3.2783774942754333, + "grad_norm": 0.9306923151016235, + "learning_rate": 8.64050134412265e-05, + "loss": 0.0827, + "step": 50110 + }, + { + "epoch": 3.2790317304546943, + "grad_norm": 0.7376532554626465, + "learning_rate": 8.639871613235495e-05, + "loss": 0.072, + "step": 50120 + }, + { + "epoch": 3.279685966633955, + "grad_norm": 0.8583968877792358, + "learning_rate": 8.639241759493131e-05, + "loss": 0.0756, + "step": 50130 + }, + { + "epoch": 3.280340202813216, + "grad_norm": 0.9432209134101868, + "learning_rate": 8.638611782916812e-05, + "loss": 0.0757, + "step": 50140 + }, + { + "epoch": 3.2809944389924763, + "grad_norm": 0.8525569438934326, + "learning_rate": 8.637981683527803e-05, + "loss": 0.0743, + "step": 50150 + }, + { + "epoch": 3.281648675171737, + "grad_norm": 1.0892137289047241, + "learning_rate": 8.637351461347371e-05, + "loss": 0.0712, + "step": 50160 + }, + { + "epoch": 3.282302911350998, + "grad_norm": 0.8420467376708984, + "learning_rate": 8.636721116396787e-05, + "loss": 0.0643, + "step": 50170 + }, + { + "epoch": 3.2829571475302584, + "grad_norm": 0.7865369319915771, + "learning_rate": 8.636090648697329e-05, + "loss": 0.0713, + "step": 50180 + }, + { + "epoch": 3.2836113837095193, + "grad_norm": 0.8081366419792175, + "learning_rate": 8.635460058270274e-05, + "loss": 0.0859, + "step": 50190 + }, + { + "epoch": 3.28426561988878, + "grad_norm": 0.8444221019744873, + "learning_rate": 8.63482934513691e-05, + "loss": 0.0721, + "step": 50200 + }, + { + "epoch": 3.2849198560680404, + "grad_norm": 0.9457107782363892, + "learning_rate": 8.634198509318521e-05, + "loss": 0.0678, + "step": 50210 + }, + { + "epoch": 3.2855740922473013, + "grad_norm": 0.908445417881012, + "learning_rate": 8.633567550836403e-05, + "loss": 0.0757, + "step": 50220 + }, + { + "epoch": 3.286228328426562, + "grad_norm": 0.9582734107971191, + "learning_rate": 8.63293646971185e-05, + "loss": 0.0725, + "step": 50230 + }, + { + "epoch": 3.286882564605823, + "grad_norm": 1.1066676378250122, + "learning_rate": 8.632305265966163e-05, + "loss": 0.0838, + "step": 50240 + }, + { + "epoch": 3.2875368007850834, + "grad_norm": 1.1428422927856445, + "learning_rate": 8.631673939620646e-05, + "loss": 0.0725, + "step": 50250 + }, + { + "epoch": 3.288191036964344, + "grad_norm": 0.8173267841339111, + "learning_rate": 8.631042490696612e-05, + "loss": 0.0736, + "step": 50260 + }, + { + "epoch": 3.288845273143605, + "grad_norm": 0.7522175908088684, + "learning_rate": 8.63041091921537e-05, + "loss": 0.067, + "step": 50270 + }, + { + "epoch": 3.2894995093228654, + "grad_norm": 0.7062971591949463, + "learning_rate": 8.629779225198238e-05, + "loss": 0.0683, + "step": 50280 + }, + { + "epoch": 3.2901537455021264, + "grad_norm": 0.9391506314277649, + "learning_rate": 8.629147408666537e-05, + "loss": 0.0705, + "step": 50290 + }, + { + "epoch": 3.290807981681387, + "grad_norm": 0.7575945854187012, + "learning_rate": 8.628515469641593e-05, + "loss": 0.0696, + "step": 50300 + }, + { + "epoch": 3.291462217860648, + "grad_norm": 0.7844806909561157, + "learning_rate": 8.627883408144737e-05, + "loss": 0.0746, + "step": 50310 + }, + { + "epoch": 3.2921164540399084, + "grad_norm": 0.7494459748268127, + "learning_rate": 8.627251224197302e-05, + "loss": 0.0704, + "step": 50320 + }, + { + "epoch": 3.292770690219169, + "grad_norm": 0.7377359867095947, + "learning_rate": 8.626618917820624e-05, + "loss": 0.0679, + "step": 50330 + }, + { + "epoch": 3.29342492639843, + "grad_norm": 0.7377490401268005, + "learning_rate": 8.625986489036048e-05, + "loss": 0.076, + "step": 50340 + }, + { + "epoch": 3.2940791625776904, + "grad_norm": 0.7678962349891663, + "learning_rate": 8.625353937864917e-05, + "loss": 0.071, + "step": 50350 + }, + { + "epoch": 3.2947333987569514, + "grad_norm": 0.9204226136207581, + "learning_rate": 8.624721264328584e-05, + "loss": 0.0675, + "step": 50360 + }, + { + "epoch": 3.295387634936212, + "grad_norm": 0.8501763343811035, + "learning_rate": 8.624088468448401e-05, + "loss": 0.0716, + "step": 50370 + }, + { + "epoch": 3.296041871115473, + "grad_norm": 1.0551090240478516, + "learning_rate": 8.623455550245727e-05, + "loss": 0.0702, + "step": 50380 + }, + { + "epoch": 3.2966961072947334, + "grad_norm": 0.9518636465072632, + "learning_rate": 8.622822509741928e-05, + "loss": 0.0756, + "step": 50390 + }, + { + "epoch": 3.297350343473994, + "grad_norm": 0.971017062664032, + "learning_rate": 8.622189346958365e-05, + "loss": 0.069, + "step": 50400 + }, + { + "epoch": 3.298004579653255, + "grad_norm": 0.8658537268638611, + "learning_rate": 8.621556061916414e-05, + "loss": 0.0779, + "step": 50410 + }, + { + "epoch": 3.2986588158325154, + "grad_norm": 0.9639972448348999, + "learning_rate": 8.620922654637446e-05, + "loss": 0.0717, + "step": 50420 + }, + { + "epoch": 3.2993130520117764, + "grad_norm": 0.9584673643112183, + "learning_rate": 8.620289125142845e-05, + "loss": 0.0691, + "step": 50430 + }, + { + "epoch": 3.299967288191037, + "grad_norm": 0.7982688546180725, + "learning_rate": 8.61965547345399e-05, + "loss": 0.0863, + "step": 50440 + }, + { + "epoch": 3.300621524370298, + "grad_norm": 1.0012849569320679, + "learning_rate": 8.619021699592271e-05, + "loss": 0.0813, + "step": 50450 + }, + { + "epoch": 3.3012757605495584, + "grad_norm": 0.9669886231422424, + "learning_rate": 8.618387803579076e-05, + "loss": 0.0789, + "step": 50460 + }, + { + "epoch": 3.301929996728819, + "grad_norm": 0.7313616871833801, + "learning_rate": 8.617753785435804e-05, + "loss": 0.0641, + "step": 50470 + }, + { + "epoch": 3.30258423290808, + "grad_norm": 0.7947266697883606, + "learning_rate": 8.617119645183856e-05, + "loss": 0.0751, + "step": 50480 + }, + { + "epoch": 3.3032384690873404, + "grad_norm": 0.8369613289833069, + "learning_rate": 8.616485382844631e-05, + "loss": 0.0857, + "step": 50490 + }, + { + "epoch": 3.3038927052666014, + "grad_norm": 1.0216795206069946, + "learning_rate": 8.615850998439542e-05, + "loss": 0.0753, + "step": 50500 + }, + { + "epoch": 3.304546941445862, + "grad_norm": 0.9088335633277893, + "learning_rate": 8.615216491989997e-05, + "loss": 0.0789, + "step": 50510 + }, + { + "epoch": 3.305201177625123, + "grad_norm": 0.9669744372367859, + "learning_rate": 8.614581863517414e-05, + "loss": 0.0717, + "step": 50520 + }, + { + "epoch": 3.3058554138043834, + "grad_norm": 0.8303861021995544, + "learning_rate": 8.613947113043215e-05, + "loss": 0.0739, + "step": 50530 + }, + { + "epoch": 3.306509649983644, + "grad_norm": 0.8657481074333191, + "learning_rate": 8.613312240588822e-05, + "loss": 0.0677, + "step": 50540 + }, + { + "epoch": 3.307163886162905, + "grad_norm": 0.8606223464012146, + "learning_rate": 8.612677246175665e-05, + "loss": 0.0825, + "step": 50550 + }, + { + "epoch": 3.3078181223421654, + "grad_norm": 0.9812949895858765, + "learning_rate": 8.612042129825177e-05, + "loss": 0.0715, + "step": 50560 + }, + { + "epoch": 3.3084723585214264, + "grad_norm": 0.8385498523712158, + "learning_rate": 8.611406891558793e-05, + "loss": 0.0658, + "step": 50570 + }, + { + "epoch": 3.309126594700687, + "grad_norm": 0.820543110370636, + "learning_rate": 8.610771531397957e-05, + "loss": 0.068, + "step": 50580 + }, + { + "epoch": 3.309780830879948, + "grad_norm": 0.8643449544906616, + "learning_rate": 8.61013604936411e-05, + "loss": 0.0748, + "step": 50590 + }, + { + "epoch": 3.3104350670592084, + "grad_norm": 0.8752486705780029, + "learning_rate": 8.609500445478704e-05, + "loss": 0.081, + "step": 50600 + }, + { + "epoch": 3.311089303238469, + "grad_norm": 0.8542597889900208, + "learning_rate": 8.608864719763192e-05, + "loss": 0.0713, + "step": 50610 + }, + { + "epoch": 3.31174353941773, + "grad_norm": 0.7838973999023438, + "learning_rate": 8.608228872239031e-05, + "loss": 0.0758, + "step": 50620 + }, + { + "epoch": 3.3123977755969904, + "grad_norm": 0.9226830005645752, + "learning_rate": 8.607592902927684e-05, + "loss": 0.0815, + "step": 50630 + }, + { + "epoch": 3.3130520117762514, + "grad_norm": 0.9006392955780029, + "learning_rate": 8.606956811850613e-05, + "loss": 0.0697, + "step": 50640 + }, + { + "epoch": 3.313706247955512, + "grad_norm": 0.9188740253448486, + "learning_rate": 8.606320599029292e-05, + "loss": 0.0697, + "step": 50650 + }, + { + "epoch": 3.3143604841347725, + "grad_norm": 0.8508504033088684, + "learning_rate": 8.605684264485192e-05, + "loss": 0.0752, + "step": 50660 + }, + { + "epoch": 3.3150147203140334, + "grad_norm": 0.8384444713592529, + "learning_rate": 8.605047808239791e-05, + "loss": 0.0719, + "step": 50670 + }, + { + "epoch": 3.315668956493294, + "grad_norm": 0.8773384094238281, + "learning_rate": 8.604411230314572e-05, + "loss": 0.0824, + "step": 50680 + }, + { + "epoch": 3.316323192672555, + "grad_norm": 0.7789053320884705, + "learning_rate": 8.603774530731023e-05, + "loss": 0.0652, + "step": 50690 + }, + { + "epoch": 3.3169774288518155, + "grad_norm": 0.7404478788375854, + "learning_rate": 8.60313770951063e-05, + "loss": 0.0704, + "step": 50700 + }, + { + "epoch": 3.317631665031076, + "grad_norm": 1.0024648904800415, + "learning_rate": 8.60250076667489e-05, + "loss": 0.0829, + "step": 50710 + }, + { + "epoch": 3.318285901210337, + "grad_norm": 0.7322053909301758, + "learning_rate": 8.601863702245303e-05, + "loss": 0.0716, + "step": 50720 + }, + { + "epoch": 3.3189401373895975, + "grad_norm": 1.0306543111801147, + "learning_rate": 8.601226516243368e-05, + "loss": 0.0719, + "step": 50730 + }, + { + "epoch": 3.3195943735688584, + "grad_norm": 0.8243963718414307, + "learning_rate": 8.600589208690595e-05, + "loss": 0.0726, + "step": 50740 + }, + { + "epoch": 3.320248609748119, + "grad_norm": 0.7126163244247437, + "learning_rate": 8.599951779608493e-05, + "loss": 0.072, + "step": 50750 + }, + { + "epoch": 3.32090284592738, + "grad_norm": 0.9243032932281494, + "learning_rate": 8.599314229018575e-05, + "loss": 0.0765, + "step": 50760 + }, + { + "epoch": 3.3215570821066405, + "grad_norm": 0.8897315859794617, + "learning_rate": 8.598676556942365e-05, + "loss": 0.0771, + "step": 50770 + }, + { + "epoch": 3.322211318285901, + "grad_norm": 0.8531565070152283, + "learning_rate": 8.598038763401382e-05, + "loss": 0.076, + "step": 50780 + }, + { + "epoch": 3.322865554465162, + "grad_norm": 0.7867346405982971, + "learning_rate": 8.597400848417156e-05, + "loss": 0.0847, + "step": 50790 + }, + { + "epoch": 3.3235197906444225, + "grad_norm": 0.8421074748039246, + "learning_rate": 8.596762812011216e-05, + "loss": 0.0792, + "step": 50800 + }, + { + "epoch": 3.3241740268236835, + "grad_norm": 0.8326323628425598, + "learning_rate": 8.596124654205097e-05, + "loss": 0.0737, + "step": 50810 + }, + { + "epoch": 3.324828263002944, + "grad_norm": 1.0936020612716675, + "learning_rate": 8.595486375020341e-05, + "loss": 0.0802, + "step": 50820 + }, + { + "epoch": 3.325482499182205, + "grad_norm": 1.0565087795257568, + "learning_rate": 8.59484797447849e-05, + "loss": 0.0741, + "step": 50830 + }, + { + "epoch": 3.3261367353614655, + "grad_norm": 0.8278268575668335, + "learning_rate": 8.594209452601092e-05, + "loss": 0.0665, + "step": 50840 + }, + { + "epoch": 3.326790971540726, + "grad_norm": 0.720306932926178, + "learning_rate": 8.593570809409698e-05, + "loss": 0.0753, + "step": 50850 + }, + { + "epoch": 3.327445207719987, + "grad_norm": 0.9338958859443665, + "learning_rate": 8.592932044925866e-05, + "loss": 0.0729, + "step": 50860 + }, + { + "epoch": 3.3280994438992475, + "grad_norm": 0.8932592868804932, + "learning_rate": 8.592293159171155e-05, + "loss": 0.0749, + "step": 50870 + }, + { + "epoch": 3.3287536800785085, + "grad_norm": 0.7628698945045471, + "learning_rate": 8.591654152167128e-05, + "loss": 0.0762, + "step": 50880 + }, + { + "epoch": 3.329407916257769, + "grad_norm": 0.855076789855957, + "learning_rate": 8.591015023935353e-05, + "loss": 0.0792, + "step": 50890 + }, + { + "epoch": 3.33006215243703, + "grad_norm": 0.8656195998191833, + "learning_rate": 8.590375774497406e-05, + "loss": 0.0678, + "step": 50900 + }, + { + "epoch": 3.3307163886162905, + "grad_norm": 1.1251041889190674, + "learning_rate": 8.589736403874858e-05, + "loss": 0.0817, + "step": 50910 + }, + { + "epoch": 3.331370624795551, + "grad_norm": 0.815820038318634, + "learning_rate": 8.589096912089292e-05, + "loss": 0.0795, + "step": 50920 + }, + { + "epoch": 3.332024860974812, + "grad_norm": 0.8184264302253723, + "learning_rate": 8.588457299162293e-05, + "loss": 0.0679, + "step": 50930 + }, + { + "epoch": 3.3326790971540725, + "grad_norm": 0.6833990216255188, + "learning_rate": 8.587817565115449e-05, + "loss": 0.063, + "step": 50940 + }, + { + "epoch": 3.3333333333333335, + "grad_norm": 0.7902950644493103, + "learning_rate": 8.587177709970353e-05, + "loss": 0.0683, + "step": 50950 + }, + { + "epoch": 3.333987569512594, + "grad_norm": 0.8151212334632874, + "learning_rate": 8.586537733748601e-05, + "loss": 0.0757, + "step": 50960 + }, + { + "epoch": 3.334641805691855, + "grad_norm": 0.8784273266792297, + "learning_rate": 8.585897636471796e-05, + "loss": 0.071, + "step": 50970 + }, + { + "epoch": 3.3352960418711155, + "grad_norm": 0.8391785621643066, + "learning_rate": 8.585257418161538e-05, + "loss": 0.0744, + "step": 50980 + }, + { + "epoch": 3.335950278050376, + "grad_norm": 1.0281047821044922, + "learning_rate": 8.584617078839443e-05, + "loss": 0.0855, + "step": 50990 + }, + { + "epoch": 3.336604514229637, + "grad_norm": 0.838431715965271, + "learning_rate": 8.58397661852712e-05, + "loss": 0.0777, + "step": 51000 + }, + { + "epoch": 3.3372587504088975, + "grad_norm": 0.7739060521125793, + "learning_rate": 8.583336037246186e-05, + "loss": 0.067, + "step": 51010 + }, + { + "epoch": 3.3379129865881585, + "grad_norm": 0.9541481733322144, + "learning_rate": 8.582695335018263e-05, + "loss": 0.0777, + "step": 51020 + }, + { + "epoch": 3.338567222767419, + "grad_norm": 0.9411327242851257, + "learning_rate": 8.582054511864977e-05, + "loss": 0.0823, + "step": 51030 + }, + { + "epoch": 3.33922145894668, + "grad_norm": 0.785683274269104, + "learning_rate": 8.581413567807956e-05, + "loss": 0.0685, + "step": 51040 + }, + { + "epoch": 3.3398756951259405, + "grad_norm": 0.7861221432685852, + "learning_rate": 8.580772502868835e-05, + "loss": 0.0741, + "step": 51050 + }, + { + "epoch": 3.340529931305201, + "grad_norm": 0.9560049176216125, + "learning_rate": 8.58013131706925e-05, + "loss": 0.0704, + "step": 51060 + }, + { + "epoch": 3.341184167484462, + "grad_norm": 0.7720146775245667, + "learning_rate": 8.579490010430846e-05, + "loss": 0.069, + "step": 51070 + }, + { + "epoch": 3.3418384036637225, + "grad_norm": 0.7846336364746094, + "learning_rate": 8.578848582975266e-05, + "loss": 0.074, + "step": 51080 + }, + { + "epoch": 3.3424926398429835, + "grad_norm": 0.9714375734329224, + "learning_rate": 8.57820703472416e-05, + "loss": 0.0714, + "step": 51090 + }, + { + "epoch": 3.343146876022244, + "grad_norm": 0.7882207036018372, + "learning_rate": 8.577565365699183e-05, + "loss": 0.0806, + "step": 51100 + }, + { + "epoch": 3.3438011122015046, + "grad_norm": 0.8583884835243225, + "learning_rate": 8.576923575921991e-05, + "loss": 0.076, + "step": 51110 + }, + { + "epoch": 3.3444553483807655, + "grad_norm": 1.0091041326522827, + "learning_rate": 8.576281665414249e-05, + "loss": 0.0838, + "step": 51120 + }, + { + "epoch": 3.345109584560026, + "grad_norm": 0.9896878004074097, + "learning_rate": 8.57563963419762e-05, + "loss": 0.0801, + "step": 51130 + }, + { + "epoch": 3.345763820739287, + "grad_norm": 0.9059678912162781, + "learning_rate": 8.574997482293778e-05, + "loss": 0.0756, + "step": 51140 + }, + { + "epoch": 3.3464180569185475, + "grad_norm": 0.8360334038734436, + "learning_rate": 8.574355209724393e-05, + "loss": 0.0782, + "step": 51150 + }, + { + "epoch": 3.347072293097808, + "grad_norm": 1.1059879064559937, + "learning_rate": 8.573712816511148e-05, + "loss": 0.0771, + "step": 51160 + }, + { + "epoch": 3.347726529277069, + "grad_norm": 0.9057727456092834, + "learning_rate": 8.57307030267572e-05, + "loss": 0.0817, + "step": 51170 + }, + { + "epoch": 3.3483807654563296, + "grad_norm": 0.7811506390571594, + "learning_rate": 8.572427668239802e-05, + "loss": 0.0817, + "step": 51180 + }, + { + "epoch": 3.3490350016355905, + "grad_norm": 0.927189290523529, + "learning_rate": 8.571784913225078e-05, + "loss": 0.0707, + "step": 51190 + }, + { + "epoch": 3.349689237814851, + "grad_norm": 0.7805935740470886, + "learning_rate": 8.571142037653249e-05, + "loss": 0.0763, + "step": 51200 + }, + { + "epoch": 3.350343473994112, + "grad_norm": 0.9339170455932617, + "learning_rate": 8.570499041546007e-05, + "loss": 0.0666, + "step": 51210 + }, + { + "epoch": 3.3509977101733726, + "grad_norm": 0.8374119997024536, + "learning_rate": 8.56985592492506e-05, + "loss": 0.0721, + "step": 51220 + }, + { + "epoch": 3.351651946352633, + "grad_norm": 1.1060223579406738, + "learning_rate": 8.569212687812113e-05, + "loss": 0.0828, + "step": 51230 + }, + { + "epoch": 3.352306182531894, + "grad_norm": 0.7476975917816162, + "learning_rate": 8.568569330228879e-05, + "loss": 0.0794, + "step": 51240 + }, + { + "epoch": 3.3529604187111546, + "grad_norm": 0.9365630149841309, + "learning_rate": 8.56792585219707e-05, + "loss": 0.0771, + "step": 51250 + }, + { + "epoch": 3.3536146548904155, + "grad_norm": 0.8895155787467957, + "learning_rate": 8.567282253738407e-05, + "loss": 0.0798, + "step": 51260 + }, + { + "epoch": 3.354268891069676, + "grad_norm": 0.9719933271408081, + "learning_rate": 8.566638534874612e-05, + "loss": 0.0788, + "step": 51270 + }, + { + "epoch": 3.354923127248937, + "grad_norm": 0.9939890503883362, + "learning_rate": 8.565994695627411e-05, + "loss": 0.0669, + "step": 51280 + }, + { + "epoch": 3.3555773634281976, + "grad_norm": 0.8980028033256531, + "learning_rate": 8.565350736018539e-05, + "loss": 0.0667, + "step": 51290 + }, + { + "epoch": 3.356231599607458, + "grad_norm": 0.8218081593513489, + "learning_rate": 8.564706656069726e-05, + "loss": 0.0762, + "step": 51300 + }, + { + "epoch": 3.356885835786719, + "grad_norm": 0.9311951398849487, + "learning_rate": 8.564062455802718e-05, + "loss": 0.077, + "step": 51310 + }, + { + "epoch": 3.3575400719659796, + "grad_norm": 0.7359833121299744, + "learning_rate": 8.563418135239254e-05, + "loss": 0.0766, + "step": 51320 + }, + { + "epoch": 3.3581943081452406, + "grad_norm": 0.6619185209274292, + "learning_rate": 8.56277369440108e-05, + "loss": 0.0756, + "step": 51330 + }, + { + "epoch": 3.358848544324501, + "grad_norm": 0.9243825078010559, + "learning_rate": 8.562129133309953e-05, + "loss": 0.0796, + "step": 51340 + }, + { + "epoch": 3.359502780503762, + "grad_norm": 0.7941439747810364, + "learning_rate": 8.561484451987626e-05, + "loss": 0.0657, + "step": 51350 + }, + { + "epoch": 3.3601570166830226, + "grad_norm": 1.0141104459762573, + "learning_rate": 8.560839650455857e-05, + "loss": 0.073, + "step": 51360 + }, + { + "epoch": 3.360811252862283, + "grad_norm": 0.7483166456222534, + "learning_rate": 8.560194728736412e-05, + "loss": 0.0696, + "step": 51370 + }, + { + "epoch": 3.361465489041544, + "grad_norm": 0.6753225922584534, + "learning_rate": 8.559549686851057e-05, + "loss": 0.0641, + "step": 51380 + }, + { + "epoch": 3.3621197252208046, + "grad_norm": 0.84939044713974, + "learning_rate": 8.558904524821565e-05, + "loss": 0.0747, + "step": 51390 + }, + { + "epoch": 3.3627739614000656, + "grad_norm": 0.9355828166007996, + "learning_rate": 8.558259242669713e-05, + "loss": 0.0762, + "step": 51400 + }, + { + "epoch": 3.363428197579326, + "grad_norm": 1.0123666524887085, + "learning_rate": 8.557613840417277e-05, + "loss": 0.0796, + "step": 51410 + }, + { + "epoch": 3.364082433758587, + "grad_norm": 0.8886246085166931, + "learning_rate": 8.556968318086047e-05, + "loss": 0.0722, + "step": 51420 + }, + { + "epoch": 3.3647366699378476, + "grad_norm": 0.7857908606529236, + "learning_rate": 8.556322675697806e-05, + "loss": 0.0708, + "step": 51430 + }, + { + "epoch": 3.365390906117108, + "grad_norm": 0.9863922595977783, + "learning_rate": 8.555676913274349e-05, + "loss": 0.0667, + "step": 51440 + }, + { + "epoch": 3.366045142296369, + "grad_norm": 1.3981025218963623, + "learning_rate": 8.55503103083747e-05, + "loss": 0.0731, + "step": 51450 + }, + { + "epoch": 3.3666993784756296, + "grad_norm": 0.8313714265823364, + "learning_rate": 8.55438502840897e-05, + "loss": 0.0696, + "step": 51460 + }, + { + "epoch": 3.3673536146548906, + "grad_norm": 0.7925471067428589, + "learning_rate": 8.553738906010654e-05, + "loss": 0.0685, + "step": 51470 + }, + { + "epoch": 3.368007850834151, + "grad_norm": 0.9922884106636047, + "learning_rate": 8.55309266366433e-05, + "loss": 0.0907, + "step": 51480 + }, + { + "epoch": 3.368662087013412, + "grad_norm": 0.8721675276756287, + "learning_rate": 8.55244630139181e-05, + "loss": 0.0703, + "step": 51490 + }, + { + "epoch": 3.3693163231926726, + "grad_norm": 0.9497895240783691, + "learning_rate": 8.551799819214912e-05, + "loss": 0.0769, + "step": 51500 + }, + { + "epoch": 3.369970559371933, + "grad_norm": 0.9085605144500732, + "learning_rate": 8.551153217155453e-05, + "loss": 0.0754, + "step": 51510 + }, + { + "epoch": 3.370624795551194, + "grad_norm": 0.8060374855995178, + "learning_rate": 8.550506495235262e-05, + "loss": 0.0747, + "step": 51520 + }, + { + "epoch": 3.3712790317304546, + "grad_norm": 0.9739006161689758, + "learning_rate": 8.549859653476164e-05, + "loss": 0.0746, + "step": 51530 + }, + { + "epoch": 3.3719332679097156, + "grad_norm": 0.8732509613037109, + "learning_rate": 8.549212691899993e-05, + "loss": 0.0795, + "step": 51540 + }, + { + "epoch": 3.372587504088976, + "grad_norm": 0.7843785285949707, + "learning_rate": 8.548565610528585e-05, + "loss": 0.0675, + "step": 51550 + }, + { + "epoch": 3.3732417402682366, + "grad_norm": 0.7273816466331482, + "learning_rate": 8.547918409383782e-05, + "loss": 0.0759, + "step": 51560 + }, + { + "epoch": 3.3738959764474976, + "grad_norm": 0.7831087708473206, + "learning_rate": 8.547271088487427e-05, + "loss": 0.0703, + "step": 51570 + }, + { + "epoch": 3.374550212626758, + "grad_norm": 0.7511657476425171, + "learning_rate": 8.54662364786137e-05, + "loss": 0.0706, + "step": 51580 + }, + { + "epoch": 3.375204448806019, + "grad_norm": 0.9097614288330078, + "learning_rate": 8.545976087527463e-05, + "loss": 0.068, + "step": 51590 + }, + { + "epoch": 3.3758586849852796, + "grad_norm": 0.7851161956787109, + "learning_rate": 8.545328407507565e-05, + "loss": 0.074, + "step": 51600 + }, + { + "epoch": 3.37651292116454, + "grad_norm": 0.9306683540344238, + "learning_rate": 8.544680607823534e-05, + "loss": 0.0856, + "step": 51610 + }, + { + "epoch": 3.377167157343801, + "grad_norm": 0.841254472732544, + "learning_rate": 8.544032688497236e-05, + "loss": 0.0771, + "step": 51620 + }, + { + "epoch": 3.3778213935230617, + "grad_norm": 1.2638838291168213, + "learning_rate": 8.543384649550543e-05, + "loss": 0.0771, + "step": 51630 + }, + { + "epoch": 3.3784756297023226, + "grad_norm": 0.802693784236908, + "learning_rate": 8.542736491005322e-05, + "loss": 0.0809, + "step": 51640 + }, + { + "epoch": 3.379129865881583, + "grad_norm": 0.7496511340141296, + "learning_rate": 8.542088212883454e-05, + "loss": 0.0827, + "step": 51650 + }, + { + "epoch": 3.379784102060844, + "grad_norm": 0.8439844250679016, + "learning_rate": 8.541439815206819e-05, + "loss": 0.0753, + "step": 51660 + }, + { + "epoch": 3.3804383382401046, + "grad_norm": 0.7389265894889832, + "learning_rate": 8.540791297997304e-05, + "loss": 0.0754, + "step": 51670 + }, + { + "epoch": 3.381092574419365, + "grad_norm": 0.983440101146698, + "learning_rate": 8.540142661276796e-05, + "loss": 0.067, + "step": 51680 + }, + { + "epoch": 3.381746810598626, + "grad_norm": 0.8338766694068909, + "learning_rate": 8.539493905067189e-05, + "loss": 0.0833, + "step": 51690 + }, + { + "epoch": 3.3824010467778867, + "grad_norm": 0.9599545001983643, + "learning_rate": 8.538845029390378e-05, + "loss": 0.0757, + "step": 51700 + }, + { + "epoch": 3.3830552829571476, + "grad_norm": 0.7370075583457947, + "learning_rate": 8.538196034268268e-05, + "loss": 0.07, + "step": 51710 + }, + { + "epoch": 3.383709519136408, + "grad_norm": 0.8070911765098572, + "learning_rate": 8.537546919722764e-05, + "loss": 0.0717, + "step": 51720 + }, + { + "epoch": 3.384363755315669, + "grad_norm": 0.8971413373947144, + "learning_rate": 8.536897685775772e-05, + "loss": 0.0659, + "step": 51730 + }, + { + "epoch": 3.3850179914949297, + "grad_norm": 0.9960509538650513, + "learning_rate": 8.536248332449207e-05, + "loss": 0.0804, + "step": 51740 + }, + { + "epoch": 3.38567222767419, + "grad_norm": 0.7966985106468201, + "learning_rate": 8.535598859764987e-05, + "loss": 0.0661, + "step": 51750 + }, + { + "epoch": 3.386326463853451, + "grad_norm": 0.9922680854797363, + "learning_rate": 8.534949267745034e-05, + "loss": 0.0726, + "step": 51760 + }, + { + "epoch": 3.3869807000327117, + "grad_norm": 0.8376561403274536, + "learning_rate": 8.534299556411271e-05, + "loss": 0.0679, + "step": 51770 + }, + { + "epoch": 3.3876349362119726, + "grad_norm": 0.915437638759613, + "learning_rate": 8.53364972578563e-05, + "loss": 0.0787, + "step": 51780 + }, + { + "epoch": 3.388289172391233, + "grad_norm": 0.7827186584472656, + "learning_rate": 8.532999775890043e-05, + "loss": 0.0735, + "step": 51790 + }, + { + "epoch": 3.388943408570494, + "grad_norm": 0.7581836581230164, + "learning_rate": 8.532349706746447e-05, + "loss": 0.0699, + "step": 51800 + }, + { + "epoch": 3.3895976447497547, + "grad_norm": 0.7785212397575378, + "learning_rate": 8.531699518376787e-05, + "loss": 0.0668, + "step": 51810 + }, + { + "epoch": 3.390251880929015, + "grad_norm": 0.9209142327308655, + "learning_rate": 8.531049210803003e-05, + "loss": 0.0723, + "step": 51820 + }, + { + "epoch": 3.390906117108276, + "grad_norm": 0.8338945508003235, + "learning_rate": 8.530398784047051e-05, + "loss": 0.081, + "step": 51830 + }, + { + "epoch": 3.3915603532875367, + "grad_norm": 0.8179078102111816, + "learning_rate": 8.529748238130879e-05, + "loss": 0.0715, + "step": 51840 + }, + { + "epoch": 3.3922145894667977, + "grad_norm": 0.7469951510429382, + "learning_rate": 8.529097573076447e-05, + "loss": 0.0812, + "step": 51850 + }, + { + "epoch": 3.392868825646058, + "grad_norm": 0.8803715705871582, + "learning_rate": 8.528446788905718e-05, + "loss": 0.0612, + "step": 51860 + }, + { + "epoch": 3.393523061825319, + "grad_norm": 0.7448040843009949, + "learning_rate": 8.527795885640655e-05, + "loss": 0.0772, + "step": 51870 + }, + { + "epoch": 3.3941772980045797, + "grad_norm": 0.9333431124687195, + "learning_rate": 8.527144863303227e-05, + "loss": 0.0727, + "step": 51880 + }, + { + "epoch": 3.39483153418384, + "grad_norm": 0.8752960562705994, + "learning_rate": 8.526493721915412e-05, + "loss": 0.0793, + "step": 51890 + }, + { + "epoch": 3.395485770363101, + "grad_norm": 0.9031022191047668, + "learning_rate": 8.525842461499185e-05, + "loss": 0.0699, + "step": 51900 + }, + { + "epoch": 3.3961400065423617, + "grad_norm": 0.8795397281646729, + "learning_rate": 8.525191082076527e-05, + "loss": 0.085, + "step": 51910 + }, + { + "epoch": 3.3967942427216227, + "grad_norm": 0.88941890001297, + "learning_rate": 8.524539583669426e-05, + "loss": 0.0742, + "step": 51920 + }, + { + "epoch": 3.397448478900883, + "grad_norm": 0.9229047894477844, + "learning_rate": 8.52388796629987e-05, + "loss": 0.0754, + "step": 51930 + }, + { + "epoch": 3.398102715080144, + "grad_norm": 0.9691248536109924, + "learning_rate": 8.523236229989855e-05, + "loss": 0.0729, + "step": 51940 + }, + { + "epoch": 3.3987569512594047, + "grad_norm": 0.8035882711410522, + "learning_rate": 8.522584374761375e-05, + "loss": 0.0768, + "step": 51950 + }, + { + "epoch": 3.399411187438665, + "grad_norm": 0.7715499997138977, + "learning_rate": 8.521932400636434e-05, + "loss": 0.075, + "step": 51960 + }, + { + "epoch": 3.400065423617926, + "grad_norm": 0.75547194480896, + "learning_rate": 8.52128030763704e-05, + "loss": 0.0622, + "step": 51970 + }, + { + "epoch": 3.4007196597971867, + "grad_norm": 1.1841288805007935, + "learning_rate": 8.520628095785199e-05, + "loss": 0.0721, + "step": 51980 + }, + { + "epoch": 3.4013738959764477, + "grad_norm": 0.8948665261268616, + "learning_rate": 8.519975765102927e-05, + "loss": 0.0674, + "step": 51990 + }, + { + "epoch": 3.402028132155708, + "grad_norm": 0.9164159297943115, + "learning_rate": 8.519323315612242e-05, + "loss": 0.0703, + "step": 52000 + }, + { + "epoch": 3.4026823683349687, + "grad_norm": 0.8155785799026489, + "learning_rate": 8.518670747335165e-05, + "loss": 0.0767, + "step": 52010 + }, + { + "epoch": 3.4033366045142297, + "grad_norm": 0.8073797821998596, + "learning_rate": 8.518018060293722e-05, + "loss": 0.0724, + "step": 52020 + }, + { + "epoch": 3.4039908406934902, + "grad_norm": 0.816716194152832, + "learning_rate": 8.517365254509942e-05, + "loss": 0.0661, + "step": 52030 + }, + { + "epoch": 3.404645076872751, + "grad_norm": 0.8661830425262451, + "learning_rate": 8.516712330005862e-05, + "loss": 0.0731, + "step": 52040 + }, + { + "epoch": 3.4052993130520117, + "grad_norm": 0.8978433012962341, + "learning_rate": 8.516059286803517e-05, + "loss": 0.0668, + "step": 52050 + }, + { + "epoch": 3.4059535492312722, + "grad_norm": 0.892529308795929, + "learning_rate": 8.515406124924949e-05, + "loss": 0.082, + "step": 52060 + }, + { + "epoch": 3.406607785410533, + "grad_norm": 0.6987180709838867, + "learning_rate": 8.514752844392206e-05, + "loss": 0.0676, + "step": 52070 + }, + { + "epoch": 3.4072620215897937, + "grad_norm": 0.8603876829147339, + "learning_rate": 8.514099445227336e-05, + "loss": 0.0658, + "step": 52080 + }, + { + "epoch": 3.4079162577690547, + "grad_norm": 0.7895892858505249, + "learning_rate": 8.513445927452396e-05, + "loss": 0.0637, + "step": 52090 + }, + { + "epoch": 3.4085704939483152, + "grad_norm": 0.8536374568939209, + "learning_rate": 8.51279229108944e-05, + "loss": 0.0765, + "step": 52100 + }, + { + "epoch": 3.409224730127576, + "grad_norm": 1.0041425228118896, + "learning_rate": 8.51213853616053e-05, + "loss": 0.0794, + "step": 52110 + }, + { + "epoch": 3.4098789663068367, + "grad_norm": 0.9329466223716736, + "learning_rate": 8.511484662687737e-05, + "loss": 0.0693, + "step": 52120 + }, + { + "epoch": 3.4105332024860973, + "grad_norm": 0.9463549852371216, + "learning_rate": 8.510830670693124e-05, + "loss": 0.073, + "step": 52130 + }, + { + "epoch": 3.4111874386653582, + "grad_norm": 0.8371836543083191, + "learning_rate": 8.51017656019877e-05, + "loss": 0.0707, + "step": 52140 + }, + { + "epoch": 3.4118416748446188, + "grad_norm": 1.1609052419662476, + "learning_rate": 8.50952233122675e-05, + "loss": 0.0705, + "step": 52150 + }, + { + "epoch": 3.4124959110238797, + "grad_norm": 0.9662759900093079, + "learning_rate": 8.50886798379915e-05, + "loss": 0.0732, + "step": 52160 + }, + { + "epoch": 3.4131501472031402, + "grad_norm": 0.9604242444038391, + "learning_rate": 8.50821351793805e-05, + "loss": 0.0683, + "step": 52170 + }, + { + "epoch": 3.413804383382401, + "grad_norm": 0.8330959677696228, + "learning_rate": 8.507558933665545e-05, + "loss": 0.0712, + "step": 52180 + }, + { + "epoch": 3.4144586195616617, + "grad_norm": 0.7776506543159485, + "learning_rate": 8.506904231003726e-05, + "loss": 0.0555, + "step": 52190 + }, + { + "epoch": 3.4151128557409223, + "grad_norm": 0.8300237059593201, + "learning_rate": 8.506249409974694e-05, + "loss": 0.0837, + "step": 52200 + }, + { + "epoch": 3.4157670919201832, + "grad_norm": 0.8520910143852234, + "learning_rate": 8.505594470600546e-05, + "loss": 0.0696, + "step": 52210 + }, + { + "epoch": 3.4164213280994438, + "grad_norm": 0.8499581217765808, + "learning_rate": 8.504939412903394e-05, + "loss": 0.0756, + "step": 52220 + }, + { + "epoch": 3.4170755642787047, + "grad_norm": 1.0196528434753418, + "learning_rate": 8.504284236905342e-05, + "loss": 0.079, + "step": 52230 + }, + { + "epoch": 3.4177298004579653, + "grad_norm": 1.0071736574172974, + "learning_rate": 8.503628942628508e-05, + "loss": 0.0836, + "step": 52240 + }, + { + "epoch": 3.4183840366372262, + "grad_norm": 0.8221054673194885, + "learning_rate": 8.502973530095008e-05, + "loss": 0.0666, + "step": 52250 + }, + { + "epoch": 3.4190382728164868, + "grad_norm": 0.9382101893424988, + "learning_rate": 8.502317999326965e-05, + "loss": 0.0742, + "step": 52260 + }, + { + "epoch": 3.4196925089957473, + "grad_norm": 0.9391849637031555, + "learning_rate": 8.501662350346505e-05, + "loss": 0.0725, + "step": 52270 + }, + { + "epoch": 3.4203467451750083, + "grad_norm": 0.7886925339698792, + "learning_rate": 8.501006583175757e-05, + "loss": 0.0861, + "step": 52280 + }, + { + "epoch": 3.4210009813542688, + "grad_norm": 0.7476629018783569, + "learning_rate": 8.500350697836855e-05, + "loss": 0.0649, + "step": 52290 + }, + { + "epoch": 3.4216552175335297, + "grad_norm": 1.02727472782135, + "learning_rate": 8.499694694351936e-05, + "loss": 0.0923, + "step": 52300 + }, + { + "epoch": 3.4223094537127903, + "grad_norm": 1.176798939704895, + "learning_rate": 8.499038572743144e-05, + "loss": 0.0644, + "step": 52310 + }, + { + "epoch": 3.4229636898920512, + "grad_norm": 0.8501316905021667, + "learning_rate": 8.498382333032622e-05, + "loss": 0.0703, + "step": 52320 + }, + { + "epoch": 3.4236179260713118, + "grad_norm": 0.7579331994056702, + "learning_rate": 8.497725975242523e-05, + "loss": 0.0661, + "step": 52330 + }, + { + "epoch": 3.4242721622505723, + "grad_norm": 0.8716086149215698, + "learning_rate": 8.497069499394998e-05, + "loss": 0.0781, + "step": 52340 + }, + { + "epoch": 3.4249263984298333, + "grad_norm": 1.071758508682251, + "learning_rate": 8.496412905512207e-05, + "loss": 0.0681, + "step": 52350 + }, + { + "epoch": 3.425580634609094, + "grad_norm": 0.8886385560035706, + "learning_rate": 8.49575619361631e-05, + "loss": 0.0763, + "step": 52360 + }, + { + "epoch": 3.4262348707883548, + "grad_norm": 0.9664177298545837, + "learning_rate": 8.495099363729472e-05, + "loss": 0.0768, + "step": 52370 + }, + { + "epoch": 3.4268891069676153, + "grad_norm": 0.7250670194625854, + "learning_rate": 8.494442415873868e-05, + "loss": 0.0724, + "step": 52380 + }, + { + "epoch": 3.4275433431468763, + "grad_norm": 0.9024300575256348, + "learning_rate": 8.493785350071665e-05, + "loss": 0.0842, + "step": 52390 + }, + { + "epoch": 3.428197579326137, + "grad_norm": 0.9288867712020874, + "learning_rate": 8.493128166345046e-05, + "loss": 0.0703, + "step": 52400 + }, + { + "epoch": 3.4288518155053973, + "grad_norm": 0.7886371612548828, + "learning_rate": 8.492470864716188e-05, + "loss": 0.0677, + "step": 52410 + }, + { + "epoch": 3.4295060516846583, + "grad_norm": 0.9110251665115356, + "learning_rate": 8.491813445207282e-05, + "loss": 0.0747, + "step": 52420 + }, + { + "epoch": 3.430160287863919, + "grad_norm": 0.7221360802650452, + "learning_rate": 8.491155907840511e-05, + "loss": 0.067, + "step": 52430 + }, + { + "epoch": 3.4308145240431798, + "grad_norm": 0.6658129096031189, + "learning_rate": 8.490498252638074e-05, + "loss": 0.0795, + "step": 52440 + }, + { + "epoch": 3.4314687602224403, + "grad_norm": 0.8011776208877563, + "learning_rate": 8.489840479622166e-05, + "loss": 0.068, + "step": 52450 + }, + { + "epoch": 3.432122996401701, + "grad_norm": 0.8690958619117737, + "learning_rate": 8.48918258881499e-05, + "loss": 0.0713, + "step": 52460 + }, + { + "epoch": 3.432777232580962, + "grad_norm": 0.8399226069450378, + "learning_rate": 8.488524580238752e-05, + "loss": 0.0726, + "step": 52470 + }, + { + "epoch": 3.4334314687602223, + "grad_norm": 0.7656417489051819, + "learning_rate": 8.487866453915658e-05, + "loss": 0.0731, + "step": 52480 + }, + { + "epoch": 3.4340857049394833, + "grad_norm": 0.8498892784118652, + "learning_rate": 8.487208209867928e-05, + "loss": 0.0818, + "step": 52490 + }, + { + "epoch": 3.434739941118744, + "grad_norm": 0.9708839058876038, + "learning_rate": 8.48654984811777e-05, + "loss": 0.0719, + "step": 52500 + }, + { + "epoch": 3.4353941772980043, + "grad_norm": 0.9287952184677124, + "learning_rate": 8.485891368687415e-05, + "loss": 0.0764, + "step": 52510 + }, + { + "epoch": 3.4360484134772653, + "grad_norm": 0.7302289605140686, + "learning_rate": 8.485232771599081e-05, + "loss": 0.0717, + "step": 52520 + }, + { + "epoch": 3.436702649656526, + "grad_norm": 0.9905624985694885, + "learning_rate": 8.484574056875003e-05, + "loss": 0.0685, + "step": 52530 + }, + { + "epoch": 3.437356885835787, + "grad_norm": 0.8764323592185974, + "learning_rate": 8.483915224537411e-05, + "loss": 0.0694, + "step": 52540 + }, + { + "epoch": 3.4380111220150473, + "grad_norm": 0.85755854845047, + "learning_rate": 8.483256274608544e-05, + "loss": 0.0723, + "step": 52550 + }, + { + "epoch": 3.4386653581943083, + "grad_norm": 0.8264038562774658, + "learning_rate": 8.482597207110642e-05, + "loss": 0.0667, + "step": 52560 + }, + { + "epoch": 3.439319594373569, + "grad_norm": 0.8556039333343506, + "learning_rate": 8.481938022065951e-05, + "loss": 0.0814, + "step": 52570 + }, + { + "epoch": 3.4399738305528293, + "grad_norm": 0.8458713293075562, + "learning_rate": 8.48127871949672e-05, + "loss": 0.0724, + "step": 52580 + }, + { + "epoch": 3.4406280667320903, + "grad_norm": 0.9362344145774841, + "learning_rate": 8.480619299425202e-05, + "loss": 0.0773, + "step": 52590 + }, + { + "epoch": 3.441282302911351, + "grad_norm": 1.080499291419983, + "learning_rate": 8.479959761873655e-05, + "loss": 0.0651, + "step": 52600 + }, + { + "epoch": 3.441936539090612, + "grad_norm": 1.0057145357131958, + "learning_rate": 8.479300106864338e-05, + "loss": 0.0717, + "step": 52610 + }, + { + "epoch": 3.4425907752698723, + "grad_norm": 0.8773415684700012, + "learning_rate": 8.478640334419519e-05, + "loss": 0.0668, + "step": 52620 + }, + { + "epoch": 3.4432450114491333, + "grad_norm": 0.7790507674217224, + "learning_rate": 8.477980444561465e-05, + "loss": 0.0702, + "step": 52630 + }, + { + "epoch": 3.443899247628394, + "grad_norm": 0.7044256925582886, + "learning_rate": 8.47732043731245e-05, + "loss": 0.0746, + "step": 52640 + }, + { + "epoch": 3.4445534838076544, + "grad_norm": 0.7038286924362183, + "learning_rate": 8.476660312694751e-05, + "loss": 0.0774, + "step": 52650 + }, + { + "epoch": 3.4452077199869153, + "grad_norm": 0.8981152772903442, + "learning_rate": 8.476000070730647e-05, + "loss": 0.0802, + "step": 52660 + }, + { + "epoch": 3.445861956166176, + "grad_norm": 0.8281787633895874, + "learning_rate": 8.475339711442428e-05, + "loss": 0.0639, + "step": 52670 + }, + { + "epoch": 3.446516192345437, + "grad_norm": 0.7848443388938904, + "learning_rate": 8.474679234852377e-05, + "loss": 0.0688, + "step": 52680 + }, + { + "epoch": 3.4471704285246973, + "grad_norm": 0.9246609210968018, + "learning_rate": 8.474018640982789e-05, + "loss": 0.0724, + "step": 52690 + }, + { + "epoch": 3.4478246647039583, + "grad_norm": 0.9001889228820801, + "learning_rate": 8.473357929855958e-05, + "loss": 0.0872, + "step": 52700 + }, + { + "epoch": 3.448478900883219, + "grad_norm": 0.7608310580253601, + "learning_rate": 8.472697101494192e-05, + "loss": 0.076, + "step": 52710 + }, + { + "epoch": 3.4491331370624794, + "grad_norm": 0.9559541940689087, + "learning_rate": 8.472036155919791e-05, + "loss": 0.071, + "step": 52720 + }, + { + "epoch": 3.4497873732417403, + "grad_norm": 0.8626623153686523, + "learning_rate": 8.471375093155061e-05, + "loss": 0.0722, + "step": 52730 + }, + { + "epoch": 3.450441609421001, + "grad_norm": 0.6806735992431641, + "learning_rate": 8.470713913222321e-05, + "loss": 0.0697, + "step": 52740 + }, + { + "epoch": 3.451095845600262, + "grad_norm": 0.9566436409950256, + "learning_rate": 8.470052616143883e-05, + "loss": 0.0716, + "step": 52750 + }, + { + "epoch": 3.4517500817795224, + "grad_norm": 0.8389673829078674, + "learning_rate": 8.469391201942068e-05, + "loss": 0.0661, + "step": 52760 + }, + { + "epoch": 3.4524043179587833, + "grad_norm": 0.8601690530776978, + "learning_rate": 8.468729670639201e-05, + "loss": 0.0764, + "step": 52770 + }, + { + "epoch": 3.453058554138044, + "grad_norm": 0.8300301432609558, + "learning_rate": 8.468068022257611e-05, + "loss": 0.0793, + "step": 52780 + }, + { + "epoch": 3.4537127903173044, + "grad_norm": 0.8958084583282471, + "learning_rate": 8.46740625681963e-05, + "loss": 0.0737, + "step": 52790 + }, + { + "epoch": 3.4543670264965654, + "grad_norm": 0.9533824920654297, + "learning_rate": 8.466744374347593e-05, + "loss": 0.0647, + "step": 52800 + }, + { + "epoch": 3.455021262675826, + "grad_norm": 0.7993425130844116, + "learning_rate": 8.466082374863844e-05, + "loss": 0.0792, + "step": 52810 + }, + { + "epoch": 3.455675498855087, + "grad_norm": 0.8701884746551514, + "learning_rate": 8.465420258390723e-05, + "loss": 0.0681, + "step": 52820 + }, + { + "epoch": 3.4563297350343474, + "grad_norm": 0.8877395987510681, + "learning_rate": 8.464758024950581e-05, + "loss": 0.0706, + "step": 52830 + }, + { + "epoch": 3.4569839712136083, + "grad_norm": 0.9851657152175903, + "learning_rate": 8.464095674565769e-05, + "loss": 0.0668, + "step": 52840 + }, + { + "epoch": 3.457638207392869, + "grad_norm": 1.3369970321655273, + "learning_rate": 8.46343320725864e-05, + "loss": 0.0743, + "step": 52850 + }, + { + "epoch": 3.4582924435721294, + "grad_norm": 1.114278793334961, + "learning_rate": 8.462770623051561e-05, + "loss": 0.0723, + "step": 52860 + }, + { + "epoch": 3.4589466797513904, + "grad_norm": 0.8872862458229065, + "learning_rate": 8.46210792196689e-05, + "loss": 0.0695, + "step": 52870 + }, + { + "epoch": 3.459600915930651, + "grad_norm": 0.9526658654212952, + "learning_rate": 8.461445104026997e-05, + "loss": 0.076, + "step": 52880 + }, + { + "epoch": 3.460255152109912, + "grad_norm": 0.7298262715339661, + "learning_rate": 8.460782169254254e-05, + "loss": 0.0723, + "step": 52890 + }, + { + "epoch": 3.4609093882891724, + "grad_norm": 0.845991313457489, + "learning_rate": 8.460119117671037e-05, + "loss": 0.0694, + "step": 52900 + }, + { + "epoch": 3.4615636244684334, + "grad_norm": 0.8948342800140381, + "learning_rate": 8.459455949299724e-05, + "loss": 0.0685, + "step": 52910 + }, + { + "epoch": 3.462217860647694, + "grad_norm": 0.7942617535591125, + "learning_rate": 8.458792664162702e-05, + "loss": 0.0754, + "step": 52920 + }, + { + "epoch": 3.4628720968269544, + "grad_norm": 0.959294319152832, + "learning_rate": 8.458129262282355e-05, + "loss": 0.0776, + "step": 52930 + }, + { + "epoch": 3.4635263330062154, + "grad_norm": 0.7469987273216248, + "learning_rate": 8.457465743681077e-05, + "loss": 0.0724, + "step": 52940 + }, + { + "epoch": 3.464180569185476, + "grad_norm": 0.8524947166442871, + "learning_rate": 8.456802108381261e-05, + "loss": 0.0758, + "step": 52950 + }, + { + "epoch": 3.4648348053647364, + "grad_norm": 0.8579651713371277, + "learning_rate": 8.45613835640531e-05, + "loss": 0.0835, + "step": 52960 + }, + { + "epoch": 3.4654890415439974, + "grad_norm": 0.9005305767059326, + "learning_rate": 8.455474487775625e-05, + "loss": 0.0741, + "step": 52970 + }, + { + "epoch": 3.466143277723258, + "grad_norm": 0.8470668792724609, + "learning_rate": 8.454810502514614e-05, + "loss": 0.0657, + "step": 52980 + }, + { + "epoch": 3.466797513902519, + "grad_norm": 0.7774618864059448, + "learning_rate": 8.454146400644687e-05, + "loss": 0.0734, + "step": 52990 + }, + { + "epoch": 3.4674517500817794, + "grad_norm": 0.7881149649620056, + "learning_rate": 8.453482182188259e-05, + "loss": 0.0763, + "step": 53000 + }, + { + "epoch": 3.4681059862610404, + "grad_norm": 0.9058419466018677, + "learning_rate": 8.452817847167753e-05, + "loss": 0.0701, + "step": 53010 + }, + { + "epoch": 3.468760222440301, + "grad_norm": 0.927578866481781, + "learning_rate": 8.452153395605587e-05, + "loss": 0.0646, + "step": 53020 + }, + { + "epoch": 3.4694144586195614, + "grad_norm": 0.8536058664321899, + "learning_rate": 8.451488827524192e-05, + "loss": 0.0701, + "step": 53030 + }, + { + "epoch": 3.4700686947988224, + "grad_norm": 0.7442119717597961, + "learning_rate": 8.450824142945997e-05, + "loss": 0.0696, + "step": 53040 + }, + { + "epoch": 3.470722930978083, + "grad_norm": 0.7530878186225891, + "learning_rate": 8.450159341893436e-05, + "loss": 0.0736, + "step": 53050 + }, + { + "epoch": 3.471377167157344, + "grad_norm": 0.8248975276947021, + "learning_rate": 8.449494424388951e-05, + "loss": 0.0658, + "step": 53060 + }, + { + "epoch": 3.4720314033366044, + "grad_norm": 0.973678708076477, + "learning_rate": 8.44882939045498e-05, + "loss": 0.0659, + "step": 53070 + }, + { + "epoch": 3.4726856395158654, + "grad_norm": 0.950872004032135, + "learning_rate": 8.448164240113972e-05, + "loss": 0.0722, + "step": 53080 + }, + { + "epoch": 3.473339875695126, + "grad_norm": 0.7618328332901001, + "learning_rate": 8.447498973388379e-05, + "loss": 0.0724, + "step": 53090 + }, + { + "epoch": 3.4739941118743864, + "grad_norm": 0.7460528612136841, + "learning_rate": 8.446833590300656e-05, + "loss": 0.0718, + "step": 53100 + }, + { + "epoch": 3.4746483480536474, + "grad_norm": 1.07584810256958, + "learning_rate": 8.446168090873257e-05, + "loss": 0.0778, + "step": 53110 + }, + { + "epoch": 3.475302584232908, + "grad_norm": 0.7845546007156372, + "learning_rate": 8.445502475128649e-05, + "loss": 0.0703, + "step": 53120 + }, + { + "epoch": 3.475956820412169, + "grad_norm": 0.9104862809181213, + "learning_rate": 8.444836743089294e-05, + "loss": 0.0711, + "step": 53130 + }, + { + "epoch": 3.4766110565914294, + "grad_norm": 0.8521603941917419, + "learning_rate": 8.444170894777665e-05, + "loss": 0.0751, + "step": 53140 + }, + { + "epoch": 3.4772652927706904, + "grad_norm": 1.0778313875198364, + "learning_rate": 8.443504930216237e-05, + "loss": 0.0772, + "step": 53150 + }, + { + "epoch": 3.477919528949951, + "grad_norm": 1.079459309577942, + "learning_rate": 8.442838849427486e-05, + "loss": 0.086, + "step": 53160 + }, + { + "epoch": 3.4785737651292115, + "grad_norm": 0.8837111592292786, + "learning_rate": 8.442172652433895e-05, + "loss": 0.0755, + "step": 53170 + }, + { + "epoch": 3.4792280013084724, + "grad_norm": 0.906924843788147, + "learning_rate": 8.441506339257949e-05, + "loss": 0.0708, + "step": 53180 + }, + { + "epoch": 3.479882237487733, + "grad_norm": 0.8300266861915588, + "learning_rate": 8.440839909922139e-05, + "loss": 0.0653, + "step": 53190 + }, + { + "epoch": 3.480536473666994, + "grad_norm": 0.9430700540542603, + "learning_rate": 8.440173364448958e-05, + "loss": 0.0642, + "step": 53200 + }, + { + "epoch": 3.4811907098462544, + "grad_norm": 0.9260400533676147, + "learning_rate": 8.439506702860902e-05, + "loss": 0.0705, + "step": 53210 + }, + { + "epoch": 3.4818449460255154, + "grad_norm": 0.8879568576812744, + "learning_rate": 8.438839925180476e-05, + "loss": 0.0817, + "step": 53220 + }, + { + "epoch": 3.482499182204776, + "grad_norm": 0.8787881731987, + "learning_rate": 8.438173031430185e-05, + "loss": 0.072, + "step": 53230 + }, + { + "epoch": 3.4831534183840365, + "grad_norm": 0.9385049343109131, + "learning_rate": 8.437506021632535e-05, + "loss": 0.0777, + "step": 53240 + }, + { + "epoch": 3.4838076545632974, + "grad_norm": 0.8981115818023682, + "learning_rate": 8.436838895810042e-05, + "loss": 0.0648, + "step": 53250 + }, + { + "epoch": 3.484461890742558, + "grad_norm": 0.8167151808738708, + "learning_rate": 8.436171653985223e-05, + "loss": 0.0831, + "step": 53260 + }, + { + "epoch": 3.485116126921819, + "grad_norm": 1.022867202758789, + "learning_rate": 8.4355042961806e-05, + "loss": 0.0701, + "step": 53270 + }, + { + "epoch": 3.4857703631010795, + "grad_norm": 0.6986026167869568, + "learning_rate": 8.434836822418697e-05, + "loss": 0.0672, + "step": 53280 + }, + { + "epoch": 3.4864245992803404, + "grad_norm": 0.8744747042655945, + "learning_rate": 8.434169232722043e-05, + "loss": 0.0703, + "step": 53290 + }, + { + "epoch": 3.487078835459601, + "grad_norm": 1.0003246068954468, + "learning_rate": 8.433501527113169e-05, + "loss": 0.0639, + "step": 53300 + }, + { + "epoch": 3.4877330716388615, + "grad_norm": 0.8735449314117432, + "learning_rate": 8.432833705614616e-05, + "loss": 0.0683, + "step": 53310 + }, + { + "epoch": 3.4883873078181225, + "grad_norm": 0.8451821804046631, + "learning_rate": 8.43216576824892e-05, + "loss": 0.0666, + "step": 53320 + }, + { + "epoch": 3.489041543997383, + "grad_norm": 0.8688971996307373, + "learning_rate": 8.43149771503863e-05, + "loss": 0.0644, + "step": 53330 + }, + { + "epoch": 3.489695780176644, + "grad_norm": 0.7318007349967957, + "learning_rate": 8.430829546006293e-05, + "loss": 0.0689, + "step": 53340 + }, + { + "epoch": 3.4903500163559045, + "grad_norm": 0.8060617446899414, + "learning_rate": 8.430161261174461e-05, + "loss": 0.0713, + "step": 53350 + }, + { + "epoch": 3.4910042525351654, + "grad_norm": 0.8988872170448303, + "learning_rate": 8.42949286056569e-05, + "loss": 0.0741, + "step": 53360 + }, + { + "epoch": 3.491658488714426, + "grad_norm": 1.0134598016738892, + "learning_rate": 8.42882434420254e-05, + "loss": 0.0698, + "step": 53370 + }, + { + "epoch": 3.4923127248936865, + "grad_norm": 0.8395692706108093, + "learning_rate": 8.428155712107577e-05, + "loss": 0.064, + "step": 53380 + }, + { + "epoch": 3.4929669610729475, + "grad_norm": 0.8911391496658325, + "learning_rate": 8.427486964303368e-05, + "loss": 0.0741, + "step": 53390 + }, + { + "epoch": 3.493621197252208, + "grad_norm": 0.7914614081382751, + "learning_rate": 8.426818100812486e-05, + "loss": 0.073, + "step": 53400 + }, + { + "epoch": 3.4942754334314685, + "grad_norm": 0.8505756855010986, + "learning_rate": 8.426149121657504e-05, + "loss": 0.0766, + "step": 53410 + }, + { + "epoch": 3.4949296696107295, + "grad_norm": 0.6521218419075012, + "learning_rate": 8.425480026861006e-05, + "loss": 0.0668, + "step": 53420 + }, + { + "epoch": 3.49558390578999, + "grad_norm": 0.7567178010940552, + "learning_rate": 8.424810816445571e-05, + "loss": 0.0648, + "step": 53430 + }, + { + "epoch": 3.496238141969251, + "grad_norm": 0.8685494065284729, + "learning_rate": 8.42414149043379e-05, + "loss": 0.0749, + "step": 53440 + }, + { + "epoch": 3.4968923781485115, + "grad_norm": 0.8376366496086121, + "learning_rate": 8.423472048848254e-05, + "loss": 0.0722, + "step": 53450 + }, + { + "epoch": 3.4975466143277725, + "grad_norm": 0.8271428346633911, + "learning_rate": 8.422802491711557e-05, + "loss": 0.0799, + "step": 53460 + }, + { + "epoch": 3.498200850507033, + "grad_norm": 0.9023700952529907, + "learning_rate": 8.4221328190463e-05, + "loss": 0.0633, + "step": 53470 + }, + { + "epoch": 3.4988550866862935, + "grad_norm": 0.8496003150939941, + "learning_rate": 8.421463030875085e-05, + "loss": 0.0744, + "step": 53480 + }, + { + "epoch": 3.4995093228655545, + "grad_norm": 0.7680093050003052, + "learning_rate": 8.420793127220521e-05, + "loss": 0.0715, + "step": 53490 + }, + { + "epoch": 3.500163559044815, + "grad_norm": 0.992078423500061, + "learning_rate": 8.420123108105215e-05, + "loss": 0.0807, + "step": 53500 + }, + { + "epoch": 3.500817795224076, + "grad_norm": 0.7784678936004639, + "learning_rate": 8.419452973551786e-05, + "loss": 0.0692, + "step": 53510 + }, + { + "epoch": 3.5014720314033365, + "grad_norm": 0.8641963601112366, + "learning_rate": 8.418782723582852e-05, + "loss": 0.0731, + "step": 53520 + }, + { + "epoch": 3.5021262675825975, + "grad_norm": 0.8115648627281189, + "learning_rate": 8.418112358221036e-05, + "loss": 0.076, + "step": 53530 + }, + { + "epoch": 3.502780503761858, + "grad_norm": 0.9229583144187927, + "learning_rate": 8.417441877488961e-05, + "loss": 0.0721, + "step": 53540 + }, + { + "epoch": 3.5034347399411185, + "grad_norm": 0.9231205582618713, + "learning_rate": 8.416771281409262e-05, + "loss": 0.0679, + "step": 53550 + }, + { + "epoch": 3.5040889761203795, + "grad_norm": 0.8276332020759583, + "learning_rate": 8.41610057000457e-05, + "loss": 0.0737, + "step": 53560 + }, + { + "epoch": 3.50474321229964, + "grad_norm": 0.75644451379776, + "learning_rate": 8.415429743297524e-05, + "loss": 0.0627, + "step": 53570 + }, + { + "epoch": 3.505397448478901, + "grad_norm": 0.7527329325675964, + "learning_rate": 8.41475880131077e-05, + "loss": 0.0754, + "step": 53580 + }, + { + "epoch": 3.5060516846581615, + "grad_norm": 0.8161507248878479, + "learning_rate": 8.414087744066947e-05, + "loss": 0.065, + "step": 53590 + }, + { + "epoch": 3.5067059208374225, + "grad_norm": 0.851140558719635, + "learning_rate": 8.413416571588713e-05, + "loss": 0.0655, + "step": 53600 + }, + { + "epoch": 3.507360157016683, + "grad_norm": 0.8948183059692383, + "learning_rate": 8.412745283898714e-05, + "loss": 0.0698, + "step": 53610 + }, + { + "epoch": 3.5080143931959435, + "grad_norm": 1.1371095180511475, + "learning_rate": 8.412073881019613e-05, + "loss": 0.0762, + "step": 53620 + }, + { + "epoch": 3.5086686293752045, + "grad_norm": 0.9154216647148132, + "learning_rate": 8.41140236297407e-05, + "loss": 0.0722, + "step": 53630 + }, + { + "epoch": 3.509322865554465, + "grad_norm": 0.8594346046447754, + "learning_rate": 8.41073072978475e-05, + "loss": 0.0718, + "step": 53640 + }, + { + "epoch": 3.509977101733726, + "grad_norm": 0.9689813852310181, + "learning_rate": 8.410058981474324e-05, + "loss": 0.0693, + "step": 53650 + }, + { + "epoch": 3.5106313379129865, + "grad_norm": 0.80068039894104, + "learning_rate": 8.409387118065464e-05, + "loss": 0.0758, + "step": 53660 + }, + { + "epoch": 3.5112855740922475, + "grad_norm": 0.8130351901054382, + "learning_rate": 8.408715139580846e-05, + "loss": 0.0785, + "step": 53670 + }, + { + "epoch": 3.511939810271508, + "grad_norm": 0.758581817150116, + "learning_rate": 8.408043046043154e-05, + "loss": 0.0636, + "step": 53680 + }, + { + "epoch": 3.5125940464507686, + "grad_norm": 0.955032467842102, + "learning_rate": 8.407370837475071e-05, + "loss": 0.0794, + "step": 53690 + }, + { + "epoch": 3.5132482826300295, + "grad_norm": 0.7809516191482544, + "learning_rate": 8.406698513899285e-05, + "loss": 0.0668, + "step": 53700 + }, + { + "epoch": 3.51390251880929, + "grad_norm": 0.8768904805183411, + "learning_rate": 8.406026075338489e-05, + "loss": 0.077, + "step": 53710 + }, + { + "epoch": 3.514556754988551, + "grad_norm": 1.029979944229126, + "learning_rate": 8.405353521815382e-05, + "loss": 0.0761, + "step": 53720 + }, + { + "epoch": 3.5152109911678115, + "grad_norm": 0.7512595057487488, + "learning_rate": 8.404680853352662e-05, + "loss": 0.0804, + "step": 53730 + }, + { + "epoch": 3.5158652273470725, + "grad_norm": 0.8705400824546814, + "learning_rate": 8.404008069973035e-05, + "loss": 0.0709, + "step": 53740 + }, + { + "epoch": 3.516519463526333, + "grad_norm": 0.9716213941574097, + "learning_rate": 8.403335171699209e-05, + "loss": 0.0821, + "step": 53750 + }, + { + "epoch": 3.5171736997055936, + "grad_norm": 0.7278077006340027, + "learning_rate": 8.402662158553894e-05, + "loss": 0.0783, + "step": 53760 + }, + { + "epoch": 3.5178279358848545, + "grad_norm": 0.8177416324615479, + "learning_rate": 8.401989030559807e-05, + "loss": 0.0697, + "step": 53770 + }, + { + "epoch": 3.518482172064115, + "grad_norm": 0.7998149991035461, + "learning_rate": 8.401315787739667e-05, + "loss": 0.0764, + "step": 53780 + }, + { + "epoch": 3.5191364082433756, + "grad_norm": 0.9496130347251892, + "learning_rate": 8.400642430116203e-05, + "loss": 0.0741, + "step": 53790 + }, + { + "epoch": 3.5197906444226366, + "grad_norm": 0.8604884743690491, + "learning_rate": 8.399968957712135e-05, + "loss": 0.0614, + "step": 53800 + }, + { + "epoch": 3.5204448806018975, + "grad_norm": 1.4663547277450562, + "learning_rate": 8.3992953705502e-05, + "loss": 0.0646, + "step": 53810 + }, + { + "epoch": 3.521099116781158, + "grad_norm": 0.8717848658561707, + "learning_rate": 8.39862166865313e-05, + "loss": 0.0814, + "step": 53820 + }, + { + "epoch": 3.5217533529604186, + "grad_norm": 0.8972777724266052, + "learning_rate": 8.397947852043666e-05, + "loss": 0.0727, + "step": 53830 + }, + { + "epoch": 3.5224075891396796, + "grad_norm": 1.0742758512496948, + "learning_rate": 8.39727392074455e-05, + "loss": 0.0723, + "step": 53840 + }, + { + "epoch": 3.52306182531894, + "grad_norm": 0.9516122937202454, + "learning_rate": 8.396599874778531e-05, + "loss": 0.0652, + "step": 53850 + }, + { + "epoch": 3.5237160614982006, + "grad_norm": 0.8731452226638794, + "learning_rate": 8.395925714168356e-05, + "loss": 0.0755, + "step": 53860 + }, + { + "epoch": 3.5243702976774616, + "grad_norm": 0.7560011148452759, + "learning_rate": 8.395251438936784e-05, + "loss": 0.0697, + "step": 53870 + }, + { + "epoch": 3.5250245338567225, + "grad_norm": 0.9268012642860413, + "learning_rate": 8.39457704910657e-05, + "loss": 0.0666, + "step": 53880 + }, + { + "epoch": 3.525678770035983, + "grad_norm": 0.8395716547966003, + "learning_rate": 8.393902544700478e-05, + "loss": 0.0705, + "step": 53890 + }, + { + "epoch": 3.5263330062152436, + "grad_norm": 0.9042119979858398, + "learning_rate": 8.393227925741276e-05, + "loss": 0.0776, + "step": 53900 + }, + { + "epoch": 3.5269872423945046, + "grad_norm": 0.8316322565078735, + "learning_rate": 8.392553192251731e-05, + "loss": 0.0697, + "step": 53910 + }, + { + "epoch": 3.527641478573765, + "grad_norm": 0.9326664209365845, + "learning_rate": 8.391878344254618e-05, + "loss": 0.0822, + "step": 53920 + }, + { + "epoch": 3.5282957147530256, + "grad_norm": 0.7888161540031433, + "learning_rate": 8.391203381772716e-05, + "loss": 0.071, + "step": 53930 + }, + { + "epoch": 3.5289499509322866, + "grad_norm": 1.0052454471588135, + "learning_rate": 8.390528304828807e-05, + "loss": 0.0663, + "step": 53940 + }, + { + "epoch": 3.529604187111547, + "grad_norm": 0.800828754901886, + "learning_rate": 8.389853113445676e-05, + "loss": 0.069, + "step": 53950 + }, + { + "epoch": 3.530258423290808, + "grad_norm": 0.9678196310997009, + "learning_rate": 8.38917780764611e-05, + "loss": 0.0734, + "step": 53960 + }, + { + "epoch": 3.5309126594700686, + "grad_norm": 0.9162442088127136, + "learning_rate": 8.388502387452906e-05, + "loss": 0.0679, + "step": 53970 + }, + { + "epoch": 3.5315668956493296, + "grad_norm": 0.7903563380241394, + "learning_rate": 8.38782685288886e-05, + "loss": 0.0633, + "step": 53980 + }, + { + "epoch": 3.53222113182859, + "grad_norm": 0.8106311559677124, + "learning_rate": 8.387151203976772e-05, + "loss": 0.0761, + "step": 53990 + }, + { + "epoch": 3.5328753680078506, + "grad_norm": 0.8316931128501892, + "learning_rate": 8.386475440739447e-05, + "loss": 0.0641, + "step": 54000 + }, + { + "epoch": 3.5335296041871116, + "grad_norm": 0.8786323666572571, + "learning_rate": 8.385799563199697e-05, + "loss": 0.0891, + "step": 54010 + }, + { + "epoch": 3.534183840366372, + "grad_norm": 0.9065492749214172, + "learning_rate": 8.385123571380331e-05, + "loss": 0.0743, + "step": 54020 + }, + { + "epoch": 3.534838076545633, + "grad_norm": 0.744517982006073, + "learning_rate": 8.384447465304166e-05, + "loss": 0.0679, + "step": 54030 + }, + { + "epoch": 3.5354923127248936, + "grad_norm": 0.8627418875694275, + "learning_rate": 8.383771244994023e-05, + "loss": 0.0755, + "step": 54040 + }, + { + "epoch": 3.5361465489041546, + "grad_norm": 0.8611176609992981, + "learning_rate": 8.383094910472728e-05, + "loss": 0.0785, + "step": 54050 + }, + { + "epoch": 3.536800785083415, + "grad_norm": 0.9356468319892883, + "learning_rate": 8.382418461763105e-05, + "loss": 0.0814, + "step": 54060 + }, + { + "epoch": 3.5374550212626756, + "grad_norm": 0.8152367472648621, + "learning_rate": 8.381741898887989e-05, + "loss": 0.0728, + "step": 54070 + }, + { + "epoch": 3.5381092574419366, + "grad_norm": 0.937050998210907, + "learning_rate": 8.381065221870214e-05, + "loss": 0.0698, + "step": 54080 + }, + { + "epoch": 3.538763493621197, + "grad_norm": 0.8761103749275208, + "learning_rate": 8.380388430732623e-05, + "loss": 0.0731, + "step": 54090 + }, + { + "epoch": 3.539417729800458, + "grad_norm": 0.7785167694091797, + "learning_rate": 8.379711525498055e-05, + "loss": 0.0663, + "step": 54100 + }, + { + "epoch": 3.5400719659797186, + "grad_norm": 0.7450352907180786, + "learning_rate": 8.37903450618936e-05, + "loss": 0.0678, + "step": 54110 + }, + { + "epoch": 3.5407262021589796, + "grad_norm": 0.8352435231208801, + "learning_rate": 8.378357372829391e-05, + "loss": 0.0692, + "step": 54120 + }, + { + "epoch": 3.54138043833824, + "grad_norm": 1.016081690788269, + "learning_rate": 8.377680125440997e-05, + "loss": 0.0641, + "step": 54130 + }, + { + "epoch": 3.5420346745175006, + "grad_norm": 0.9595414400100708, + "learning_rate": 8.377002764047042e-05, + "loss": 0.0721, + "step": 54140 + }, + { + "epoch": 3.5426889106967616, + "grad_norm": 1.0154024362564087, + "learning_rate": 8.376325288670386e-05, + "loss": 0.0712, + "step": 54150 + }, + { + "epoch": 3.543343146876022, + "grad_norm": 0.8880914449691772, + "learning_rate": 8.3756476993339e-05, + "loss": 0.0839, + "step": 54160 + }, + { + "epoch": 3.543997383055283, + "grad_norm": 1.0185810327529907, + "learning_rate": 8.374969996060447e-05, + "loss": 0.077, + "step": 54170 + }, + { + "epoch": 3.5446516192345436, + "grad_norm": 0.900940477848053, + "learning_rate": 8.374292178872907e-05, + "loss": 0.0718, + "step": 54180 + }, + { + "epoch": 3.5453058554138046, + "grad_norm": 0.9907526969909668, + "learning_rate": 8.373614247794157e-05, + "loss": 0.0685, + "step": 54190 + }, + { + "epoch": 3.545960091593065, + "grad_norm": 0.9706956148147583, + "learning_rate": 8.37293620284708e-05, + "loss": 0.0699, + "step": 54200 + }, + { + "epoch": 3.5466143277723257, + "grad_norm": 0.8409591913223267, + "learning_rate": 8.372258044054559e-05, + "loss": 0.0759, + "step": 54210 + }, + { + "epoch": 3.5472685639515866, + "grad_norm": 0.7982200980186462, + "learning_rate": 8.371579771439483e-05, + "loss": 0.0678, + "step": 54220 + }, + { + "epoch": 3.547922800130847, + "grad_norm": 0.8349514603614807, + "learning_rate": 8.37090138502475e-05, + "loss": 0.0714, + "step": 54230 + }, + { + "epoch": 3.5485770363101077, + "grad_norm": 0.7618018984794617, + "learning_rate": 8.370222884833254e-05, + "loss": 0.0592, + "step": 54240 + }, + { + "epoch": 3.5492312724893686, + "grad_norm": 1.4026459455490112, + "learning_rate": 8.369544270887897e-05, + "loss": 0.0804, + "step": 54250 + }, + { + "epoch": 3.5498855086686296, + "grad_norm": 0.8770639896392822, + "learning_rate": 8.368865543211584e-05, + "loss": 0.0781, + "step": 54260 + }, + { + "epoch": 3.55053974484789, + "grad_norm": 1.1316838264465332, + "learning_rate": 8.368186701827223e-05, + "loss": 0.0718, + "step": 54270 + }, + { + "epoch": 3.5511939810271507, + "grad_norm": 0.9863431453704834, + "learning_rate": 8.367507746757728e-05, + "loss": 0.0842, + "step": 54280 + }, + { + "epoch": 3.5518482172064116, + "grad_norm": 0.7806870341300964, + "learning_rate": 8.366828678026016e-05, + "loss": 0.0717, + "step": 54290 + }, + { + "epoch": 3.552502453385672, + "grad_norm": 0.8638490438461304, + "learning_rate": 8.366149495655004e-05, + "loss": 0.073, + "step": 54300 + }, + { + "epoch": 3.5531566895649327, + "grad_norm": 0.9356234669685364, + "learning_rate": 8.36547019966762e-05, + "loss": 0.0662, + "step": 54310 + }, + { + "epoch": 3.5538109257441937, + "grad_norm": 0.7376917600631714, + "learning_rate": 8.36479079008679e-05, + "loss": 0.0667, + "step": 54320 + }, + { + "epoch": 3.5544651619234546, + "grad_norm": 0.7717798948287964, + "learning_rate": 8.364111266935446e-05, + "loss": 0.0645, + "step": 54330 + }, + { + "epoch": 3.555119398102715, + "grad_norm": 0.8551239371299744, + "learning_rate": 8.363431630236525e-05, + "loss": 0.0726, + "step": 54340 + }, + { + "epoch": 3.5557736342819757, + "grad_norm": 0.7663902640342712, + "learning_rate": 8.362751880012965e-05, + "loss": 0.076, + "step": 54350 + }, + { + "epoch": 3.5564278704612367, + "grad_norm": 1.0010058879852295, + "learning_rate": 8.362072016287709e-05, + "loss": 0.0705, + "step": 54360 + }, + { + "epoch": 3.557082106640497, + "grad_norm": 0.7997660040855408, + "learning_rate": 8.361392039083706e-05, + "loss": 0.0669, + "step": 54370 + }, + { + "epoch": 3.5577363428197577, + "grad_norm": 0.6869263648986816, + "learning_rate": 8.360711948423906e-05, + "loss": 0.0673, + "step": 54380 + }, + { + "epoch": 3.5583905789990187, + "grad_norm": 1.1054303646087646, + "learning_rate": 8.360031744331264e-05, + "loss": 0.0772, + "step": 54390 + }, + { + "epoch": 3.559044815178279, + "grad_norm": 0.9529674649238586, + "learning_rate": 8.359351426828739e-05, + "loss": 0.067, + "step": 54400 + }, + { + "epoch": 3.55969905135754, + "grad_norm": 0.6143543124198914, + "learning_rate": 8.358670995939293e-05, + "loss": 0.0758, + "step": 54410 + }, + { + "epoch": 3.5603532875368007, + "grad_norm": 0.9302859306335449, + "learning_rate": 8.357990451685892e-05, + "loss": 0.0653, + "step": 54420 + }, + { + "epoch": 3.5610075237160617, + "grad_norm": 0.8283950090408325, + "learning_rate": 8.357309794091507e-05, + "loss": 0.0701, + "step": 54430 + }, + { + "epoch": 3.561661759895322, + "grad_norm": 0.7890059947967529, + "learning_rate": 8.356629023179111e-05, + "loss": 0.0711, + "step": 54440 + }, + { + "epoch": 3.5623159960745827, + "grad_norm": 0.8997254371643066, + "learning_rate": 8.355948138971683e-05, + "loss": 0.0659, + "step": 54450 + }, + { + "epoch": 3.5629702322538437, + "grad_norm": 1.2762157917022705, + "learning_rate": 8.355267141492205e-05, + "loss": 0.0692, + "step": 54460 + }, + { + "epoch": 3.563624468433104, + "grad_norm": 0.8240480422973633, + "learning_rate": 8.354586030763659e-05, + "loss": 0.064, + "step": 54470 + }, + { + "epoch": 3.564278704612365, + "grad_norm": 0.8595862984657288, + "learning_rate": 8.353904806809039e-05, + "loss": 0.0612, + "step": 54480 + }, + { + "epoch": 3.5649329407916257, + "grad_norm": 1.0054336786270142, + "learning_rate": 8.353223469651335e-05, + "loss": 0.075, + "step": 54490 + }, + { + "epoch": 3.5655871769708867, + "grad_norm": 0.9695059061050415, + "learning_rate": 8.352542019313544e-05, + "loss": 0.0723, + "step": 54500 + }, + { + "epoch": 3.566241413150147, + "grad_norm": 0.8478056192398071, + "learning_rate": 8.351860455818667e-05, + "loss": 0.0661, + "step": 54510 + }, + { + "epoch": 3.5668956493294077, + "grad_norm": 0.7672955989837646, + "learning_rate": 8.35117877918971e-05, + "loss": 0.0677, + "step": 54520 + }, + { + "epoch": 3.5675498855086687, + "grad_norm": 0.8970052003860474, + "learning_rate": 8.350496989449681e-05, + "loss": 0.0616, + "step": 54530 + }, + { + "epoch": 3.568204121687929, + "grad_norm": 0.7992386221885681, + "learning_rate": 8.34981508662159e-05, + "loss": 0.0715, + "step": 54540 + }, + { + "epoch": 3.56885835786719, + "grad_norm": 0.8448268175125122, + "learning_rate": 8.349133070728456e-05, + "loss": 0.0611, + "step": 54550 + }, + { + "epoch": 3.5695125940464507, + "grad_norm": 0.821503758430481, + "learning_rate": 8.348450941793298e-05, + "loss": 0.0661, + "step": 54560 + }, + { + "epoch": 3.5701668302257117, + "grad_norm": 0.9001857042312622, + "learning_rate": 8.347768699839139e-05, + "loss": 0.0688, + "step": 54570 + }, + { + "epoch": 3.570821066404972, + "grad_norm": 0.891309380531311, + "learning_rate": 8.347086344889006e-05, + "loss": 0.0628, + "step": 54580 + }, + { + "epoch": 3.5714753025842327, + "grad_norm": 0.8133125305175781, + "learning_rate": 8.34640387696593e-05, + "loss": 0.0648, + "step": 54590 + }, + { + "epoch": 3.5721295387634937, + "grad_norm": 1.0169897079467773, + "learning_rate": 8.345721296092947e-05, + "loss": 0.0794, + "step": 54600 + }, + { + "epoch": 3.5727837749427542, + "grad_norm": 0.8700054883956909, + "learning_rate": 8.345038602293097e-05, + "loss": 0.0629, + "step": 54610 + }, + { + "epoch": 3.573438011122015, + "grad_norm": 0.9666095972061157, + "learning_rate": 8.344355795589421e-05, + "loss": 0.0728, + "step": 54620 + }, + { + "epoch": 3.5740922473012757, + "grad_norm": 0.785966157913208, + "learning_rate": 8.343672876004965e-05, + "loss": 0.0687, + "step": 54630 + }, + { + "epoch": 3.5747464834805367, + "grad_norm": 0.988416314125061, + "learning_rate": 8.342989843562782e-05, + "loss": 0.0689, + "step": 54640 + }, + { + "epoch": 3.575400719659797, + "grad_norm": 0.9573351740837097, + "learning_rate": 8.342306698285923e-05, + "loss": 0.0716, + "step": 54650 + }, + { + "epoch": 3.5760549558390577, + "grad_norm": 0.8525318503379822, + "learning_rate": 8.341623440197448e-05, + "loss": 0.0748, + "step": 54660 + }, + { + "epoch": 3.5767091920183187, + "grad_norm": 1.0283527374267578, + "learning_rate": 8.340940069320418e-05, + "loss": 0.0686, + "step": 54670 + }, + { + "epoch": 3.5773634281975792, + "grad_norm": 0.908446192741394, + "learning_rate": 8.3402565856779e-05, + "loss": 0.0638, + "step": 54680 + }, + { + "epoch": 3.5780176643768398, + "grad_norm": 0.984634518623352, + "learning_rate": 8.339572989292961e-05, + "loss": 0.0744, + "step": 54690 + }, + { + "epoch": 3.5786719005561007, + "grad_norm": 1.0757420063018799, + "learning_rate": 8.338889280188674e-05, + "loss": 0.0748, + "step": 54700 + }, + { + "epoch": 3.5793261367353617, + "grad_norm": 0.6891494393348694, + "learning_rate": 8.338205458388118e-05, + "loss": 0.0737, + "step": 54710 + }, + { + "epoch": 3.5799803729146222, + "grad_norm": 0.9240961074829102, + "learning_rate": 8.337521523914375e-05, + "loss": 0.0848, + "step": 54720 + }, + { + "epoch": 3.5806346090938828, + "grad_norm": 0.8011013269424438, + "learning_rate": 8.336837476790526e-05, + "loss": 0.063, + "step": 54730 + }, + { + "epoch": 3.5812888452731437, + "grad_norm": 0.9099512696266174, + "learning_rate": 8.336153317039662e-05, + "loss": 0.0673, + "step": 54740 + }, + { + "epoch": 3.5819430814524043, + "grad_norm": 0.8403385281562805, + "learning_rate": 8.335469044684872e-05, + "loss": 0.072, + "step": 54750 + }, + { + "epoch": 3.582597317631665, + "grad_norm": 0.8340772986412048, + "learning_rate": 8.334784659749255e-05, + "loss": 0.0665, + "step": 54760 + }, + { + "epoch": 3.5832515538109257, + "grad_norm": 0.8868764042854309, + "learning_rate": 8.334100162255912e-05, + "loss": 0.0762, + "step": 54770 + }, + { + "epoch": 3.5839057899901867, + "grad_norm": 0.8480395078659058, + "learning_rate": 8.33341555222794e-05, + "loss": 0.0677, + "step": 54780 + }, + { + "epoch": 3.5845600261694472, + "grad_norm": 1.0152699947357178, + "learning_rate": 8.332730829688456e-05, + "loss": 0.0769, + "step": 54790 + }, + { + "epoch": 3.5852142623487078, + "grad_norm": 1.082312822341919, + "learning_rate": 8.332045994660563e-05, + "loss": 0.0708, + "step": 54800 + }, + { + "epoch": 3.5858684985279687, + "grad_norm": 0.8725629448890686, + "learning_rate": 8.33136104716738e-05, + "loss": 0.0608, + "step": 54810 + }, + { + "epoch": 3.5865227347072293, + "grad_norm": 0.7891134023666382, + "learning_rate": 8.330675987232024e-05, + "loss": 0.0615, + "step": 54820 + }, + { + "epoch": 3.58717697088649, + "grad_norm": 1.0689077377319336, + "learning_rate": 8.32999081487762e-05, + "loss": 0.0745, + "step": 54830 + }, + { + "epoch": 3.5878312070657508, + "grad_norm": 0.77810138463974, + "learning_rate": 8.329305530127291e-05, + "loss": 0.0666, + "step": 54840 + }, + { + "epoch": 3.5884854432450113, + "grad_norm": 1.0314850807189941, + "learning_rate": 8.32862013300417e-05, + "loss": 0.0775, + "step": 54850 + }, + { + "epoch": 3.5891396794242723, + "grad_norm": 0.9134453535079956, + "learning_rate": 8.32793462353139e-05, + "loss": 0.0732, + "step": 54860 + }, + { + "epoch": 3.589793915603533, + "grad_norm": 0.786392331123352, + "learning_rate": 8.32724900173209e-05, + "loss": 0.0733, + "step": 54870 + }, + { + "epoch": 3.5904481517827938, + "grad_norm": 0.7125436663627625, + "learning_rate": 8.326563267629408e-05, + "loss": 0.0712, + "step": 54880 + }, + { + "epoch": 3.5911023879620543, + "grad_norm": 0.9112322330474854, + "learning_rate": 8.325877421246491e-05, + "loss": 0.0807, + "step": 54890 + }, + { + "epoch": 3.591756624141315, + "grad_norm": 0.8013171553611755, + "learning_rate": 8.325191462606491e-05, + "loss": 0.0722, + "step": 54900 + }, + { + "epoch": 3.5924108603205758, + "grad_norm": 0.7827738523483276, + "learning_rate": 8.324505391732557e-05, + "loss": 0.0701, + "step": 54910 + }, + { + "epoch": 3.5930650964998363, + "grad_norm": 0.8412222266197205, + "learning_rate": 8.323819208647847e-05, + "loss": 0.0771, + "step": 54920 + }, + { + "epoch": 3.5937193326790973, + "grad_norm": 0.9452791213989258, + "learning_rate": 8.323132913375522e-05, + "loss": 0.0665, + "step": 54930 + }, + { + "epoch": 3.594373568858358, + "grad_norm": 0.7313148975372314, + "learning_rate": 8.322446505938746e-05, + "loss": 0.0686, + "step": 54940 + }, + { + "epoch": 3.5950278050376188, + "grad_norm": 0.7917100787162781, + "learning_rate": 8.321759986360687e-05, + "loss": 0.0706, + "step": 54950 + }, + { + "epoch": 3.5956820412168793, + "grad_norm": 1.0193794965744019, + "learning_rate": 8.321073354664516e-05, + "loss": 0.0816, + "step": 54960 + }, + { + "epoch": 3.59633627739614, + "grad_norm": 0.9962078332901001, + "learning_rate": 8.32038661087341e-05, + "loss": 0.0657, + "step": 54970 + }, + { + "epoch": 3.596990513575401, + "grad_norm": 0.8794001340866089, + "learning_rate": 8.319699755010549e-05, + "loss": 0.076, + "step": 54980 + }, + { + "epoch": 3.5976447497546613, + "grad_norm": 0.8249982595443726, + "learning_rate": 8.319012787099115e-05, + "loss": 0.0815, + "step": 54990 + }, + { + "epoch": 3.5982989859339223, + "grad_norm": 0.9822542667388916, + "learning_rate": 8.318325707162293e-05, + "loss": 0.0676, + "step": 55000 + }, + { + "epoch": 3.598953222113183, + "grad_norm": 0.8483203053474426, + "learning_rate": 8.317638515223277e-05, + "loss": 0.0686, + "step": 55010 + }, + { + "epoch": 3.5996074582924438, + "grad_norm": 0.8871238827705383, + "learning_rate": 8.31695121130526e-05, + "loss": 0.0722, + "step": 55020 + }, + { + "epoch": 3.6002616944717043, + "grad_norm": 1.047977089881897, + "learning_rate": 8.31626379543144e-05, + "loss": 0.0836, + "step": 55030 + }, + { + "epoch": 3.600915930650965, + "grad_norm": 0.9134694933891296, + "learning_rate": 8.31557626762502e-05, + "loss": 0.0733, + "step": 55040 + }, + { + "epoch": 3.601570166830226, + "grad_norm": 0.9001691341400146, + "learning_rate": 8.314888627909208e-05, + "loss": 0.0687, + "step": 55050 + }, + { + "epoch": 3.6022244030094863, + "grad_norm": 0.8247069716453552, + "learning_rate": 8.31420087630721e-05, + "loss": 0.0676, + "step": 55060 + }, + { + "epoch": 3.6028786391887473, + "grad_norm": 0.8471789360046387, + "learning_rate": 8.313513012842238e-05, + "loss": 0.0694, + "step": 55070 + }, + { + "epoch": 3.603532875368008, + "grad_norm": 0.7431899905204773, + "learning_rate": 8.312825037537513e-05, + "loss": 0.0741, + "step": 55080 + }, + { + "epoch": 3.604187111547269, + "grad_norm": 0.9894888997077942, + "learning_rate": 8.312136950416256e-05, + "loss": 0.0754, + "step": 55090 + }, + { + "epoch": 3.6048413477265293, + "grad_norm": 0.8959627747535706, + "learning_rate": 8.311448751501689e-05, + "loss": 0.0798, + "step": 55100 + }, + { + "epoch": 3.60549558390579, + "grad_norm": 0.7383559942245483, + "learning_rate": 8.310760440817043e-05, + "loss": 0.0726, + "step": 55110 + }, + { + "epoch": 3.606149820085051, + "grad_norm": 0.8835324645042419, + "learning_rate": 8.31007201838555e-05, + "loss": 0.0751, + "step": 55120 + }, + { + "epoch": 3.6068040562643113, + "grad_norm": 0.8275827169418335, + "learning_rate": 8.309383484230446e-05, + "loss": 0.064, + "step": 55130 + }, + { + "epoch": 3.607458292443572, + "grad_norm": 0.8732197880744934, + "learning_rate": 8.308694838374969e-05, + "loss": 0.0656, + "step": 55140 + }, + { + "epoch": 3.608112528622833, + "grad_norm": 1.2900716066360474, + "learning_rate": 8.308006080842362e-05, + "loss": 0.0748, + "step": 55150 + }, + { + "epoch": 3.608766764802094, + "grad_norm": 0.8810641169548035, + "learning_rate": 8.307317211655877e-05, + "loss": 0.07, + "step": 55160 + }, + { + "epoch": 3.6094210009813543, + "grad_norm": 0.9505282640457153, + "learning_rate": 8.30662823083876e-05, + "loss": 0.0701, + "step": 55170 + }, + { + "epoch": 3.610075237160615, + "grad_norm": 1.0817748308181763, + "learning_rate": 8.30593913841427e-05, + "loss": 0.0761, + "step": 55180 + }, + { + "epoch": 3.610729473339876, + "grad_norm": 0.8153062462806702, + "learning_rate": 8.305249934405664e-05, + "loss": 0.0643, + "step": 55190 + }, + { + "epoch": 3.6113837095191363, + "grad_norm": 0.8734539747238159, + "learning_rate": 8.304560618836204e-05, + "loss": 0.0721, + "step": 55200 + }, + { + "epoch": 3.612037945698397, + "grad_norm": 0.8349213004112244, + "learning_rate": 8.303871191729156e-05, + "loss": 0.0701, + "step": 55210 + }, + { + "epoch": 3.612692181877658, + "grad_norm": 0.8556681871414185, + "learning_rate": 8.303181653107791e-05, + "loss": 0.064, + "step": 55220 + }, + { + "epoch": 3.613346418056919, + "grad_norm": 1.0858343839645386, + "learning_rate": 8.302492002995383e-05, + "loss": 0.0683, + "step": 55230 + }, + { + "epoch": 3.6140006542361793, + "grad_norm": 0.6986839175224304, + "learning_rate": 8.301802241415209e-05, + "loss": 0.0651, + "step": 55240 + }, + { + "epoch": 3.61465489041544, + "grad_norm": 0.9676486849784851, + "learning_rate": 8.301112368390548e-05, + "loss": 0.067, + "step": 55250 + }, + { + "epoch": 3.615309126594701, + "grad_norm": 1.1534702777862549, + "learning_rate": 8.300422383944688e-05, + "loss": 0.0665, + "step": 55260 + }, + { + "epoch": 3.6159633627739614, + "grad_norm": 0.9202556014060974, + "learning_rate": 8.299732288100918e-05, + "loss": 0.0648, + "step": 55270 + }, + { + "epoch": 3.616617598953222, + "grad_norm": 0.9827196598052979, + "learning_rate": 8.299042080882528e-05, + "loss": 0.0777, + "step": 55280 + }, + { + "epoch": 3.617271835132483, + "grad_norm": 0.9134148359298706, + "learning_rate": 8.298351762312816e-05, + "loss": 0.0693, + "step": 55290 + }, + { + "epoch": 3.6179260713117434, + "grad_norm": 0.7244698405265808, + "learning_rate": 8.297661332415083e-05, + "loss": 0.0672, + "step": 55300 + }, + { + "epoch": 3.6185803074910043, + "grad_norm": 0.9606476426124573, + "learning_rate": 8.296970791212631e-05, + "loss": 0.0731, + "step": 55310 + }, + { + "epoch": 3.619234543670265, + "grad_norm": 0.9693164825439453, + "learning_rate": 8.296280138728768e-05, + "loss": 0.07, + "step": 55320 + }, + { + "epoch": 3.619888779849526, + "grad_norm": 0.8021335005760193, + "learning_rate": 8.295589374986804e-05, + "loss": 0.07, + "step": 55330 + }, + { + "epoch": 3.6205430160287864, + "grad_norm": 1.0578341484069824, + "learning_rate": 8.294898500010056e-05, + "loss": 0.0635, + "step": 55340 + }, + { + "epoch": 3.621197252208047, + "grad_norm": 0.8854939341545105, + "learning_rate": 8.294207513821845e-05, + "loss": 0.0651, + "step": 55350 + }, + { + "epoch": 3.621851488387308, + "grad_norm": 0.8481228351593018, + "learning_rate": 8.293516416445488e-05, + "loss": 0.0743, + "step": 55360 + }, + { + "epoch": 3.6225057245665684, + "grad_norm": 0.8567721247673035, + "learning_rate": 8.292825207904316e-05, + "loss": 0.0671, + "step": 55370 + }, + { + "epoch": 3.6231599607458294, + "grad_norm": 0.7755191922187805, + "learning_rate": 8.292133888221659e-05, + "loss": 0.0683, + "step": 55380 + }, + { + "epoch": 3.62381419692509, + "grad_norm": 1.062867522239685, + "learning_rate": 8.291442457420846e-05, + "loss": 0.0778, + "step": 55390 + }, + { + "epoch": 3.624468433104351, + "grad_norm": 0.8446351885795593, + "learning_rate": 8.290750915525219e-05, + "loss": 0.0645, + "step": 55400 + }, + { + "epoch": 3.6251226692836114, + "grad_norm": 0.8222762942314148, + "learning_rate": 8.290059262558119e-05, + "loss": 0.0755, + "step": 55410 + }, + { + "epoch": 3.625776905462872, + "grad_norm": 0.9399179220199585, + "learning_rate": 8.28936749854289e-05, + "loss": 0.0672, + "step": 55420 + }, + { + "epoch": 3.626431141642133, + "grad_norm": 0.7565219402313232, + "learning_rate": 8.288675623502881e-05, + "loss": 0.065, + "step": 55430 + }, + { + "epoch": 3.6270853778213934, + "grad_norm": 0.8849060535430908, + "learning_rate": 8.287983637461447e-05, + "loss": 0.0748, + "step": 55440 + }, + { + "epoch": 3.6277396140006544, + "grad_norm": 0.8676406145095825, + "learning_rate": 8.28729154044194e-05, + "loss": 0.0742, + "step": 55450 + }, + { + "epoch": 3.628393850179915, + "grad_norm": 0.7463305592536926, + "learning_rate": 8.286599332467722e-05, + "loss": 0.0722, + "step": 55460 + }, + { + "epoch": 3.629048086359176, + "grad_norm": 0.9079608917236328, + "learning_rate": 8.285907013562158e-05, + "loss": 0.0723, + "step": 55470 + }, + { + "epoch": 3.6297023225384364, + "grad_norm": 0.9543100595474243, + "learning_rate": 8.285214583748616e-05, + "loss": 0.0726, + "step": 55480 + }, + { + "epoch": 3.630356558717697, + "grad_norm": 0.9939130544662476, + "learning_rate": 8.284522043050463e-05, + "loss": 0.0679, + "step": 55490 + }, + { + "epoch": 3.631010794896958, + "grad_norm": 0.9271246194839478, + "learning_rate": 8.28382939149108e-05, + "loss": 0.0642, + "step": 55500 + }, + { + "epoch": 3.6316650310762184, + "grad_norm": 0.9283300042152405, + "learning_rate": 8.283136629093841e-05, + "loss": 0.067, + "step": 55510 + }, + { + "epoch": 3.6323192672554794, + "grad_norm": 0.9301807284355164, + "learning_rate": 8.28244375588213e-05, + "loss": 0.07, + "step": 55520 + }, + { + "epoch": 3.63297350343474, + "grad_norm": 0.9366194605827332, + "learning_rate": 8.281750771879335e-05, + "loss": 0.0707, + "step": 55530 + }, + { + "epoch": 3.633627739614001, + "grad_norm": 0.9586436748504639, + "learning_rate": 8.281057677108844e-05, + "loss": 0.0641, + "step": 55540 + }, + { + "epoch": 3.6342819757932614, + "grad_norm": 0.8174638748168945, + "learning_rate": 8.280364471594052e-05, + "loss": 0.0686, + "step": 55550 + }, + { + "epoch": 3.634936211972522, + "grad_norm": 0.9204336404800415, + "learning_rate": 8.279671155358355e-05, + "loss": 0.066, + "step": 55560 + }, + { + "epoch": 3.635590448151783, + "grad_norm": 0.7173357009887695, + "learning_rate": 8.278977728425157e-05, + "loss": 0.0668, + "step": 55570 + }, + { + "epoch": 3.6362446843310434, + "grad_norm": 0.8251804113388062, + "learning_rate": 8.27828419081786e-05, + "loss": 0.0689, + "step": 55580 + }, + { + "epoch": 3.636898920510304, + "grad_norm": 0.796325147151947, + "learning_rate": 8.277590542559875e-05, + "loss": 0.0657, + "step": 55590 + }, + { + "epoch": 3.637553156689565, + "grad_norm": 0.8071014881134033, + "learning_rate": 8.276896783674612e-05, + "loss": 0.0692, + "step": 55600 + }, + { + "epoch": 3.638207392868826, + "grad_norm": 0.8185661435127258, + "learning_rate": 8.27620291418549e-05, + "loss": 0.0657, + "step": 55610 + }, + { + "epoch": 3.6388616290480864, + "grad_norm": 0.800986111164093, + "learning_rate": 8.275508934115927e-05, + "loss": 0.0719, + "step": 55620 + }, + { + "epoch": 3.639515865227347, + "grad_norm": 0.9986090660095215, + "learning_rate": 8.274814843489346e-05, + "loss": 0.0764, + "step": 55630 + }, + { + "epoch": 3.640170101406608, + "grad_norm": 0.688541829586029, + "learning_rate": 8.274120642329178e-05, + "loss": 0.0721, + "step": 55640 + }, + { + "epoch": 3.6408243375858684, + "grad_norm": 1.0835293531417847, + "learning_rate": 8.273426330658849e-05, + "loss": 0.0819, + "step": 55650 + }, + { + "epoch": 3.641478573765129, + "grad_norm": 0.7599365711212158, + "learning_rate": 8.272731908501798e-05, + "loss": 0.075, + "step": 55660 + }, + { + "epoch": 3.64213280994439, + "grad_norm": 1.0994549989700317, + "learning_rate": 8.272037375881461e-05, + "loss": 0.0601, + "step": 55670 + }, + { + "epoch": 3.642787046123651, + "grad_norm": 0.8615381121635437, + "learning_rate": 8.27134273282128e-05, + "loss": 0.0733, + "step": 55680 + }, + { + "epoch": 3.6434412823029114, + "grad_norm": 1.0048024654388428, + "learning_rate": 8.270647979344706e-05, + "loss": 0.0808, + "step": 55690 + }, + { + "epoch": 3.644095518482172, + "grad_norm": 1.1392261981964111, + "learning_rate": 8.269953115475183e-05, + "loss": 0.0863, + "step": 55700 + }, + { + "epoch": 3.644749754661433, + "grad_norm": 1.387350082397461, + "learning_rate": 8.269258141236167e-05, + "loss": 0.083, + "step": 55710 + }, + { + "epoch": 3.6454039908406934, + "grad_norm": 0.8560718894004822, + "learning_rate": 8.268563056651115e-05, + "loss": 0.0751, + "step": 55720 + }, + { + "epoch": 3.646058227019954, + "grad_norm": 0.892548143863678, + "learning_rate": 8.267867861743488e-05, + "loss": 0.0709, + "step": 55730 + }, + { + "epoch": 3.646712463199215, + "grad_norm": 0.8326752185821533, + "learning_rate": 8.267172556536748e-05, + "loss": 0.0713, + "step": 55740 + }, + { + "epoch": 3.647366699378476, + "grad_norm": 0.7968476414680481, + "learning_rate": 8.26647714105437e-05, + "loss": 0.0821, + "step": 55750 + }, + { + "epoch": 3.6480209355577364, + "grad_norm": 0.8511175513267517, + "learning_rate": 8.265781615319818e-05, + "loss": 0.0745, + "step": 55760 + }, + { + "epoch": 3.648675171736997, + "grad_norm": 0.9187440872192383, + "learning_rate": 8.265085979356573e-05, + "loss": 0.0699, + "step": 55770 + }, + { + "epoch": 3.649329407916258, + "grad_norm": 0.8875986933708191, + "learning_rate": 8.264390233188113e-05, + "loss": 0.0783, + "step": 55780 + }, + { + "epoch": 3.6499836440955185, + "grad_norm": 0.9916920065879822, + "learning_rate": 8.263694376837923e-05, + "loss": 0.0637, + "step": 55790 + }, + { + "epoch": 3.650637880274779, + "grad_norm": 0.9589375853538513, + "learning_rate": 8.262998410329486e-05, + "loss": 0.0643, + "step": 55800 + }, + { + "epoch": 3.65129211645404, + "grad_norm": 0.9402400851249695, + "learning_rate": 8.262302333686296e-05, + "loss": 0.0665, + "step": 55810 + }, + { + "epoch": 3.6519463526333005, + "grad_norm": 0.9499921798706055, + "learning_rate": 8.261606146931846e-05, + "loss": 0.08, + "step": 55820 + }, + { + "epoch": 3.6526005888125614, + "grad_norm": 1.1458688974380493, + "learning_rate": 8.260909850089636e-05, + "loss": 0.0697, + "step": 55830 + }, + { + "epoch": 3.653254824991822, + "grad_norm": 0.8103237152099609, + "learning_rate": 8.260213443183167e-05, + "loss": 0.0725, + "step": 55840 + }, + { + "epoch": 3.653909061171083, + "grad_norm": 0.7244288921356201, + "learning_rate": 8.259516926235942e-05, + "loss": 0.0653, + "step": 55850 + }, + { + "epoch": 3.6545632973503435, + "grad_norm": 0.8501254320144653, + "learning_rate": 8.258820299271475e-05, + "loss": 0.0662, + "step": 55860 + }, + { + "epoch": 3.655217533529604, + "grad_norm": 0.9807097911834717, + "learning_rate": 8.258123562313274e-05, + "loss": 0.0734, + "step": 55870 + }, + { + "epoch": 3.655871769708865, + "grad_norm": 0.9171924591064453, + "learning_rate": 8.257426715384859e-05, + "loss": 0.0711, + "step": 55880 + }, + { + "epoch": 3.6565260058881255, + "grad_norm": 0.7962984442710876, + "learning_rate": 8.256729758509748e-05, + "loss": 0.0753, + "step": 55890 + }, + { + "epoch": 3.6571802420673865, + "grad_norm": 0.7744097709655762, + "learning_rate": 8.256032691711469e-05, + "loss": 0.075, + "step": 55900 + }, + { + "epoch": 3.657834478246647, + "grad_norm": 0.8903494477272034, + "learning_rate": 8.255335515013545e-05, + "loss": 0.0681, + "step": 55910 + }, + { + "epoch": 3.658488714425908, + "grad_norm": 0.8612160682678223, + "learning_rate": 8.25463822843951e-05, + "loss": 0.0739, + "step": 55920 + }, + { + "epoch": 3.6591429506051685, + "grad_norm": 1.115964651107788, + "learning_rate": 8.253940832012901e-05, + "loss": 0.0776, + "step": 55930 + }, + { + "epoch": 3.659797186784429, + "grad_norm": 0.90339195728302, + "learning_rate": 8.253243325757255e-05, + "loss": 0.0724, + "step": 55940 + }, + { + "epoch": 3.66045142296369, + "grad_norm": 0.8702420592308044, + "learning_rate": 8.252545709696114e-05, + "loss": 0.0617, + "step": 55950 + }, + { + "epoch": 3.6611056591429505, + "grad_norm": 0.7411156892776489, + "learning_rate": 8.251847983853025e-05, + "loss": 0.0724, + "step": 55960 + }, + { + "epoch": 3.6617598953222115, + "grad_norm": 0.856332540512085, + "learning_rate": 8.251150148251538e-05, + "loss": 0.0616, + "step": 55970 + }, + { + "epoch": 3.662414131501472, + "grad_norm": 0.808392345905304, + "learning_rate": 8.250452202915209e-05, + "loss": 0.0727, + "step": 55980 + }, + { + "epoch": 3.663068367680733, + "grad_norm": 0.7766302227973938, + "learning_rate": 8.249754147867592e-05, + "loss": 0.0691, + "step": 55990 + }, + { + "epoch": 3.6637226038599935, + "grad_norm": 0.782347559928894, + "learning_rate": 8.24905598313225e-05, + "loss": 0.0603, + "step": 56000 + }, + { + "epoch": 3.664376840039254, + "grad_norm": 0.721568763256073, + "learning_rate": 8.248357708732749e-05, + "loss": 0.0681, + "step": 56010 + }, + { + "epoch": 3.665031076218515, + "grad_norm": 0.8713940382003784, + "learning_rate": 8.247659324692653e-05, + "loss": 0.0746, + "step": 56020 + }, + { + "epoch": 3.6656853123977755, + "grad_norm": 0.958366334438324, + "learning_rate": 8.246960831035539e-05, + "loss": 0.0689, + "step": 56030 + }, + { + "epoch": 3.666339548577036, + "grad_norm": 1.0349668264389038, + "learning_rate": 8.246262227784982e-05, + "loss": 0.074, + "step": 56040 + }, + { + "epoch": 3.666993784756297, + "grad_norm": 0.8951663374900818, + "learning_rate": 8.245563514964562e-05, + "loss": 0.0762, + "step": 56050 + }, + { + "epoch": 3.667648020935558, + "grad_norm": 0.993411660194397, + "learning_rate": 8.24486469259786e-05, + "loss": 0.0689, + "step": 56060 + }, + { + "epoch": 3.6683022571148185, + "grad_norm": 0.9579418897628784, + "learning_rate": 8.244165760708464e-05, + "loss": 0.0658, + "step": 56070 + }, + { + "epoch": 3.668956493294079, + "grad_norm": 0.8194859623908997, + "learning_rate": 8.243466719319967e-05, + "loss": 0.0703, + "step": 56080 + }, + { + "epoch": 3.66961072947334, + "grad_norm": 0.8599490523338318, + "learning_rate": 8.242767568455963e-05, + "loss": 0.0615, + "step": 56090 + }, + { + "epoch": 3.6702649656526005, + "grad_norm": 0.7916556596755981, + "learning_rate": 8.242068308140047e-05, + "loss": 0.0769, + "step": 56100 + }, + { + "epoch": 3.670919201831861, + "grad_norm": 0.8496348857879639, + "learning_rate": 8.241368938395824e-05, + "loss": 0.066, + "step": 56110 + }, + { + "epoch": 3.671573438011122, + "grad_norm": 0.7955387234687805, + "learning_rate": 8.240669459246897e-05, + "loss": 0.0701, + "step": 56120 + }, + { + "epoch": 3.672227674190383, + "grad_norm": 0.7705776691436768, + "learning_rate": 8.239969870716878e-05, + "loss": 0.0636, + "step": 56130 + }, + { + "epoch": 3.6728819103696435, + "grad_norm": 1.0588468313217163, + "learning_rate": 8.239270172829379e-05, + "loss": 0.0748, + "step": 56140 + }, + { + "epoch": 3.673536146548904, + "grad_norm": 0.9020249247550964, + "learning_rate": 8.238570365608016e-05, + "loss": 0.0689, + "step": 56150 + }, + { + "epoch": 3.674190382728165, + "grad_norm": 0.7228168845176697, + "learning_rate": 8.237870449076411e-05, + "loss": 0.0632, + "step": 56160 + }, + { + "epoch": 3.6748446189074255, + "grad_norm": 0.729026198387146, + "learning_rate": 8.237170423258184e-05, + "loss": 0.0761, + "step": 56170 + }, + { + "epoch": 3.675498855086686, + "grad_norm": 0.7430927157402039, + "learning_rate": 8.236470288176966e-05, + "loss": 0.0644, + "step": 56180 + }, + { + "epoch": 3.676153091265947, + "grad_norm": 0.8580201268196106, + "learning_rate": 8.235770043856389e-05, + "loss": 0.0615, + "step": 56190 + }, + { + "epoch": 3.676807327445208, + "grad_norm": 1.0346200466156006, + "learning_rate": 8.235069690320087e-05, + "loss": 0.079, + "step": 56200 + }, + { + "epoch": 3.6774615636244685, + "grad_norm": 1.0010615587234497, + "learning_rate": 8.234369227591698e-05, + "loss": 0.0673, + "step": 56210 + }, + { + "epoch": 3.678115799803729, + "grad_norm": 0.8165103793144226, + "learning_rate": 8.233668655694865e-05, + "loss": 0.0738, + "step": 56220 + }, + { + "epoch": 3.67877003598299, + "grad_norm": 0.8358020186424255, + "learning_rate": 8.232967974653235e-05, + "loss": 0.0637, + "step": 56230 + }, + { + "epoch": 3.6794242721622505, + "grad_norm": 0.8464856743812561, + "learning_rate": 8.232267184490457e-05, + "loss": 0.0696, + "step": 56240 + }, + { + "epoch": 3.680078508341511, + "grad_norm": 0.8467486500740051, + "learning_rate": 8.231566285230183e-05, + "loss": 0.0697, + "step": 56250 + }, + { + "epoch": 3.680732744520772, + "grad_norm": 0.9639089703559875, + "learning_rate": 8.230865276896074e-05, + "loss": 0.0811, + "step": 56260 + }, + { + "epoch": 3.6813869807000326, + "grad_norm": 0.7469546794891357, + "learning_rate": 8.230164159511788e-05, + "loss": 0.072, + "step": 56270 + }, + { + "epoch": 3.6820412168792935, + "grad_norm": 0.9208206534385681, + "learning_rate": 8.22946293310099e-05, + "loss": 0.0836, + "step": 56280 + }, + { + "epoch": 3.682695453058554, + "grad_norm": 0.7601810693740845, + "learning_rate": 8.228761597687348e-05, + "loss": 0.071, + "step": 56290 + }, + { + "epoch": 3.683349689237815, + "grad_norm": 0.8234832286834717, + "learning_rate": 8.228060153294534e-05, + "loss": 0.0734, + "step": 56300 + }, + { + "epoch": 3.6840039254170756, + "grad_norm": 0.8467697501182556, + "learning_rate": 8.227358599946224e-05, + "loss": 0.0743, + "step": 56310 + }, + { + "epoch": 3.684658161596336, + "grad_norm": 0.6788933873176575, + "learning_rate": 8.2266569376661e-05, + "loss": 0.0639, + "step": 56320 + }, + { + "epoch": 3.685312397775597, + "grad_norm": 0.9612674713134766, + "learning_rate": 8.225955166477839e-05, + "loss": 0.0667, + "step": 56330 + }, + { + "epoch": 3.6859666339548576, + "grad_norm": 0.8955544233322144, + "learning_rate": 8.22525328640513e-05, + "loss": 0.0671, + "step": 56340 + }, + { + "epoch": 3.6866208701341185, + "grad_norm": 0.9106106758117676, + "learning_rate": 8.224551297471668e-05, + "loss": 0.0695, + "step": 56350 + }, + { + "epoch": 3.687275106313379, + "grad_norm": 0.8671728372573853, + "learning_rate": 8.223849199701139e-05, + "loss": 0.06, + "step": 56360 + }, + { + "epoch": 3.68792934249264, + "grad_norm": 0.9457067251205444, + "learning_rate": 8.223146993117247e-05, + "loss": 0.0762, + "step": 56370 + }, + { + "epoch": 3.6885835786719006, + "grad_norm": 1.0926858186721802, + "learning_rate": 8.222444677743691e-05, + "loss": 0.068, + "step": 56380 + }, + { + "epoch": 3.689237814851161, + "grad_norm": 0.964023768901825, + "learning_rate": 8.221742253604175e-05, + "loss": 0.0635, + "step": 56390 + }, + { + "epoch": 3.689892051030422, + "grad_norm": 0.8525315523147583, + "learning_rate": 8.221039720722409e-05, + "loss": 0.0806, + "step": 56400 + }, + { + "epoch": 3.6905462872096826, + "grad_norm": 0.989080548286438, + "learning_rate": 8.220337079122105e-05, + "loss": 0.0713, + "step": 56410 + }, + { + "epoch": 3.6912005233889436, + "grad_norm": 0.7577918171882629, + "learning_rate": 8.21963432882698e-05, + "loss": 0.0716, + "step": 56420 + }, + { + "epoch": 3.691854759568204, + "grad_norm": 0.8016083240509033, + "learning_rate": 8.21893146986075e-05, + "loss": 0.071, + "step": 56430 + }, + { + "epoch": 3.692508995747465, + "grad_norm": 0.7300130128860474, + "learning_rate": 8.218228502247141e-05, + "loss": 0.073, + "step": 56440 + }, + { + "epoch": 3.6931632319267256, + "grad_norm": 0.8202305436134338, + "learning_rate": 8.217525426009882e-05, + "loss": 0.0685, + "step": 56450 + }, + { + "epoch": 3.693817468105986, + "grad_norm": 0.9385225176811218, + "learning_rate": 8.216822241172702e-05, + "loss": 0.0751, + "step": 56460 + }, + { + "epoch": 3.694471704285247, + "grad_norm": 1.1209096908569336, + "learning_rate": 8.216118947759333e-05, + "loss": 0.0709, + "step": 56470 + }, + { + "epoch": 3.6951259404645076, + "grad_norm": 1.0361253023147583, + "learning_rate": 8.215415545793515e-05, + "loss": 0.0765, + "step": 56480 + }, + { + "epoch": 3.6957801766437686, + "grad_norm": 0.8918325901031494, + "learning_rate": 8.214712035298991e-05, + "loss": 0.0622, + "step": 56490 + }, + { + "epoch": 3.696434412823029, + "grad_norm": 0.7726563811302185, + "learning_rate": 8.214008416299505e-05, + "loss": 0.0717, + "step": 56500 + }, + { + "epoch": 3.69708864900229, + "grad_norm": 0.9196798801422119, + "learning_rate": 8.213304688818804e-05, + "loss": 0.081, + "step": 56510 + }, + { + "epoch": 3.6977428851815506, + "grad_norm": 1.0043820142745972, + "learning_rate": 8.212600852880644e-05, + "loss": 0.0756, + "step": 56520 + }, + { + "epoch": 3.698397121360811, + "grad_norm": 0.926852285861969, + "learning_rate": 8.21189690850878e-05, + "loss": 0.0677, + "step": 56530 + }, + { + "epoch": 3.699051357540072, + "grad_norm": 0.8300206661224365, + "learning_rate": 8.211192855726972e-05, + "loss": 0.074, + "step": 56540 + }, + { + "epoch": 3.6997055937193326, + "grad_norm": 1.1823694705963135, + "learning_rate": 8.210488694558982e-05, + "loss": 0.0653, + "step": 56550 + }, + { + "epoch": 3.700359829898593, + "grad_norm": 0.7642592787742615, + "learning_rate": 8.20978442502858e-05, + "loss": 0.0634, + "step": 56560 + }, + { + "epoch": 3.701014066077854, + "grad_norm": 0.9200793504714966, + "learning_rate": 8.209080047159535e-05, + "loss": 0.0862, + "step": 56570 + }, + { + "epoch": 3.701668302257115, + "grad_norm": 0.8010613918304443, + "learning_rate": 8.208375560975624e-05, + "loss": 0.0636, + "step": 56580 + }, + { + "epoch": 3.7023225384363756, + "grad_norm": 0.7096419930458069, + "learning_rate": 8.207670966500621e-05, + "loss": 0.065, + "step": 56590 + }, + { + "epoch": 3.702976774615636, + "grad_norm": 1.8889129161834717, + "learning_rate": 8.206966263758311e-05, + "loss": 0.0617, + "step": 56600 + }, + { + "epoch": 3.703631010794897, + "grad_norm": 0.7090383172035217, + "learning_rate": 8.20626145277248e-05, + "loss": 0.0718, + "step": 56610 + }, + { + "epoch": 3.7042852469741576, + "grad_norm": 0.8787170648574829, + "learning_rate": 8.205556533566916e-05, + "loss": 0.0734, + "step": 56620 + }, + { + "epoch": 3.704939483153418, + "grad_norm": 0.8815798759460449, + "learning_rate": 8.204851506165412e-05, + "loss": 0.0692, + "step": 56630 + }, + { + "epoch": 3.705593719332679, + "grad_norm": 0.6900599002838135, + "learning_rate": 8.204146370591765e-05, + "loss": 0.0632, + "step": 56640 + }, + { + "epoch": 3.70624795551194, + "grad_norm": 0.9211147427558899, + "learning_rate": 8.203441126869773e-05, + "loss": 0.0785, + "step": 56650 + }, + { + "epoch": 3.7069021916912006, + "grad_norm": 0.6548269987106323, + "learning_rate": 8.202735775023244e-05, + "loss": 0.0694, + "step": 56660 + }, + { + "epoch": 3.707556427870461, + "grad_norm": 0.7686384916305542, + "learning_rate": 8.202030315075982e-05, + "loss": 0.0664, + "step": 56670 + }, + { + "epoch": 3.708210664049722, + "grad_norm": 0.7667417526245117, + "learning_rate": 8.2013247470518e-05, + "loss": 0.06, + "step": 56680 + }, + { + "epoch": 3.7088649002289826, + "grad_norm": 0.70942622423172, + "learning_rate": 8.200619070974512e-05, + "loss": 0.0621, + "step": 56690 + }, + { + "epoch": 3.709519136408243, + "grad_norm": 0.9921366572380066, + "learning_rate": 8.199913286867934e-05, + "loss": 0.0727, + "step": 56700 + }, + { + "epoch": 3.710173372587504, + "grad_norm": 0.9375357031822205, + "learning_rate": 8.199207394755893e-05, + "loss": 0.0697, + "step": 56710 + }, + { + "epoch": 3.7108276087667647, + "grad_norm": 0.9243108034133911, + "learning_rate": 8.198501394662212e-05, + "loss": 0.0649, + "step": 56720 + }, + { + "epoch": 3.7114818449460256, + "grad_norm": 0.9306524395942688, + "learning_rate": 8.197795286610719e-05, + "loss": 0.0714, + "step": 56730 + }, + { + "epoch": 3.712136081125286, + "grad_norm": 1.01764976978302, + "learning_rate": 8.19708907062525e-05, + "loss": 0.0695, + "step": 56740 + }, + { + "epoch": 3.712790317304547, + "grad_norm": 0.7517834901809692, + "learning_rate": 8.196382746729641e-05, + "loss": 0.0747, + "step": 56750 + }, + { + "epoch": 3.7134445534838076, + "grad_norm": 0.7786497473716736, + "learning_rate": 8.195676314947729e-05, + "loss": 0.0817, + "step": 56760 + }, + { + "epoch": 3.714098789663068, + "grad_norm": 0.939345121383667, + "learning_rate": 8.194969775303361e-05, + "loss": 0.0732, + "step": 56770 + }, + { + "epoch": 3.714753025842329, + "grad_norm": 1.078481674194336, + "learning_rate": 8.194263127820385e-05, + "loss": 0.0724, + "step": 56780 + }, + { + "epoch": 3.7154072620215897, + "grad_norm": 0.9831727147102356, + "learning_rate": 8.19355637252265e-05, + "loss": 0.0663, + "step": 56790 + }, + { + "epoch": 3.7160614982008506, + "grad_norm": 0.8869754672050476, + "learning_rate": 8.192849509434014e-05, + "loss": 0.074, + "step": 56800 + }, + { + "epoch": 3.716715734380111, + "grad_norm": 0.7121245861053467, + "learning_rate": 8.192142538578331e-05, + "loss": 0.0716, + "step": 56810 + }, + { + "epoch": 3.717369970559372, + "grad_norm": 0.8707069754600525, + "learning_rate": 8.191435459979468e-05, + "loss": 0.0698, + "step": 56820 + }, + { + "epoch": 3.7180242067386327, + "grad_norm": 1.0101736783981323, + "learning_rate": 8.190728273661288e-05, + "loss": 0.0761, + "step": 56830 + }, + { + "epoch": 3.718678442917893, + "grad_norm": 0.8770877122879028, + "learning_rate": 8.190020979647659e-05, + "loss": 0.0802, + "step": 56840 + }, + { + "epoch": 3.719332679097154, + "grad_norm": 0.9432161450386047, + "learning_rate": 8.189313577962457e-05, + "loss": 0.0711, + "step": 56850 + }, + { + "epoch": 3.7199869152764147, + "grad_norm": 0.8004513382911682, + "learning_rate": 8.188606068629558e-05, + "loss": 0.0676, + "step": 56860 + }, + { + "epoch": 3.7206411514556756, + "grad_norm": 0.9776605367660522, + "learning_rate": 8.187898451672841e-05, + "loss": 0.0655, + "step": 56870 + }, + { + "epoch": 3.721295387634936, + "grad_norm": 0.774079442024231, + "learning_rate": 8.18719072711619e-05, + "loss": 0.0632, + "step": 56880 + }, + { + "epoch": 3.721949623814197, + "grad_norm": 0.9294254183769226, + "learning_rate": 8.186482894983494e-05, + "loss": 0.0656, + "step": 56890 + }, + { + "epoch": 3.7226038599934577, + "grad_norm": 0.9034769535064697, + "learning_rate": 8.185774955298644e-05, + "loss": 0.0775, + "step": 56900 + }, + { + "epoch": 3.723258096172718, + "grad_norm": 1.0133552551269531, + "learning_rate": 8.185066908085535e-05, + "loss": 0.0684, + "step": 56910 + }, + { + "epoch": 3.723912332351979, + "grad_norm": 1.045736312866211, + "learning_rate": 8.184358753368062e-05, + "loss": 0.0627, + "step": 56920 + }, + { + "epoch": 3.7245665685312397, + "grad_norm": 0.8517866730690002, + "learning_rate": 8.183650491170132e-05, + "loss": 0.0639, + "step": 56930 + }, + { + "epoch": 3.7252208047105007, + "grad_norm": 0.6794936060905457, + "learning_rate": 8.182942121515648e-05, + "loss": 0.0624, + "step": 56940 + }, + { + "epoch": 3.725875040889761, + "grad_norm": 0.8077151775360107, + "learning_rate": 8.182233644428518e-05, + "loss": 0.0797, + "step": 56950 + }, + { + "epoch": 3.726529277069022, + "grad_norm": 0.8589492440223694, + "learning_rate": 8.18152505993266e-05, + "loss": 0.0772, + "step": 56960 + }, + { + "epoch": 3.7271835132482827, + "grad_norm": 0.8781915307044983, + "learning_rate": 8.180816368051985e-05, + "loss": 0.0741, + "step": 56970 + }, + { + "epoch": 3.727837749427543, + "grad_norm": 0.91343092918396, + "learning_rate": 8.180107568810417e-05, + "loss": 0.0756, + "step": 56980 + }, + { + "epoch": 3.728491985606804, + "grad_norm": 0.8857640624046326, + "learning_rate": 8.179398662231876e-05, + "loss": 0.0808, + "step": 56990 + }, + { + "epoch": 3.7291462217860647, + "grad_norm": 0.807052493095398, + "learning_rate": 8.178689648340294e-05, + "loss": 0.0756, + "step": 57000 + }, + { + "epoch": 3.7298004579653252, + "grad_norm": 0.7484919428825378, + "learning_rate": 8.1779805271596e-05, + "loss": 0.066, + "step": 57010 + }, + { + "epoch": 3.730454694144586, + "grad_norm": 0.8383365869522095, + "learning_rate": 8.17727129871373e-05, + "loss": 0.0684, + "step": 57020 + }, + { + "epoch": 3.731108930323847, + "grad_norm": 0.9385350346565247, + "learning_rate": 8.176561963026618e-05, + "loss": 0.0805, + "step": 57030 + }, + { + "epoch": 3.7317631665031077, + "grad_norm": 0.7031242847442627, + "learning_rate": 8.17585252012221e-05, + "loss": 0.0684, + "step": 57040 + }, + { + "epoch": 3.732417402682368, + "grad_norm": 0.9507877826690674, + "learning_rate": 8.175142970024451e-05, + "loss": 0.0738, + "step": 57050 + }, + { + "epoch": 3.733071638861629, + "grad_norm": 0.9051758646965027, + "learning_rate": 8.174433312757289e-05, + "loss": 0.0642, + "step": 57060 + }, + { + "epoch": 3.7337258750408897, + "grad_norm": 0.8826471567153931, + "learning_rate": 8.173723548344675e-05, + "loss": 0.0764, + "step": 57070 + }, + { + "epoch": 3.7343801112201502, + "grad_norm": 0.886605978012085, + "learning_rate": 8.173013676810573e-05, + "loss": 0.0695, + "step": 57080 + }, + { + "epoch": 3.735034347399411, + "grad_norm": 0.7051117420196533, + "learning_rate": 8.172303698178935e-05, + "loss": 0.069, + "step": 57090 + }, + { + "epoch": 3.735688583578672, + "grad_norm": 0.8651570677757263, + "learning_rate": 8.171593612473727e-05, + "loss": 0.0696, + "step": 57100 + }, + { + "epoch": 3.7363428197579327, + "grad_norm": 0.9195097088813782, + "learning_rate": 8.170883419718917e-05, + "loss": 0.0791, + "step": 57110 + }, + { + "epoch": 3.7369970559371932, + "grad_norm": 0.835708498954773, + "learning_rate": 8.170173119938478e-05, + "loss": 0.0733, + "step": 57120 + }, + { + "epoch": 3.737651292116454, + "grad_norm": 0.8207480311393738, + "learning_rate": 8.16946271315638e-05, + "loss": 0.073, + "step": 57130 + }, + { + "epoch": 3.7383055282957147, + "grad_norm": 1.0654559135437012, + "learning_rate": 8.168752199396603e-05, + "loss": 0.0728, + "step": 57140 + }, + { + "epoch": 3.7389597644749752, + "grad_norm": 0.7882975339889526, + "learning_rate": 8.168041578683129e-05, + "loss": 0.0645, + "step": 57150 + }, + { + "epoch": 3.739614000654236, + "grad_norm": 0.7596569061279297, + "learning_rate": 8.167330851039945e-05, + "loss": 0.0689, + "step": 57160 + }, + { + "epoch": 3.7402682368334967, + "grad_norm": 1.1013258695602417, + "learning_rate": 8.166620016491038e-05, + "loss": 0.0647, + "step": 57170 + }, + { + "epoch": 3.7409224730127577, + "grad_norm": 0.8467496037483215, + "learning_rate": 8.165909075060398e-05, + "loss": 0.0658, + "step": 57180 + }, + { + "epoch": 3.7415767091920182, + "grad_norm": 0.7248758673667908, + "learning_rate": 8.165198026772025e-05, + "loss": 0.0705, + "step": 57190 + }, + { + "epoch": 3.742230945371279, + "grad_norm": 0.8981897234916687, + "learning_rate": 8.164486871649919e-05, + "loss": 0.074, + "step": 57200 + }, + { + "epoch": 3.7428851815505397, + "grad_norm": 0.8243358135223389, + "learning_rate": 8.16377560971808e-05, + "loss": 0.0758, + "step": 57210 + }, + { + "epoch": 3.7435394177298003, + "grad_norm": 0.7456525564193726, + "learning_rate": 8.163064241000517e-05, + "loss": 0.0586, + "step": 57220 + }, + { + "epoch": 3.7441936539090612, + "grad_norm": 0.9510387182235718, + "learning_rate": 8.162352765521243e-05, + "loss": 0.0635, + "step": 57230 + }, + { + "epoch": 3.7448478900883218, + "grad_norm": 0.9684381484985352, + "learning_rate": 8.161641183304267e-05, + "loss": 0.0621, + "step": 57240 + }, + { + "epoch": 3.7455021262675827, + "grad_norm": 0.8673816919326782, + "learning_rate": 8.16092949437361e-05, + "loss": 0.0764, + "step": 57250 + }, + { + "epoch": 3.7461563624468432, + "grad_norm": 0.7953469157218933, + "learning_rate": 8.160217698753291e-05, + "loss": 0.0687, + "step": 57260 + }, + { + "epoch": 3.746810598626104, + "grad_norm": 0.7955390214920044, + "learning_rate": 8.159505796467342e-05, + "loss": 0.0706, + "step": 57270 + }, + { + "epoch": 3.7474648348053647, + "grad_norm": 0.8372431397438049, + "learning_rate": 8.158793787539782e-05, + "loss": 0.0737, + "step": 57280 + }, + { + "epoch": 3.7481190709846253, + "grad_norm": 1.1341184377670288, + "learning_rate": 8.158081671994648e-05, + "loss": 0.0737, + "step": 57290 + }, + { + "epoch": 3.7487733071638862, + "grad_norm": 0.8782973289489746, + "learning_rate": 8.157369449855974e-05, + "loss": 0.0708, + "step": 57300 + }, + { + "epoch": 3.7494275433431468, + "grad_norm": 0.9155938625335693, + "learning_rate": 8.156657121147803e-05, + "loss": 0.0695, + "step": 57310 + }, + { + "epoch": 3.7500817795224077, + "grad_norm": 0.7682180404663086, + "learning_rate": 8.155944685894175e-05, + "loss": 0.0709, + "step": 57320 + }, + { + "epoch": 3.7507360157016683, + "grad_norm": 0.9652989506721497, + "learning_rate": 8.155232144119135e-05, + "loss": 0.0628, + "step": 57330 + }, + { + "epoch": 3.7513902518809292, + "grad_norm": 1.136732578277588, + "learning_rate": 8.154519495846737e-05, + "loss": 0.0689, + "step": 57340 + }, + { + "epoch": 3.7520444880601898, + "grad_norm": 0.9039020538330078, + "learning_rate": 8.153806741101033e-05, + "loss": 0.0792, + "step": 57350 + }, + { + "epoch": 3.7526987242394503, + "grad_norm": 0.8383570313453674, + "learning_rate": 8.15309387990608e-05, + "loss": 0.0765, + "step": 57360 + }, + { + "epoch": 3.7533529604187112, + "grad_norm": 0.9474335312843323, + "learning_rate": 8.15238091228594e-05, + "loss": 0.0641, + "step": 57370 + }, + { + "epoch": 3.7540071965979718, + "grad_norm": 0.7904636859893799, + "learning_rate": 8.151667838264677e-05, + "loss": 0.0684, + "step": 57380 + }, + { + "epoch": 3.7546614327772327, + "grad_norm": 0.861436128616333, + "learning_rate": 8.150954657866356e-05, + "loss": 0.0669, + "step": 57390 + }, + { + "epoch": 3.7553156689564933, + "grad_norm": 0.7418460249900818, + "learning_rate": 8.150241371115055e-05, + "loss": 0.0683, + "step": 57400 + }, + { + "epoch": 3.7559699051357542, + "grad_norm": 0.7864526510238647, + "learning_rate": 8.149527978034844e-05, + "loss": 0.0677, + "step": 57410 + }, + { + "epoch": 3.7566241413150148, + "grad_norm": 0.8774205446243286, + "learning_rate": 8.148814478649805e-05, + "loss": 0.0747, + "step": 57420 + }, + { + "epoch": 3.7572783774942753, + "grad_norm": 0.7475970387458801, + "learning_rate": 8.148100872984019e-05, + "loss": 0.0685, + "step": 57430 + }, + { + "epoch": 3.7579326136735363, + "grad_norm": 1.0144479274749756, + "learning_rate": 8.14738716106157e-05, + "loss": 0.0736, + "step": 57440 + }, + { + "epoch": 3.758586849852797, + "grad_norm": 1.0739880800247192, + "learning_rate": 8.146673342906552e-05, + "loss": 0.0676, + "step": 57450 + }, + { + "epoch": 3.7592410860320573, + "grad_norm": 0.9529927968978882, + "learning_rate": 8.145959418543057e-05, + "loss": 0.0645, + "step": 57460 + }, + { + "epoch": 3.7598953222113183, + "grad_norm": 0.8140749931335449, + "learning_rate": 8.14524538799518e-05, + "loss": 0.0721, + "step": 57470 + }, + { + "epoch": 3.7605495583905793, + "grad_norm": 0.7363076210021973, + "learning_rate": 8.144531251287024e-05, + "loss": 0.0655, + "step": 57480 + }, + { + "epoch": 3.7612037945698398, + "grad_norm": 0.9342272281646729, + "learning_rate": 8.14381700844269e-05, + "loss": 0.0688, + "step": 57490 + }, + { + "epoch": 3.7618580307491003, + "grad_norm": 0.8387247920036316, + "learning_rate": 8.143102659486287e-05, + "loss": 0.0605, + "step": 57500 + }, + { + "epoch": 3.7625122669283613, + "grad_norm": 1.0524139404296875, + "learning_rate": 8.142388204441927e-05, + "loss": 0.0688, + "step": 57510 + }, + { + "epoch": 3.763166503107622, + "grad_norm": 1.1220943927764893, + "learning_rate": 8.141673643333723e-05, + "loss": 0.064, + "step": 57520 + }, + { + "epoch": 3.7638207392868823, + "grad_norm": 0.7932508587837219, + "learning_rate": 8.140958976185794e-05, + "loss": 0.0703, + "step": 57530 + }, + { + "epoch": 3.7644749754661433, + "grad_norm": 0.8882383704185486, + "learning_rate": 8.140244203022262e-05, + "loss": 0.0643, + "step": 57540 + }, + { + "epoch": 3.7651292116454043, + "grad_norm": 0.8746048212051392, + "learning_rate": 8.139529323867254e-05, + "loss": 0.0687, + "step": 57550 + }, + { + "epoch": 3.765783447824665, + "grad_norm": 1.058364987373352, + "learning_rate": 8.138814338744896e-05, + "loss": 0.0711, + "step": 57560 + }, + { + "epoch": 3.7664376840039253, + "grad_norm": 0.7067134976387024, + "learning_rate": 8.138099247679322e-05, + "loss": 0.0648, + "step": 57570 + }, + { + "epoch": 3.7670919201831863, + "grad_norm": 0.7847724556922913, + "learning_rate": 8.13738405069467e-05, + "loss": 0.065, + "step": 57580 + }, + { + "epoch": 3.767746156362447, + "grad_norm": 0.9245442748069763, + "learning_rate": 8.136668747815078e-05, + "loss": 0.0655, + "step": 57590 + }, + { + "epoch": 3.7684003925417073, + "grad_norm": 0.940708339214325, + "learning_rate": 8.135953339064688e-05, + "loss": 0.0632, + "step": 57600 + }, + { + "epoch": 3.7690546287209683, + "grad_norm": 1.0202513933181763, + "learning_rate": 8.13523782446765e-05, + "loss": 0.0688, + "step": 57610 + }, + { + "epoch": 3.769708864900229, + "grad_norm": 1.0422993898391724, + "learning_rate": 8.134522204048112e-05, + "loss": 0.0748, + "step": 57620 + }, + { + "epoch": 3.77036310107949, + "grad_norm": 0.8074596524238586, + "learning_rate": 8.13380647783023e-05, + "loss": 0.0715, + "step": 57630 + }, + { + "epoch": 3.7710173372587503, + "grad_norm": 0.8613049983978271, + "learning_rate": 8.13309064583816e-05, + "loss": 0.0843, + "step": 57640 + }, + { + "epoch": 3.7716715734380113, + "grad_norm": 0.8982073664665222, + "learning_rate": 8.132374708096065e-05, + "loss": 0.0653, + "step": 57650 + }, + { + "epoch": 3.772325809617272, + "grad_norm": 0.8297070860862732, + "learning_rate": 8.131658664628107e-05, + "loss": 0.0653, + "step": 57660 + }, + { + "epoch": 3.7729800457965323, + "grad_norm": 0.9720892906188965, + "learning_rate": 8.130942515458456e-05, + "loss": 0.0639, + "step": 57670 + }, + { + "epoch": 3.7736342819757933, + "grad_norm": 0.9780619144439697, + "learning_rate": 8.130226260611284e-05, + "loss": 0.0744, + "step": 57680 + }, + { + "epoch": 3.774288518155054, + "grad_norm": 0.7452018857002258, + "learning_rate": 8.129509900110767e-05, + "loss": 0.0618, + "step": 57690 + }, + { + "epoch": 3.774942754334315, + "grad_norm": 0.9250910878181458, + "learning_rate": 8.128793433981084e-05, + "loss": 0.0693, + "step": 57700 + }, + { + "epoch": 3.7755969905135753, + "grad_norm": 1.0864530801773071, + "learning_rate": 8.128076862246416e-05, + "loss": 0.0729, + "step": 57710 + }, + { + "epoch": 3.7762512266928363, + "grad_norm": 0.7634531259536743, + "learning_rate": 8.127360184930952e-05, + "loss": 0.0699, + "step": 57720 + }, + { + "epoch": 3.776905462872097, + "grad_norm": 0.8827683925628662, + "learning_rate": 8.126643402058877e-05, + "loss": 0.0723, + "step": 57730 + }, + { + "epoch": 3.7775596990513574, + "grad_norm": 0.6894218325614929, + "learning_rate": 8.12592651365439e-05, + "loss": 0.0621, + "step": 57740 + }, + { + "epoch": 3.7782139352306183, + "grad_norm": 0.8839403986930847, + "learning_rate": 8.125209519741683e-05, + "loss": 0.0728, + "step": 57750 + }, + { + "epoch": 3.778868171409879, + "grad_norm": 0.7568190097808838, + "learning_rate": 8.124492420344961e-05, + "loss": 0.0659, + "step": 57760 + }, + { + "epoch": 3.77952240758914, + "grad_norm": 0.9448241591453552, + "learning_rate": 8.123775215488423e-05, + "loss": 0.0676, + "step": 57770 + }, + { + "epoch": 3.7801766437684003, + "grad_norm": 0.7295737862586975, + "learning_rate": 8.123057905196281e-05, + "loss": 0.0656, + "step": 57780 + }, + { + "epoch": 3.7808308799476613, + "grad_norm": 1.049312710762024, + "learning_rate": 8.122340489492743e-05, + "loss": 0.0662, + "step": 57790 + }, + { + "epoch": 3.781485116126922, + "grad_norm": 0.6659790277481079, + "learning_rate": 8.121622968402025e-05, + "loss": 0.063, + "step": 57800 + }, + { + "epoch": 3.7821393523061824, + "grad_norm": 1.10063636302948, + "learning_rate": 8.120905341948345e-05, + "loss": 0.0679, + "step": 57810 + }, + { + "epoch": 3.7827935884854433, + "grad_norm": 1.0684069395065308, + "learning_rate": 8.120187610155924e-05, + "loss": 0.0664, + "step": 57820 + }, + { + "epoch": 3.783447824664704, + "grad_norm": 0.7764497995376587, + "learning_rate": 8.119469773048992e-05, + "loss": 0.08, + "step": 57830 + }, + { + "epoch": 3.784102060843965, + "grad_norm": 0.8615806698799133, + "learning_rate": 8.11875183065177e-05, + "loss": 0.0738, + "step": 57840 + }, + { + "epoch": 3.7847562970232254, + "grad_norm": 0.8051279783248901, + "learning_rate": 8.118033782988496e-05, + "loss": 0.0642, + "step": 57850 + }, + { + "epoch": 3.7854105332024863, + "grad_norm": 0.7473447322845459, + "learning_rate": 8.117315630083404e-05, + "loss": 0.0814, + "step": 57860 + }, + { + "epoch": 3.786064769381747, + "grad_norm": 0.7426646947860718, + "learning_rate": 8.116597371960734e-05, + "loss": 0.0726, + "step": 57870 + }, + { + "epoch": 3.7867190055610074, + "grad_norm": 0.7152010798454285, + "learning_rate": 8.115879008644729e-05, + "loss": 0.0693, + "step": 57880 + }, + { + "epoch": 3.7873732417402683, + "grad_norm": 1.007025957107544, + "learning_rate": 8.115160540159636e-05, + "loss": 0.0714, + "step": 57890 + }, + { + "epoch": 3.788027477919529, + "grad_norm": 0.8072939515113831, + "learning_rate": 8.114441966529707e-05, + "loss": 0.0619, + "step": 57900 + }, + { + "epoch": 3.7886817140987894, + "grad_norm": 0.8661327362060547, + "learning_rate": 8.11372328777919e-05, + "loss": 0.0662, + "step": 57910 + }, + { + "epoch": 3.7893359502780504, + "grad_norm": 0.8400626182556152, + "learning_rate": 8.113004503932348e-05, + "loss": 0.0643, + "step": 57920 + }, + { + "epoch": 3.7899901864573113, + "grad_norm": 0.8952879309654236, + "learning_rate": 8.11228561501344e-05, + "loss": 0.0638, + "step": 57930 + }, + { + "epoch": 3.790644422636572, + "grad_norm": 1.0547457933425903, + "learning_rate": 8.11156662104673e-05, + "loss": 0.0639, + "step": 57940 + }, + { + "epoch": 3.7912986588158324, + "grad_norm": 1.0903702974319458, + "learning_rate": 8.110847522056485e-05, + "loss": 0.0619, + "step": 57950 + }, + { + "epoch": 3.7919528949950934, + "grad_norm": 0.9032535552978516, + "learning_rate": 8.11012831806698e-05, + "loss": 0.0725, + "step": 57960 + }, + { + "epoch": 3.792607131174354, + "grad_norm": 1.0580861568450928, + "learning_rate": 8.109409009102486e-05, + "loss": 0.0721, + "step": 57970 + }, + { + "epoch": 3.7932613673536144, + "grad_norm": 1.0803550481796265, + "learning_rate": 8.108689595187285e-05, + "loss": 0.0703, + "step": 57980 + }, + { + "epoch": 3.7939156035328754, + "grad_norm": 0.8773861527442932, + "learning_rate": 8.107970076345654e-05, + "loss": 0.067, + "step": 57990 + }, + { + "epoch": 3.7945698397121363, + "grad_norm": 0.7425010204315186, + "learning_rate": 8.107250452601885e-05, + "loss": 0.0721, + "step": 58000 + }, + { + "epoch": 3.795224075891397, + "grad_norm": 0.7278298139572144, + "learning_rate": 8.106530723980261e-05, + "loss": 0.0675, + "step": 58010 + }, + { + "epoch": 3.7958783120706574, + "grad_norm": 0.9992676973342896, + "learning_rate": 8.105810890505081e-05, + "loss": 0.0685, + "step": 58020 + }, + { + "epoch": 3.7965325482499184, + "grad_norm": 0.8017996549606323, + "learning_rate": 8.105090952200637e-05, + "loss": 0.0699, + "step": 58030 + }, + { + "epoch": 3.797186784429179, + "grad_norm": 1.016209363937378, + "learning_rate": 8.10437090909123e-05, + "loss": 0.0703, + "step": 58040 + }, + { + "epoch": 3.7978410206084394, + "grad_norm": 0.8442249894142151, + "learning_rate": 8.103650761201163e-05, + "loss": 0.0719, + "step": 58050 + }, + { + "epoch": 3.7984952567877004, + "grad_norm": 0.8751046061515808, + "learning_rate": 8.102930508554744e-05, + "loss": 0.07, + "step": 58060 + }, + { + "epoch": 3.799149492966961, + "grad_norm": 0.6956860423088074, + "learning_rate": 8.102210151176282e-05, + "loss": 0.0631, + "step": 58070 + }, + { + "epoch": 3.799803729146222, + "grad_norm": 0.9162025451660156, + "learning_rate": 8.101489689090091e-05, + "loss": 0.0699, + "step": 58080 + }, + { + "epoch": 3.8004579653254824, + "grad_norm": 0.9595727920532227, + "learning_rate": 8.10076912232049e-05, + "loss": 0.0604, + "step": 58090 + }, + { + "epoch": 3.8011122015047434, + "grad_norm": 0.9500911235809326, + "learning_rate": 8.100048450891799e-05, + "loss": 0.0625, + "step": 58100 + }, + { + "epoch": 3.801766437684004, + "grad_norm": 0.6980655193328857, + "learning_rate": 8.099327674828342e-05, + "loss": 0.0588, + "step": 58110 + }, + { + "epoch": 3.8024206738632644, + "grad_norm": 0.9078855514526367, + "learning_rate": 8.098606794154448e-05, + "loss": 0.0599, + "step": 58120 + }, + { + "epoch": 3.8030749100425254, + "grad_norm": 0.8954933881759644, + "learning_rate": 8.097885808894449e-05, + "loss": 0.0646, + "step": 58130 + }, + { + "epoch": 3.803729146221786, + "grad_norm": 0.8276561498641968, + "learning_rate": 8.097164719072679e-05, + "loss": 0.0717, + "step": 58140 + }, + { + "epoch": 3.804383382401047, + "grad_norm": 0.8717600703239441, + "learning_rate": 8.096443524713477e-05, + "loss": 0.069, + "step": 58150 + }, + { + "epoch": 3.8050376185803074, + "grad_norm": 0.9472079277038574, + "learning_rate": 8.095722225841185e-05, + "loss": 0.0702, + "step": 58160 + }, + { + "epoch": 3.8056918547595684, + "grad_norm": 0.8693017959594727, + "learning_rate": 8.09500082248015e-05, + "loss": 0.0697, + "step": 58170 + }, + { + "epoch": 3.806346090938829, + "grad_norm": 1.0408918857574463, + "learning_rate": 8.094279314654718e-05, + "loss": 0.0752, + "step": 58180 + }, + { + "epoch": 3.8070003271180894, + "grad_norm": 0.827529788017273, + "learning_rate": 8.093557702389246e-05, + "loss": 0.0712, + "step": 58190 + }, + { + "epoch": 3.8076545632973504, + "grad_norm": 0.6943844556808472, + "learning_rate": 8.092835985708088e-05, + "loss": 0.0648, + "step": 58200 + }, + { + "epoch": 3.808308799476611, + "grad_norm": 0.8800909519195557, + "learning_rate": 8.092114164635604e-05, + "loss": 0.0684, + "step": 58210 + }, + { + "epoch": 3.808963035655872, + "grad_norm": 0.7584820985794067, + "learning_rate": 8.091392239196159e-05, + "loss": 0.072, + "step": 58220 + }, + { + "epoch": 3.8096172718351324, + "grad_norm": 0.9473629593849182, + "learning_rate": 8.090670209414117e-05, + "loss": 0.0664, + "step": 58230 + }, + { + "epoch": 3.8102715080143934, + "grad_norm": 1.0267720222473145, + "learning_rate": 8.08994807531385e-05, + "loss": 0.0638, + "step": 58240 + }, + { + "epoch": 3.810925744193654, + "grad_norm": 0.7905268669128418, + "learning_rate": 8.089225836919732e-05, + "loss": 0.0636, + "step": 58250 + }, + { + "epoch": 3.8115799803729145, + "grad_norm": 0.8722211718559265, + "learning_rate": 8.08850349425614e-05, + "loss": 0.0767, + "step": 58260 + }, + { + "epoch": 3.8122342165521754, + "grad_norm": 0.8511031866073608, + "learning_rate": 8.087781047347455e-05, + "loss": 0.0705, + "step": 58270 + }, + { + "epoch": 3.812888452731436, + "grad_norm": 0.8996208310127258, + "learning_rate": 8.087058496218063e-05, + "loss": 0.0645, + "step": 58280 + }, + { + "epoch": 3.813542688910697, + "grad_norm": 0.975518524646759, + "learning_rate": 8.08633584089235e-05, + "loss": 0.0808, + "step": 58290 + }, + { + "epoch": 3.8141969250899574, + "grad_norm": 0.9929527044296265, + "learning_rate": 8.085613081394708e-05, + "loss": 0.0718, + "step": 58300 + }, + { + "epoch": 3.8148511612692184, + "grad_norm": 0.8056535124778748, + "learning_rate": 8.084890217749532e-05, + "loss": 0.0671, + "step": 58310 + }, + { + "epoch": 3.815505397448479, + "grad_norm": 0.766545832157135, + "learning_rate": 8.084167249981219e-05, + "loss": 0.0711, + "step": 58320 + }, + { + "epoch": 3.8161596336277395, + "grad_norm": 1.0355461835861206, + "learning_rate": 8.083444178114174e-05, + "loss": 0.0759, + "step": 58330 + }, + { + "epoch": 3.8168138698070004, + "grad_norm": 0.8508825898170471, + "learning_rate": 8.082721002172801e-05, + "loss": 0.0696, + "step": 58340 + }, + { + "epoch": 3.817468105986261, + "grad_norm": 0.863624095916748, + "learning_rate": 8.081997722181512e-05, + "loss": 0.073, + "step": 58350 + }, + { + "epoch": 3.8181223421655215, + "grad_norm": 0.8428578972816467, + "learning_rate": 8.081274338164714e-05, + "loss": 0.0697, + "step": 58360 + }, + { + "epoch": 3.8187765783447825, + "grad_norm": 0.8792449235916138, + "learning_rate": 8.080550850146829e-05, + "loss": 0.0665, + "step": 58370 + }, + { + "epoch": 3.8194308145240434, + "grad_norm": 0.7911483645439148, + "learning_rate": 8.079827258152272e-05, + "loss": 0.0738, + "step": 58380 + }, + { + "epoch": 3.820085050703304, + "grad_norm": 0.9824073314666748, + "learning_rate": 8.079103562205468e-05, + "loss": 0.0693, + "step": 58390 + }, + { + "epoch": 3.8207392868825645, + "grad_norm": 0.77364581823349, + "learning_rate": 8.078379762330843e-05, + "loss": 0.0666, + "step": 58400 + }, + { + "epoch": 3.8213935230618254, + "grad_norm": 0.8866234421730042, + "learning_rate": 8.07765585855283e-05, + "loss": 0.0706, + "step": 58410 + }, + { + "epoch": 3.822047759241086, + "grad_norm": 0.7510147094726562, + "learning_rate": 8.076931850895859e-05, + "loss": 0.0679, + "step": 58420 + }, + { + "epoch": 3.8227019954203465, + "grad_norm": 0.9007807374000549, + "learning_rate": 8.076207739384368e-05, + "loss": 0.073, + "step": 58430 + }, + { + "epoch": 3.8233562315996075, + "grad_norm": 0.8261735439300537, + "learning_rate": 8.075483524042797e-05, + "loss": 0.0709, + "step": 58440 + }, + { + "epoch": 3.8240104677788684, + "grad_norm": 0.7386045455932617, + "learning_rate": 8.074759204895593e-05, + "loss": 0.0606, + "step": 58450 + }, + { + "epoch": 3.824664703958129, + "grad_norm": 1.0054618120193481, + "learning_rate": 8.0740347819672e-05, + "loss": 0.0652, + "step": 58460 + }, + { + "epoch": 3.8253189401373895, + "grad_norm": 0.8960070610046387, + "learning_rate": 8.073310255282074e-05, + "loss": 0.0707, + "step": 58470 + }, + { + "epoch": 3.8259731763166505, + "grad_norm": 0.8370187282562256, + "learning_rate": 8.072585624864666e-05, + "loss": 0.0668, + "step": 58480 + }, + { + "epoch": 3.826627412495911, + "grad_norm": 0.9938347935676575, + "learning_rate": 8.071860890739435e-05, + "loss": 0.0786, + "step": 58490 + }, + { + "epoch": 3.8272816486751715, + "grad_norm": 1.007035732269287, + "learning_rate": 8.071136052930843e-05, + "loss": 0.0697, + "step": 58500 + }, + { + "epoch": 3.8279358848544325, + "grad_norm": 0.86492520570755, + "learning_rate": 8.070411111463353e-05, + "loss": 0.0749, + "step": 58510 + }, + { + "epoch": 3.828590121033693, + "grad_norm": 0.7350853085517883, + "learning_rate": 8.069686066361437e-05, + "loss": 0.0648, + "step": 58520 + }, + { + "epoch": 3.829244357212954, + "grad_norm": 1.0213762521743774, + "learning_rate": 8.068960917649566e-05, + "loss": 0.0731, + "step": 58530 + }, + { + "epoch": 3.8298985933922145, + "grad_norm": 0.8180403113365173, + "learning_rate": 8.068235665352214e-05, + "loss": 0.0651, + "step": 58540 + }, + { + "epoch": 3.8305528295714755, + "grad_norm": 0.8237452507019043, + "learning_rate": 8.067510309493861e-05, + "loss": 0.0702, + "step": 58550 + }, + { + "epoch": 3.831207065750736, + "grad_norm": 0.7584102153778076, + "learning_rate": 8.066784850098992e-05, + "loss": 0.062, + "step": 58560 + }, + { + "epoch": 3.8318613019299965, + "grad_norm": 1.0070369243621826, + "learning_rate": 8.066059287192091e-05, + "loss": 0.0657, + "step": 58570 + }, + { + "epoch": 3.8325155381092575, + "grad_norm": 1.0255883932113647, + "learning_rate": 8.065333620797649e-05, + "loss": 0.0699, + "step": 58580 + }, + { + "epoch": 3.833169774288518, + "grad_norm": 0.8956732153892517, + "learning_rate": 8.064607850940156e-05, + "loss": 0.0733, + "step": 58590 + }, + { + "epoch": 3.833824010467779, + "grad_norm": 0.8609063625335693, + "learning_rate": 8.063881977644112e-05, + "loss": 0.0692, + "step": 58600 + }, + { + "epoch": 3.8344782466470395, + "grad_norm": 1.1151846647262573, + "learning_rate": 8.063156000934016e-05, + "loss": 0.0674, + "step": 58610 + }, + { + "epoch": 3.8351324828263005, + "grad_norm": 0.8186620473861694, + "learning_rate": 8.062429920834372e-05, + "loss": 0.0711, + "step": 58620 + }, + { + "epoch": 3.835786719005561, + "grad_norm": 0.8908758759498596, + "learning_rate": 8.061703737369686e-05, + "loss": 0.0635, + "step": 58630 + }, + { + "epoch": 3.8364409551848215, + "grad_norm": 1.0307601690292358, + "learning_rate": 8.060977450564469e-05, + "loss": 0.0708, + "step": 58640 + }, + { + "epoch": 3.8370951913640825, + "grad_norm": 0.8627874851226807, + "learning_rate": 8.060251060443236e-05, + "loss": 0.0646, + "step": 58650 + }, + { + "epoch": 3.837749427543343, + "grad_norm": 0.9624312520027161, + "learning_rate": 8.059524567030503e-05, + "loss": 0.0685, + "step": 58660 + }, + { + "epoch": 3.838403663722604, + "grad_norm": 0.8009755611419678, + "learning_rate": 8.058797970350793e-05, + "loss": 0.064, + "step": 58670 + }, + { + "epoch": 3.8390578999018645, + "grad_norm": 0.9126613140106201, + "learning_rate": 8.058071270428628e-05, + "loss": 0.0725, + "step": 58680 + }, + { + "epoch": 3.8397121360811255, + "grad_norm": 0.7582046389579773, + "learning_rate": 8.057344467288539e-05, + "loss": 0.0622, + "step": 58690 + }, + { + "epoch": 3.840366372260386, + "grad_norm": 0.7236906886100769, + "learning_rate": 8.056617560955056e-05, + "loss": 0.0658, + "step": 58700 + }, + { + "epoch": 3.8410206084396465, + "grad_norm": 0.7379127144813538, + "learning_rate": 8.055890551452714e-05, + "loss": 0.0655, + "step": 58710 + }, + { + "epoch": 3.8416748446189075, + "grad_norm": 0.8637368679046631, + "learning_rate": 8.055163438806051e-05, + "loss": 0.0803, + "step": 58720 + }, + { + "epoch": 3.842329080798168, + "grad_norm": 0.9424805045127869, + "learning_rate": 8.05443622303961e-05, + "loss": 0.0723, + "step": 58730 + }, + { + "epoch": 3.842983316977429, + "grad_norm": 0.9135764241218567, + "learning_rate": 8.053708904177934e-05, + "loss": 0.0717, + "step": 58740 + }, + { + "epoch": 3.8436375531566895, + "grad_norm": 0.7317261695861816, + "learning_rate": 8.052981482245577e-05, + "loss": 0.0679, + "step": 58750 + }, + { + "epoch": 3.8442917893359505, + "grad_norm": 1.1053870916366577, + "learning_rate": 8.052253957267086e-05, + "loss": 0.0699, + "step": 58760 + }, + { + "epoch": 3.844946025515211, + "grad_norm": 1.033487319946289, + "learning_rate": 8.05152632926702e-05, + "loss": 0.0776, + "step": 58770 + }, + { + "epoch": 3.8456002616944716, + "grad_norm": 1.0361863374710083, + "learning_rate": 8.050798598269937e-05, + "loss": 0.0778, + "step": 58780 + }, + { + "epoch": 3.8462544978737325, + "grad_norm": 0.8072547912597656, + "learning_rate": 8.050070764300401e-05, + "loss": 0.0695, + "step": 58790 + }, + { + "epoch": 3.846908734052993, + "grad_norm": 0.7639461159706116, + "learning_rate": 8.049342827382977e-05, + "loss": 0.0718, + "step": 58800 + }, + { + "epoch": 3.8475629702322536, + "grad_norm": 0.9850027561187744, + "learning_rate": 8.048614787542234e-05, + "loss": 0.0647, + "step": 58810 + }, + { + "epoch": 3.8482172064115145, + "grad_norm": 0.8612027168273926, + "learning_rate": 8.047886644802749e-05, + "loss": 0.0667, + "step": 58820 + }, + { + "epoch": 3.8488714425907755, + "grad_norm": 0.7390583157539368, + "learning_rate": 8.047158399189096e-05, + "loss": 0.0606, + "step": 58830 + }, + { + "epoch": 3.849525678770036, + "grad_norm": 0.8510991930961609, + "learning_rate": 8.046430050725854e-05, + "loss": 0.0688, + "step": 58840 + }, + { + "epoch": 3.8501799149492966, + "grad_norm": 0.8368331789970398, + "learning_rate": 8.045701599437609e-05, + "loss": 0.0683, + "step": 58850 + }, + { + "epoch": 3.8508341511285575, + "grad_norm": 1.0364656448364258, + "learning_rate": 8.044973045348949e-05, + "loss": 0.0755, + "step": 58860 + }, + { + "epoch": 3.851488387307818, + "grad_norm": 0.7408995628356934, + "learning_rate": 8.044244388484463e-05, + "loss": 0.0695, + "step": 58870 + }, + { + "epoch": 3.8521426234870786, + "grad_norm": 0.7264641523361206, + "learning_rate": 8.043515628868743e-05, + "loss": 0.0771, + "step": 58880 + }, + { + "epoch": 3.8527968596663396, + "grad_norm": 0.8603324294090271, + "learning_rate": 8.042786766526389e-05, + "loss": 0.0619, + "step": 58890 + }, + { + "epoch": 3.8534510958456005, + "grad_norm": 1.0376695394515991, + "learning_rate": 8.042057801482001e-05, + "loss": 0.0681, + "step": 58900 + }, + { + "epoch": 3.854105332024861, + "grad_norm": 0.7530226111412048, + "learning_rate": 8.041328733760185e-05, + "loss": 0.0709, + "step": 58910 + }, + { + "epoch": 3.8547595682041216, + "grad_norm": 0.8410870432853699, + "learning_rate": 8.040599563385548e-05, + "loss": 0.0628, + "step": 58920 + }, + { + "epoch": 3.8554138043833825, + "grad_norm": 1.0103882551193237, + "learning_rate": 8.039870290382703e-05, + "loss": 0.0743, + "step": 58930 + }, + { + "epoch": 3.856068040562643, + "grad_norm": 0.925134539604187, + "learning_rate": 8.039140914776262e-05, + "loss": 0.0767, + "step": 58940 + }, + { + "epoch": 3.8567222767419036, + "grad_norm": 0.9100943803787231, + "learning_rate": 8.038411436590845e-05, + "loss": 0.0697, + "step": 58950 + }, + { + "epoch": 3.8573765129211646, + "grad_norm": 0.9153822660446167, + "learning_rate": 8.037681855851072e-05, + "loss": 0.0622, + "step": 58960 + }, + { + "epoch": 3.858030749100425, + "grad_norm": 0.9018275737762451, + "learning_rate": 8.036952172581571e-05, + "loss": 0.0669, + "step": 58970 + }, + { + "epoch": 3.858684985279686, + "grad_norm": 0.9553847908973694, + "learning_rate": 8.03622238680697e-05, + "loss": 0.0738, + "step": 58980 + }, + { + "epoch": 3.8593392214589466, + "grad_norm": 0.8879307508468628, + "learning_rate": 8.0354924985519e-05, + "loss": 0.0707, + "step": 58990 + }, + { + "epoch": 3.8599934576382076, + "grad_norm": 0.8077122569084167, + "learning_rate": 8.034762507840997e-05, + "loss": 0.0819, + "step": 59000 + }, + { + "epoch": 3.860647693817468, + "grad_norm": 0.7566487193107605, + "learning_rate": 8.034032414698901e-05, + "loss": 0.0752, + "step": 59010 + }, + { + "epoch": 3.8613019299967286, + "grad_norm": 0.8122360706329346, + "learning_rate": 8.033302219150253e-05, + "loss": 0.0715, + "step": 59020 + }, + { + "epoch": 3.8619561661759896, + "grad_norm": 1.0947107076644897, + "learning_rate": 8.0325719212197e-05, + "loss": 0.0641, + "step": 59030 + }, + { + "epoch": 3.86261040235525, + "grad_norm": 0.782038688659668, + "learning_rate": 8.031841520931893e-05, + "loss": 0.0608, + "step": 59040 + }, + { + "epoch": 3.863264638534511, + "grad_norm": 0.7737072706222534, + "learning_rate": 8.031111018311483e-05, + "loss": 0.0664, + "step": 59050 + }, + { + "epoch": 3.8639188747137716, + "grad_norm": 0.8657538890838623, + "learning_rate": 8.030380413383125e-05, + "loss": 0.0669, + "step": 59060 + }, + { + "epoch": 3.8645731108930326, + "grad_norm": 0.8043084144592285, + "learning_rate": 8.029649706171483e-05, + "loss": 0.0719, + "step": 59070 + }, + { + "epoch": 3.865227347072293, + "grad_norm": 0.7678789496421814, + "learning_rate": 8.028918896701217e-05, + "loss": 0.0679, + "step": 59080 + }, + { + "epoch": 3.8658815832515536, + "grad_norm": 0.935185432434082, + "learning_rate": 8.028187984996993e-05, + "loss": 0.0701, + "step": 59090 + }, + { + "epoch": 3.8665358194308146, + "grad_norm": 0.9353100061416626, + "learning_rate": 8.027456971083485e-05, + "loss": 0.0636, + "step": 59100 + }, + { + "epoch": 3.867190055610075, + "grad_norm": 0.8760929107666016, + "learning_rate": 8.026725854985363e-05, + "loss": 0.0776, + "step": 59110 + }, + { + "epoch": 3.867844291789336, + "grad_norm": 0.9680326581001282, + "learning_rate": 8.025994636727306e-05, + "loss": 0.0749, + "step": 59120 + }, + { + "epoch": 3.8684985279685966, + "grad_norm": 0.6920056343078613, + "learning_rate": 8.025263316333994e-05, + "loss": 0.0715, + "step": 59130 + }, + { + "epoch": 3.8691527641478576, + "grad_norm": 0.9525841474533081, + "learning_rate": 8.024531893830112e-05, + "loss": 0.0673, + "step": 59140 + }, + { + "epoch": 3.869807000327118, + "grad_norm": 0.8676741719245911, + "learning_rate": 8.023800369240344e-05, + "loss": 0.0744, + "step": 59150 + }, + { + "epoch": 3.8704612365063786, + "grad_norm": 0.8752907514572144, + "learning_rate": 8.023068742589386e-05, + "loss": 0.0646, + "step": 59160 + }, + { + "epoch": 3.8711154726856396, + "grad_norm": 0.9634379744529724, + "learning_rate": 8.022337013901928e-05, + "loss": 0.0594, + "step": 59170 + }, + { + "epoch": 3.8717697088649, + "grad_norm": 0.7896409034729004, + "learning_rate": 8.02160518320267e-05, + "loss": 0.072, + "step": 59180 + }, + { + "epoch": 3.872423945044161, + "grad_norm": 0.8738341927528381, + "learning_rate": 8.020873250516312e-05, + "loss": 0.0677, + "step": 59190 + }, + { + "epoch": 3.8730781812234216, + "grad_norm": 0.8021124601364136, + "learning_rate": 8.02014121586756e-05, + "loss": 0.0841, + "step": 59200 + }, + { + "epoch": 3.8737324174026826, + "grad_norm": 0.9408907890319824, + "learning_rate": 8.019409079281122e-05, + "loss": 0.0625, + "step": 59210 + }, + { + "epoch": 3.874386653581943, + "grad_norm": 0.8680176138877869, + "learning_rate": 8.018676840781707e-05, + "loss": 0.0702, + "step": 59220 + }, + { + "epoch": 3.8750408897612036, + "grad_norm": 0.6961979269981384, + "learning_rate": 8.017944500394033e-05, + "loss": 0.0578, + "step": 59230 + }, + { + "epoch": 3.8756951259404646, + "grad_norm": 0.8254890441894531, + "learning_rate": 8.017212058142817e-05, + "loss": 0.0717, + "step": 59240 + }, + { + "epoch": 3.876349362119725, + "grad_norm": 1.0662809610366821, + "learning_rate": 8.016479514052783e-05, + "loss": 0.0675, + "step": 59250 + }, + { + "epoch": 3.8770035982989857, + "grad_norm": 0.8766838908195496, + "learning_rate": 8.015746868148651e-05, + "loss": 0.0688, + "step": 59260 + }, + { + "epoch": 3.8776578344782466, + "grad_norm": 0.8927856683731079, + "learning_rate": 8.015014120455156e-05, + "loss": 0.0628, + "step": 59270 + }, + { + "epoch": 3.8783120706575076, + "grad_norm": 0.8246244192123413, + "learning_rate": 8.014281270997026e-05, + "loss": 0.0691, + "step": 59280 + }, + { + "epoch": 3.878966306836768, + "grad_norm": 0.9650526642799377, + "learning_rate": 8.013548319798998e-05, + "loss": 0.0771, + "step": 59290 + }, + { + "epoch": 3.8796205430160287, + "grad_norm": 0.6766818165779114, + "learning_rate": 8.012815266885811e-05, + "loss": 0.0567, + "step": 59300 + }, + { + "epoch": 3.8802747791952896, + "grad_norm": 0.8025507926940918, + "learning_rate": 8.012082112282207e-05, + "loss": 0.0614, + "step": 59310 + }, + { + "epoch": 3.88092901537455, + "grad_norm": 0.8738349080085754, + "learning_rate": 8.011348856012932e-05, + "loss": 0.0754, + "step": 59320 + }, + { + "epoch": 3.8815832515538107, + "grad_norm": 0.819463312625885, + "learning_rate": 8.010615498102736e-05, + "loss": 0.0634, + "step": 59330 + }, + { + "epoch": 3.8822374877330716, + "grad_norm": 0.7592009902000427, + "learning_rate": 8.009882038576371e-05, + "loss": 0.0649, + "step": 59340 + }, + { + "epoch": 3.8828917239123326, + "grad_norm": 1.0910615921020508, + "learning_rate": 8.009148477458594e-05, + "loss": 0.0777, + "step": 59350 + }, + { + "epoch": 3.883545960091593, + "grad_norm": 0.8218048214912415, + "learning_rate": 8.008414814774163e-05, + "loss": 0.064, + "step": 59360 + }, + { + "epoch": 3.8842001962708537, + "grad_norm": 0.8546844720840454, + "learning_rate": 8.007681050547844e-05, + "loss": 0.0787, + "step": 59370 + }, + { + "epoch": 3.8848544324501146, + "grad_norm": 0.8277244567871094, + "learning_rate": 8.0069471848044e-05, + "loss": 0.0666, + "step": 59380 + }, + { + "epoch": 3.885508668629375, + "grad_norm": 0.7746628522872925, + "learning_rate": 8.006213217568604e-05, + "loss": 0.0641, + "step": 59390 + }, + { + "epoch": 3.8861629048086357, + "grad_norm": 1.0281668901443481, + "learning_rate": 8.005479148865226e-05, + "loss": 0.0729, + "step": 59400 + }, + { + "epoch": 3.8868171409878967, + "grad_norm": 0.7792064547538757, + "learning_rate": 8.004744978719046e-05, + "loss": 0.0722, + "step": 59410 + }, + { + "epoch": 3.887471377167157, + "grad_norm": 0.7201108336448669, + "learning_rate": 8.004010707154843e-05, + "loss": 0.0694, + "step": 59420 + }, + { + "epoch": 3.888125613346418, + "grad_norm": 0.8858181834220886, + "learning_rate": 8.003276334197399e-05, + "loss": 0.0624, + "step": 59430 + }, + { + "epoch": 3.8887798495256787, + "grad_norm": 0.8069655895233154, + "learning_rate": 8.002541859871502e-05, + "loss": 0.0636, + "step": 59440 + }, + { + "epoch": 3.8894340857049396, + "grad_norm": 0.812025249004364, + "learning_rate": 8.001807284201944e-05, + "loss": 0.0638, + "step": 59450 + }, + { + "epoch": 3.8900883218842, + "grad_norm": 0.8210043907165527, + "learning_rate": 8.001072607213518e-05, + "loss": 0.0755, + "step": 59460 + }, + { + "epoch": 3.8907425580634607, + "grad_norm": 0.8998602032661438, + "learning_rate": 8.000337828931021e-05, + "loss": 0.0702, + "step": 59470 + }, + { + "epoch": 3.8913967942427217, + "grad_norm": 0.7417557239532471, + "learning_rate": 7.999602949379252e-05, + "loss": 0.0666, + "step": 59480 + }, + { + "epoch": 3.892051030421982, + "grad_norm": 0.778843104839325, + "learning_rate": 7.998867968583018e-05, + "loss": 0.0689, + "step": 59490 + }, + { + "epoch": 3.892705266601243, + "grad_norm": 0.7285590767860413, + "learning_rate": 7.998132886567125e-05, + "loss": 0.0667, + "step": 59500 + }, + { + "epoch": 3.8933595027805037, + "grad_norm": 0.8084492683410645, + "learning_rate": 7.997397703356384e-05, + "loss": 0.0588, + "step": 59510 + }, + { + "epoch": 3.8940137389597647, + "grad_norm": 0.8009549379348755, + "learning_rate": 7.996662418975609e-05, + "loss": 0.0634, + "step": 59520 + }, + { + "epoch": 3.894667975139025, + "grad_norm": 0.8678660988807678, + "learning_rate": 7.99592703344962e-05, + "loss": 0.0692, + "step": 59530 + }, + { + "epoch": 3.8953222113182857, + "grad_norm": 1.06533682346344, + "learning_rate": 7.995191546803235e-05, + "loss": 0.0712, + "step": 59540 + }, + { + "epoch": 3.8959764474975467, + "grad_norm": 1.0511845350265503, + "learning_rate": 7.99445595906128e-05, + "loss": 0.0764, + "step": 59550 + }, + { + "epoch": 3.896630683676807, + "grad_norm": 0.9604825973510742, + "learning_rate": 7.993720270248584e-05, + "loss": 0.0743, + "step": 59560 + }, + { + "epoch": 3.897284919856068, + "grad_norm": 0.9758427739143372, + "learning_rate": 7.992984480389977e-05, + "loss": 0.0586, + "step": 59570 + }, + { + "epoch": 3.8979391560353287, + "grad_norm": 0.862629771232605, + "learning_rate": 7.992248589510293e-05, + "loss": 0.0597, + "step": 59580 + }, + { + "epoch": 3.8985933922145897, + "grad_norm": 0.876379132270813, + "learning_rate": 7.991512597634375e-05, + "loss": 0.0651, + "step": 59590 + }, + { + "epoch": 3.89924762839385, + "grad_norm": 1.1144251823425293, + "learning_rate": 7.990776504787059e-05, + "loss": 0.0755, + "step": 59600 + }, + { + "epoch": 3.8999018645731107, + "grad_norm": 0.9200652837753296, + "learning_rate": 7.990040310993193e-05, + "loss": 0.058, + "step": 59610 + }, + { + "epoch": 3.9005561007523717, + "grad_norm": 0.8671579360961914, + "learning_rate": 7.989304016277625e-05, + "loss": 0.0728, + "step": 59620 + }, + { + "epoch": 3.901210336931632, + "grad_norm": 0.8644618391990662, + "learning_rate": 7.988567620665206e-05, + "loss": 0.0712, + "step": 59630 + }, + { + "epoch": 3.901864573110893, + "grad_norm": 1.062548279762268, + "learning_rate": 7.987831124180792e-05, + "loss": 0.0605, + "step": 59640 + }, + { + "epoch": 3.9025188092901537, + "grad_norm": 0.7029675841331482, + "learning_rate": 7.987094526849242e-05, + "loss": 0.0605, + "step": 59650 + }, + { + "epoch": 3.9031730454694147, + "grad_norm": 0.7304003834724426, + "learning_rate": 7.986357828695419e-05, + "loss": 0.0624, + "step": 59660 + }, + { + "epoch": 3.903827281648675, + "grad_norm": 0.8633514046669006, + "learning_rate": 7.985621029744186e-05, + "loss": 0.074, + "step": 59670 + }, + { + "epoch": 3.9044815178279357, + "grad_norm": 1.0137802362442017, + "learning_rate": 7.984884130020414e-05, + "loss": 0.0597, + "step": 59680 + }, + { + "epoch": 3.9051357540071967, + "grad_norm": 1.002747893333435, + "learning_rate": 7.984147129548973e-05, + "loss": 0.0765, + "step": 59690 + }, + { + "epoch": 3.9057899901864572, + "grad_norm": 0.7557197213172913, + "learning_rate": 7.983410028354741e-05, + "loss": 0.0758, + "step": 59700 + }, + { + "epoch": 3.9064442263657178, + "grad_norm": 1.0572227239608765, + "learning_rate": 7.982672826462595e-05, + "loss": 0.0619, + "step": 59710 + }, + { + "epoch": 3.9070984625449787, + "grad_norm": 0.8649874925613403, + "learning_rate": 7.981935523897421e-05, + "loss": 0.0617, + "step": 59720 + }, + { + "epoch": 3.9077526987242397, + "grad_norm": 0.711691677570343, + "learning_rate": 7.981198120684101e-05, + "loss": 0.066, + "step": 59730 + }, + { + "epoch": 3.9084069349035, + "grad_norm": 0.9565288424491882, + "learning_rate": 7.980460616847527e-05, + "loss": 0.0708, + "step": 59740 + }, + { + "epoch": 3.9090611710827607, + "grad_norm": 0.9531120657920837, + "learning_rate": 7.97972301241259e-05, + "loss": 0.0662, + "step": 59750 + }, + { + "epoch": 3.9097154072620217, + "grad_norm": 0.8800640106201172, + "learning_rate": 7.978985307404187e-05, + "loss": 0.0723, + "step": 59760 + }, + { + "epoch": 3.9103696434412822, + "grad_norm": 0.7907775044441223, + "learning_rate": 7.978247501847216e-05, + "loss": 0.0713, + "step": 59770 + }, + { + "epoch": 3.9110238796205428, + "grad_norm": 0.8069142699241638, + "learning_rate": 7.977509595766583e-05, + "loss": 0.0609, + "step": 59780 + }, + { + "epoch": 3.9116781157998037, + "grad_norm": 0.9594196677207947, + "learning_rate": 7.976771589187193e-05, + "loss": 0.0742, + "step": 59790 + }, + { + "epoch": 3.9123323519790647, + "grad_norm": 0.9948168992996216, + "learning_rate": 7.976033482133953e-05, + "loss": 0.0713, + "step": 59800 + }, + { + "epoch": 3.9129865881583252, + "grad_norm": 1.1436748504638672, + "learning_rate": 7.975295274631777e-05, + "loss": 0.0675, + "step": 59810 + }, + { + "epoch": 3.9136408243375858, + "grad_norm": 0.8440989255905151, + "learning_rate": 7.974556966705584e-05, + "loss": 0.0783, + "step": 59820 + }, + { + "epoch": 3.9142950605168467, + "grad_norm": 0.7184342741966248, + "learning_rate": 7.973818558380294e-05, + "loss": 0.0633, + "step": 59830 + }, + { + "epoch": 3.9149492966961073, + "grad_norm": 0.8069654703140259, + "learning_rate": 7.973080049680825e-05, + "loss": 0.0722, + "step": 59840 + }, + { + "epoch": 3.9156035328753678, + "grad_norm": 1.0971922874450684, + "learning_rate": 7.972341440632109e-05, + "loss": 0.0677, + "step": 59850 + }, + { + "epoch": 3.9162577690546287, + "grad_norm": 0.946134626865387, + "learning_rate": 7.971602731259075e-05, + "loss": 0.0721, + "step": 59860 + }, + { + "epoch": 3.9169120052338893, + "grad_norm": 0.9672790765762329, + "learning_rate": 7.970863921586655e-05, + "loss": 0.0707, + "step": 59870 + }, + { + "epoch": 3.9175662414131502, + "grad_norm": 1.017909288406372, + "learning_rate": 7.970125011639786e-05, + "loss": 0.0661, + "step": 59880 + }, + { + "epoch": 3.9182204775924108, + "grad_norm": 0.9773590564727783, + "learning_rate": 7.969386001443408e-05, + "loss": 0.0664, + "step": 59890 + }, + { + "epoch": 3.9188747137716717, + "grad_norm": 1.026491403579712, + "learning_rate": 7.968646891022466e-05, + "loss": 0.0707, + "step": 59900 + }, + { + "epoch": 3.9195289499509323, + "grad_norm": 0.9397926926612854, + "learning_rate": 7.967907680401904e-05, + "loss": 0.0628, + "step": 59910 + }, + { + "epoch": 3.920183186130193, + "grad_norm": 0.814552366733551, + "learning_rate": 7.967168369606676e-05, + "loss": 0.0702, + "step": 59920 + }, + { + "epoch": 3.9208374223094538, + "grad_norm": 1.042052149772644, + "learning_rate": 7.966428958661734e-05, + "loss": 0.0678, + "step": 59930 + }, + { + "epoch": 3.9214916584887143, + "grad_norm": 0.8458235859870911, + "learning_rate": 7.965689447592035e-05, + "loss": 0.0752, + "step": 59940 + }, + { + "epoch": 3.9221458946679753, + "grad_norm": 0.748357355594635, + "learning_rate": 7.964949836422537e-05, + "loss": 0.0692, + "step": 59950 + }, + { + "epoch": 3.9228001308472358, + "grad_norm": 0.9684262275695801, + "learning_rate": 7.964210125178209e-05, + "loss": 0.0646, + "step": 59960 + }, + { + "epoch": 3.9234543670264967, + "grad_norm": 1.0138685703277588, + "learning_rate": 7.963470313884011e-05, + "loss": 0.0704, + "step": 59970 + }, + { + "epoch": 3.9241086032057573, + "grad_norm": 0.9098789095878601, + "learning_rate": 7.962730402564924e-05, + "loss": 0.0663, + "step": 59980 + }, + { + "epoch": 3.924762839385018, + "grad_norm": 0.9759104251861572, + "learning_rate": 7.961990391245911e-05, + "loss": 0.0704, + "step": 59990 + }, + { + "epoch": 3.9254170755642788, + "grad_norm": 0.8418338894844055, + "learning_rate": 7.961250279951956e-05, + "loss": 0.066, + "step": 60000 + }, + { + "epoch": 3.9260713117435393, + "grad_norm": 0.8269692659378052, + "learning_rate": 7.960510068708039e-05, + "loss": 0.0734, + "step": 60010 + }, + { + "epoch": 3.9267255479228003, + "grad_norm": 1.259716510772705, + "learning_rate": 7.959769757539142e-05, + "loss": 0.0765, + "step": 60020 + }, + { + "epoch": 3.927379784102061, + "grad_norm": 0.7313357591629028, + "learning_rate": 7.959029346470252e-05, + "loss": 0.0689, + "step": 60030 + }, + { + "epoch": 3.9280340202813218, + "grad_norm": 1.043642282485962, + "learning_rate": 7.958288835526362e-05, + "loss": 0.0626, + "step": 60040 + }, + { + "epoch": 3.9286882564605823, + "grad_norm": 1.0208543539047241, + "learning_rate": 7.957548224732467e-05, + "loss": 0.0666, + "step": 60050 + }, + { + "epoch": 3.929342492639843, + "grad_norm": 0.9676765203475952, + "learning_rate": 7.956807514113562e-05, + "loss": 0.0722, + "step": 60060 + }, + { + "epoch": 3.929996728819104, + "grad_norm": 0.8631986975669861, + "learning_rate": 7.956066703694647e-05, + "loss": 0.0663, + "step": 60070 + }, + { + "epoch": 3.9306509649983643, + "grad_norm": 0.9647482633590698, + "learning_rate": 7.95532579350073e-05, + "loss": 0.064, + "step": 60080 + }, + { + "epoch": 3.9313052011776253, + "grad_norm": 0.7296038866043091, + "learning_rate": 7.954584783556818e-05, + "loss": 0.0616, + "step": 60090 + }, + { + "epoch": 3.931959437356886, + "grad_norm": 0.8985334038734436, + "learning_rate": 7.953843673887919e-05, + "loss": 0.0698, + "step": 60100 + }, + { + "epoch": 3.9326136735361468, + "grad_norm": 0.88104248046875, + "learning_rate": 7.953102464519049e-05, + "loss": 0.0714, + "step": 60110 + }, + { + "epoch": 3.9332679097154073, + "grad_norm": 0.8251603245735168, + "learning_rate": 7.952361155475228e-05, + "loss": 0.0693, + "step": 60120 + }, + { + "epoch": 3.933922145894668, + "grad_norm": 0.8818602561950684, + "learning_rate": 7.951619746781474e-05, + "loss": 0.0675, + "step": 60130 + }, + { + "epoch": 3.934576382073929, + "grad_norm": 0.7289047837257385, + "learning_rate": 7.950878238462812e-05, + "loss": 0.0592, + "step": 60140 + }, + { + "epoch": 3.9352306182531893, + "grad_norm": 1.0739281177520752, + "learning_rate": 7.950136630544272e-05, + "loss": 0.0726, + "step": 60150 + }, + { + "epoch": 3.93588485443245, + "grad_norm": 0.9551029801368713, + "learning_rate": 7.949394923050882e-05, + "loss": 0.0606, + "step": 60160 + }, + { + "epoch": 3.936539090611711, + "grad_norm": 0.7134016752243042, + "learning_rate": 7.94865311600768e-05, + "loss": 0.0611, + "step": 60170 + }, + { + "epoch": 3.937193326790972, + "grad_norm": 0.8816996216773987, + "learning_rate": 7.9479112094397e-05, + "loss": 0.0675, + "step": 60180 + }, + { + "epoch": 3.9378475629702323, + "grad_norm": 1.0640590190887451, + "learning_rate": 7.947169203371986e-05, + "loss": 0.0742, + "step": 60190 + }, + { + "epoch": 3.938501799149493, + "grad_norm": 0.8332981467247009, + "learning_rate": 7.946427097829584e-05, + "loss": 0.0756, + "step": 60200 + }, + { + "epoch": 3.939156035328754, + "grad_norm": 0.9748693704605103, + "learning_rate": 7.94568489283754e-05, + "loss": 0.066, + "step": 60210 + }, + { + "epoch": 3.9398102715080143, + "grad_norm": 0.9146521687507629, + "learning_rate": 7.944942588420903e-05, + "loss": 0.0652, + "step": 60220 + }, + { + "epoch": 3.940464507687275, + "grad_norm": 0.869734525680542, + "learning_rate": 7.944200184604732e-05, + "loss": 0.0743, + "step": 60230 + }, + { + "epoch": 3.941118743866536, + "grad_norm": 1.1516258716583252, + "learning_rate": 7.943457681414084e-05, + "loss": 0.0722, + "step": 60240 + }, + { + "epoch": 3.941772980045797, + "grad_norm": 0.8145464658737183, + "learning_rate": 7.942715078874019e-05, + "loss": 0.0613, + "step": 60250 + }, + { + "epoch": 3.9424272162250573, + "grad_norm": 1.0588626861572266, + "learning_rate": 7.941972377009601e-05, + "loss": 0.0701, + "step": 60260 + }, + { + "epoch": 3.943081452404318, + "grad_norm": 0.8844826221466064, + "learning_rate": 7.941229575845903e-05, + "loss": 0.0676, + "step": 60270 + }, + { + "epoch": 3.943735688583579, + "grad_norm": 0.7616518139839172, + "learning_rate": 7.94048667540799e-05, + "loss": 0.0708, + "step": 60280 + }, + { + "epoch": 3.9443899247628393, + "grad_norm": 1.0263601541519165, + "learning_rate": 7.939743675720942e-05, + "loss": 0.0709, + "step": 60290 + }, + { + "epoch": 3.9450441609421, + "grad_norm": 0.725226640701294, + "learning_rate": 7.939000576809834e-05, + "loss": 0.0628, + "step": 60300 + }, + { + "epoch": 3.945698397121361, + "grad_norm": 0.8320260047912598, + "learning_rate": 7.93825737869975e-05, + "loss": 0.0657, + "step": 60310 + }, + { + "epoch": 3.9463526333006214, + "grad_norm": 0.9669265747070312, + "learning_rate": 7.937514081415773e-05, + "loss": 0.0732, + "step": 60320 + }, + { + "epoch": 3.9470068694798823, + "grad_norm": 1.055615782737732, + "learning_rate": 7.936770684982992e-05, + "loss": 0.0672, + "step": 60330 + }, + { + "epoch": 3.947661105659143, + "grad_norm": 0.8739088177680969, + "learning_rate": 7.936027189426497e-05, + "loss": 0.069, + "step": 60340 + }, + { + "epoch": 3.948315341838404, + "grad_norm": 1.0369230508804321, + "learning_rate": 7.935283594771385e-05, + "loss": 0.0676, + "step": 60350 + }, + { + "epoch": 3.9489695780176644, + "grad_norm": 0.9298377633094788, + "learning_rate": 7.934539901042754e-05, + "loss": 0.0865, + "step": 60360 + }, + { + "epoch": 3.949623814196925, + "grad_norm": 1.1504807472229004, + "learning_rate": 7.933796108265705e-05, + "loss": 0.06, + "step": 60370 + }, + { + "epoch": 3.950278050376186, + "grad_norm": 0.8967733383178711, + "learning_rate": 7.933052216465345e-05, + "loss": 0.0642, + "step": 60380 + }, + { + "epoch": 3.9509322865554464, + "grad_norm": 0.9686173796653748, + "learning_rate": 7.932308225666779e-05, + "loss": 0.071, + "step": 60390 + }, + { + "epoch": 3.9515865227347073, + "grad_norm": 1.0625016689300537, + "learning_rate": 7.93156413589512e-05, + "loss": 0.065, + "step": 60400 + }, + { + "epoch": 3.952240758913968, + "grad_norm": 0.830640971660614, + "learning_rate": 7.930819947175484e-05, + "loss": 0.0665, + "step": 60410 + }, + { + "epoch": 3.952894995093229, + "grad_norm": 0.9542146921157837, + "learning_rate": 7.930075659532987e-05, + "loss": 0.0593, + "step": 60420 + }, + { + "epoch": 3.9535492312724894, + "grad_norm": 0.8056615591049194, + "learning_rate": 7.929331272992753e-05, + "loss": 0.0647, + "step": 60430 + }, + { + "epoch": 3.95420346745175, + "grad_norm": 0.9513580203056335, + "learning_rate": 7.928586787579904e-05, + "loss": 0.0679, + "step": 60440 + }, + { + "epoch": 3.954857703631011, + "grad_norm": 0.8633571267127991, + "learning_rate": 7.927842203319573e-05, + "loss": 0.0692, + "step": 60450 + }, + { + "epoch": 3.9555119398102714, + "grad_norm": 1.0166418552398682, + "learning_rate": 7.927097520236888e-05, + "loss": 0.0779, + "step": 60460 + }, + { + "epoch": 3.9561661759895324, + "grad_norm": 0.9727340340614319, + "learning_rate": 7.926352738356986e-05, + "loss": 0.0689, + "step": 60470 + }, + { + "epoch": 3.956820412168793, + "grad_norm": 1.02824866771698, + "learning_rate": 7.925607857705003e-05, + "loss": 0.0735, + "step": 60480 + }, + { + "epoch": 3.957474648348054, + "grad_norm": 0.790596604347229, + "learning_rate": 7.924862878306083e-05, + "loss": 0.0734, + "step": 60490 + }, + { + "epoch": 3.9581288845273144, + "grad_norm": 1.0034263134002686, + "learning_rate": 7.924117800185372e-05, + "loss": 0.0716, + "step": 60500 + }, + { + "epoch": 3.958783120706575, + "grad_norm": 0.8736487627029419, + "learning_rate": 7.923372623368013e-05, + "loss": 0.0647, + "step": 60510 + }, + { + "epoch": 3.959437356885836, + "grad_norm": 0.8763427734375, + "learning_rate": 7.922627347879162e-05, + "loss": 0.0682, + "step": 60520 + }, + { + "epoch": 3.9600915930650964, + "grad_norm": 0.7622628808021545, + "learning_rate": 7.921881973743974e-05, + "loss": 0.0625, + "step": 60530 + }, + { + "epoch": 3.9607458292443574, + "grad_norm": 0.9626938104629517, + "learning_rate": 7.921136500987607e-05, + "loss": 0.0804, + "step": 60540 + }, + { + "epoch": 3.961400065423618, + "grad_norm": 0.8307819962501526, + "learning_rate": 7.920390929635221e-05, + "loss": 0.0618, + "step": 60550 + }, + { + "epoch": 3.962054301602879, + "grad_norm": 1.078370213508606, + "learning_rate": 7.919645259711982e-05, + "loss": 0.0683, + "step": 60560 + }, + { + "epoch": 3.9627085377821394, + "grad_norm": 0.7358745336532593, + "learning_rate": 7.918899491243059e-05, + "loss": 0.0674, + "step": 60570 + }, + { + "epoch": 3.9633627739614, + "grad_norm": 0.9152573347091675, + "learning_rate": 7.918153624253624e-05, + "loss": 0.0735, + "step": 60580 + }, + { + "epoch": 3.964017010140661, + "grad_norm": 1.0013970136642456, + "learning_rate": 7.91740765876885e-05, + "loss": 0.0639, + "step": 60590 + }, + { + "epoch": 3.9646712463199214, + "grad_norm": 0.7813799381256104, + "learning_rate": 7.916661594813915e-05, + "loss": 0.0686, + "step": 60600 + }, + { + "epoch": 3.965325482499182, + "grad_norm": 1.0708343982696533, + "learning_rate": 7.915915432414005e-05, + "loss": 0.0627, + "step": 60610 + }, + { + "epoch": 3.965979718678443, + "grad_norm": 0.8936020731925964, + "learning_rate": 7.915169171594299e-05, + "loss": 0.0745, + "step": 60620 + }, + { + "epoch": 3.966633954857704, + "grad_norm": 1.0120549201965332, + "learning_rate": 7.914422812379989e-05, + "loss": 0.0654, + "step": 60630 + }, + { + "epoch": 3.9672881910369644, + "grad_norm": 0.7676621079444885, + "learning_rate": 7.913676354796267e-05, + "loss": 0.0603, + "step": 60640 + }, + { + "epoch": 3.967942427216225, + "grad_norm": 0.7795009613037109, + "learning_rate": 7.912929798868324e-05, + "loss": 0.0684, + "step": 60650 + }, + { + "epoch": 3.968596663395486, + "grad_norm": 0.9069650769233704, + "learning_rate": 7.912183144621364e-05, + "loss": 0.0792, + "step": 60660 + }, + { + "epoch": 3.9692508995747464, + "grad_norm": 0.9909687638282776, + "learning_rate": 7.911436392080585e-05, + "loss": 0.0599, + "step": 60670 + }, + { + "epoch": 3.969905135754007, + "grad_norm": 0.9287083745002747, + "learning_rate": 7.91068954127119e-05, + "loss": 0.066, + "step": 60680 + }, + { + "epoch": 3.970559371933268, + "grad_norm": 0.9105084538459778, + "learning_rate": 7.909942592218391e-05, + "loss": 0.069, + "step": 60690 + }, + { + "epoch": 3.971213608112529, + "grad_norm": 0.8097362518310547, + "learning_rate": 7.909195544947398e-05, + "loss": 0.0583, + "step": 60700 + }, + { + "epoch": 3.9718678442917894, + "grad_norm": 0.8813385367393494, + "learning_rate": 7.908448399483423e-05, + "loss": 0.0726, + "step": 60710 + }, + { + "epoch": 3.97252208047105, + "grad_norm": 0.7956545352935791, + "learning_rate": 7.907701155851691e-05, + "loss": 0.0611, + "step": 60720 + }, + { + "epoch": 3.973176316650311, + "grad_norm": 0.8885143995285034, + "learning_rate": 7.906953814077417e-05, + "loss": 0.0635, + "step": 60730 + }, + { + "epoch": 3.9738305528295714, + "grad_norm": 1.125998854637146, + "learning_rate": 7.906206374185828e-05, + "loss": 0.0778, + "step": 60740 + }, + { + "epoch": 3.974484789008832, + "grad_norm": 0.6007951498031616, + "learning_rate": 7.905458836202153e-05, + "loss": 0.0584, + "step": 60750 + }, + { + "epoch": 3.975139025188093, + "grad_norm": 1.0695325136184692, + "learning_rate": 7.904711200151622e-05, + "loss": 0.0726, + "step": 60760 + }, + { + "epoch": 3.9757932613673534, + "grad_norm": 0.9608235359191895, + "learning_rate": 7.90396346605947e-05, + "loss": 0.068, + "step": 60770 + }, + { + "epoch": 3.9764474975466144, + "grad_norm": 0.7435832023620605, + "learning_rate": 7.903215633950934e-05, + "loss": 0.076, + "step": 60780 + }, + { + "epoch": 3.977101733725875, + "grad_norm": 0.7819331288337708, + "learning_rate": 7.902467703851258e-05, + "loss": 0.0699, + "step": 60790 + }, + { + "epoch": 3.977755969905136, + "grad_norm": 0.7933452129364014, + "learning_rate": 7.901719675785685e-05, + "loss": 0.0679, + "step": 60800 + }, + { + "epoch": 3.9784102060843964, + "grad_norm": 0.8844022154808044, + "learning_rate": 7.900971549779461e-05, + "loss": 0.0693, + "step": 60810 + }, + { + "epoch": 3.979064442263657, + "grad_norm": 1.0027797222137451, + "learning_rate": 7.90022332585784e-05, + "loss": 0.0745, + "step": 60820 + }, + { + "epoch": 3.979718678442918, + "grad_norm": 0.8335436582565308, + "learning_rate": 7.899475004046078e-05, + "loss": 0.062, + "step": 60830 + }, + { + "epoch": 3.9803729146221785, + "grad_norm": 1.0376518964767456, + "learning_rate": 7.898726584369427e-05, + "loss": 0.0655, + "step": 60840 + }, + { + "epoch": 3.9810271508014394, + "grad_norm": 0.9191878437995911, + "learning_rate": 7.897978066853155e-05, + "loss": 0.075, + "step": 60850 + }, + { + "epoch": 3.9816813869807, + "grad_norm": 0.7158858180046082, + "learning_rate": 7.897229451522521e-05, + "loss": 0.0654, + "step": 60860 + }, + { + "epoch": 3.982335623159961, + "grad_norm": 0.9076741337776184, + "learning_rate": 7.896480738402795e-05, + "loss": 0.067, + "step": 60870 + }, + { + "epoch": 3.9829898593392215, + "grad_norm": 0.9363234043121338, + "learning_rate": 7.895731927519248e-05, + "loss": 0.0638, + "step": 60880 + }, + { + "epoch": 3.983644095518482, + "grad_norm": 0.9065988659858704, + "learning_rate": 7.894983018897153e-05, + "loss": 0.0649, + "step": 60890 + }, + { + "epoch": 3.984298331697743, + "grad_norm": 0.8118206858634949, + "learning_rate": 7.89423401256179e-05, + "loss": 0.074, + "step": 60900 + }, + { + "epoch": 3.9849525678770035, + "grad_norm": 1.0652772188186646, + "learning_rate": 7.893484908538437e-05, + "loss": 0.0632, + "step": 60910 + }, + { + "epoch": 3.9856068040562644, + "grad_norm": 0.8931100964546204, + "learning_rate": 7.892735706852381e-05, + "loss": 0.0689, + "step": 60920 + }, + { + "epoch": 3.986261040235525, + "grad_norm": 0.8381471633911133, + "learning_rate": 7.891986407528908e-05, + "loss": 0.0657, + "step": 60930 + }, + { + "epoch": 3.986915276414786, + "grad_norm": 1.1571853160858154, + "learning_rate": 7.89123701059331e-05, + "loss": 0.0668, + "step": 60940 + }, + { + "epoch": 3.9875695125940465, + "grad_norm": 0.904621422290802, + "learning_rate": 7.890487516070881e-05, + "loss": 0.0755, + "step": 60950 + }, + { + "epoch": 3.988223748773307, + "grad_norm": 0.9490391612052917, + "learning_rate": 7.889737923986918e-05, + "loss": 0.0705, + "step": 60960 + }, + { + "epoch": 3.988877984952568, + "grad_norm": 1.0775212049484253, + "learning_rate": 7.888988234366719e-05, + "loss": 0.0643, + "step": 60970 + }, + { + "epoch": 3.9895322211318285, + "grad_norm": 0.9176508784294128, + "learning_rate": 7.888238447235592e-05, + "loss": 0.0724, + "step": 60980 + }, + { + "epoch": 3.9901864573110895, + "grad_norm": 1.2155365943908691, + "learning_rate": 7.887488562618844e-05, + "loss": 0.0776, + "step": 60990 + }, + { + "epoch": 3.99084069349035, + "grad_norm": 1.0456100702285767, + "learning_rate": 7.886738580541782e-05, + "loss": 0.065, + "step": 61000 + }, + { + "epoch": 3.991494929669611, + "grad_norm": 0.962174117565155, + "learning_rate": 7.885988501029724e-05, + "loss": 0.0642, + "step": 61010 + }, + { + "epoch": 3.9921491658488715, + "grad_norm": 0.8093088269233704, + "learning_rate": 7.885238324107982e-05, + "loss": 0.0724, + "step": 61020 + }, + { + "epoch": 3.992803402028132, + "grad_norm": 0.9736089706420898, + "learning_rate": 7.884488049801882e-05, + "loss": 0.0679, + "step": 61030 + }, + { + "epoch": 3.993457638207393, + "grad_norm": 0.9136343002319336, + "learning_rate": 7.883737678136746e-05, + "loss": 0.0578, + "step": 61040 + }, + { + "epoch": 3.9941118743866535, + "grad_norm": 0.998466968536377, + "learning_rate": 7.8829872091379e-05, + "loss": 0.0724, + "step": 61050 + }, + { + "epoch": 3.994766110565914, + "grad_norm": 0.8486067652702332, + "learning_rate": 7.882236642830675e-05, + "loss": 0.0747, + "step": 61060 + }, + { + "epoch": 3.995420346745175, + "grad_norm": 0.8077092170715332, + "learning_rate": 7.881485979240404e-05, + "loss": 0.0668, + "step": 61070 + }, + { + "epoch": 3.996074582924436, + "grad_norm": 1.0443347692489624, + "learning_rate": 7.880735218392423e-05, + "loss": 0.068, + "step": 61080 + }, + { + "epoch": 3.9967288191036965, + "grad_norm": 0.664110541343689, + "learning_rate": 7.879984360312077e-05, + "loss": 0.0608, + "step": 61090 + }, + { + "epoch": 3.997383055282957, + "grad_norm": 0.8030071258544922, + "learning_rate": 7.879233405024702e-05, + "loss": 0.0696, + "step": 61100 + }, + { + "epoch": 3.998037291462218, + "grad_norm": 0.9400510191917419, + "learning_rate": 7.87848235255565e-05, + "loss": 0.0756, + "step": 61110 + }, + { + "epoch": 3.9986915276414785, + "grad_norm": 0.8629300594329834, + "learning_rate": 7.87773120293027e-05, + "loss": 0.0779, + "step": 61120 + }, + { + "epoch": 3.999345763820739, + "grad_norm": 0.8903558850288391, + "learning_rate": 7.876979956173914e-05, + "loss": 0.0825, + "step": 61130 + }, + { + "epoch": 4.0, + "grad_norm": 1.0015478134155273, + "learning_rate": 7.87622861231194e-05, + "loss": 0.0693, + "step": 61140 + }, + { + "epoch": 4.000654236179261, + "grad_norm": 0.8407610654830933, + "learning_rate": 7.875477171369707e-05, + "loss": 0.0606, + "step": 61150 + }, + { + "epoch": 4.001308472358521, + "grad_norm": 1.2632551193237305, + "learning_rate": 7.874725633372577e-05, + "loss": 0.0688, + "step": 61160 + }, + { + "epoch": 4.001962708537782, + "grad_norm": 0.812816858291626, + "learning_rate": 7.87397399834592e-05, + "loss": 0.067, + "step": 61170 + }, + { + "epoch": 4.002616944717043, + "grad_norm": 0.790305495262146, + "learning_rate": 7.873222266315101e-05, + "loss": 0.0604, + "step": 61180 + }, + { + "epoch": 4.003271180896304, + "grad_norm": 0.8392609357833862, + "learning_rate": 7.872470437305496e-05, + "loss": 0.0706, + "step": 61190 + }, + { + "epoch": 4.003925417075564, + "grad_norm": 0.8045458793640137, + "learning_rate": 7.87171851134248e-05, + "loss": 0.0696, + "step": 61200 + }, + { + "epoch": 4.004579653254825, + "grad_norm": 0.9475613832473755, + "learning_rate": 7.870966488451434e-05, + "loss": 0.0672, + "step": 61210 + }, + { + "epoch": 4.005233889434086, + "grad_norm": 0.8142966032028198, + "learning_rate": 7.87021436865774e-05, + "loss": 0.0612, + "step": 61220 + }, + { + "epoch": 4.005888125613346, + "grad_norm": 0.8628994822502136, + "learning_rate": 7.869462151986781e-05, + "loss": 0.0615, + "step": 61230 + }, + { + "epoch": 4.006542361792607, + "grad_norm": 0.8791362643241882, + "learning_rate": 7.868709838463952e-05, + "loss": 0.0711, + "step": 61240 + }, + { + "epoch": 4.007196597971868, + "grad_norm": 1.1582494974136353, + "learning_rate": 7.867957428114641e-05, + "loss": 0.0688, + "step": 61250 + }, + { + "epoch": 4.007850834151129, + "grad_norm": 0.9910046458244324, + "learning_rate": 7.867204920964245e-05, + "loss": 0.0659, + "step": 61260 + }, + { + "epoch": 4.008505070330389, + "grad_norm": 0.832694411277771, + "learning_rate": 7.866452317038164e-05, + "loss": 0.0706, + "step": 61270 + }, + { + "epoch": 4.00915930650965, + "grad_norm": 0.8842899203300476, + "learning_rate": 7.865699616361798e-05, + "loss": 0.0744, + "step": 61280 + }, + { + "epoch": 4.009813542688911, + "grad_norm": 0.9337363839149475, + "learning_rate": 7.864946818960557e-05, + "loss": 0.0715, + "step": 61290 + }, + { + "epoch": 4.010467778868171, + "grad_norm": 0.8401015400886536, + "learning_rate": 7.864193924859846e-05, + "loss": 0.0635, + "step": 61300 + }, + { + "epoch": 4.011122015047432, + "grad_norm": 0.9122945666313171, + "learning_rate": 7.86344093408508e-05, + "loss": 0.0663, + "step": 61310 + }, + { + "epoch": 4.011776251226693, + "grad_norm": 0.809889018535614, + "learning_rate": 7.862687846661671e-05, + "loss": 0.0704, + "step": 61320 + }, + { + "epoch": 4.012430487405954, + "grad_norm": 0.9183257818222046, + "learning_rate": 7.86193466261504e-05, + "loss": 0.0702, + "step": 61330 + }, + { + "epoch": 4.013084723585214, + "grad_norm": 0.874580979347229, + "learning_rate": 7.861181381970608e-05, + "loss": 0.072, + "step": 61340 + }, + { + "epoch": 4.013738959764475, + "grad_norm": 0.7762249708175659, + "learning_rate": 7.860428004753801e-05, + "loss": 0.0557, + "step": 61350 + }, + { + "epoch": 4.014393195943736, + "grad_norm": 0.7530931830406189, + "learning_rate": 7.859674530990047e-05, + "loss": 0.0666, + "step": 61360 + }, + { + "epoch": 4.015047432122996, + "grad_norm": 0.9463986158370972, + "learning_rate": 7.858920960704779e-05, + "loss": 0.0644, + "step": 61370 + }, + { + "epoch": 4.015701668302257, + "grad_norm": 0.828858494758606, + "learning_rate": 7.85816729392343e-05, + "loss": 0.0718, + "step": 61380 + }, + { + "epoch": 4.016355904481518, + "grad_norm": 0.7672846913337708, + "learning_rate": 7.857413530671438e-05, + "loss": 0.0615, + "step": 61390 + }, + { + "epoch": 4.017010140660778, + "grad_norm": 1.026138186454773, + "learning_rate": 7.856659670974246e-05, + "loss": 0.0762, + "step": 61400 + }, + { + "epoch": 4.017664376840039, + "grad_norm": 0.8137958645820618, + "learning_rate": 7.855905714857299e-05, + "loss": 0.0633, + "step": 61410 + }, + { + "epoch": 4.0183186130193, + "grad_norm": 0.9455887079238892, + "learning_rate": 7.855151662346043e-05, + "loss": 0.0701, + "step": 61420 + }, + { + "epoch": 4.018972849198561, + "grad_norm": 0.8994813561439514, + "learning_rate": 7.854397513465932e-05, + "loss": 0.0601, + "step": 61430 + }, + { + "epoch": 4.019627085377821, + "grad_norm": 0.7181370854377747, + "learning_rate": 7.853643268242417e-05, + "loss": 0.062, + "step": 61440 + }, + { + "epoch": 4.020281321557082, + "grad_norm": 0.7311094403266907, + "learning_rate": 7.852888926700959e-05, + "loss": 0.0645, + "step": 61450 + }, + { + "epoch": 4.020935557736343, + "grad_norm": 0.8224121928215027, + "learning_rate": 7.852134488867018e-05, + "loss": 0.0667, + "step": 61460 + }, + { + "epoch": 4.021589793915603, + "grad_norm": 1.1656960248947144, + "learning_rate": 7.851379954766058e-05, + "loss": 0.0815, + "step": 61470 + }, + { + "epoch": 4.022244030094864, + "grad_norm": 0.7743502259254456, + "learning_rate": 7.850625324423546e-05, + "loss": 0.0586, + "step": 61480 + }, + { + "epoch": 4.022898266274125, + "grad_norm": 0.8907047510147095, + "learning_rate": 7.849870597864953e-05, + "loss": 0.0665, + "step": 61490 + }, + { + "epoch": 4.023552502453386, + "grad_norm": 1.395200490951538, + "learning_rate": 7.849115775115755e-05, + "loss": 0.0752, + "step": 61500 + }, + { + "epoch": 4.024206738632646, + "grad_norm": 0.8665695190429688, + "learning_rate": 7.848360856201425e-05, + "loss": 0.066, + "step": 61510 + }, + { + "epoch": 4.024860974811907, + "grad_norm": 1.0678719282150269, + "learning_rate": 7.847605841147447e-05, + "loss": 0.0763, + "step": 61520 + }, + { + "epoch": 4.025515210991168, + "grad_norm": 0.961864173412323, + "learning_rate": 7.846850729979304e-05, + "loss": 0.0756, + "step": 61530 + }, + { + "epoch": 4.026169447170428, + "grad_norm": 0.9026308655738831, + "learning_rate": 7.846095522722482e-05, + "loss": 0.0656, + "step": 61540 + }, + { + "epoch": 4.026823683349689, + "grad_norm": 0.8607982397079468, + "learning_rate": 7.845340219402472e-05, + "loss": 0.0614, + "step": 61550 + }, + { + "epoch": 4.02747791952895, + "grad_norm": 0.8916060924530029, + "learning_rate": 7.844584820044769e-05, + "loss": 0.0714, + "step": 61560 + }, + { + "epoch": 4.028132155708211, + "grad_norm": 0.7897719144821167, + "learning_rate": 7.843829324674867e-05, + "loss": 0.0649, + "step": 61570 + }, + { + "epoch": 4.028786391887471, + "grad_norm": 0.8527302742004395, + "learning_rate": 7.843073733318268e-05, + "loss": 0.0782, + "step": 61580 + }, + { + "epoch": 4.029440628066732, + "grad_norm": 0.8170759677886963, + "learning_rate": 7.842318046000475e-05, + "loss": 0.0603, + "step": 61590 + }, + { + "epoch": 4.030094864245993, + "grad_norm": 0.9056137204170227, + "learning_rate": 7.841562262746991e-05, + "loss": 0.064, + "step": 61600 + }, + { + "epoch": 4.030749100425253, + "grad_norm": 0.9502385258674622, + "learning_rate": 7.84080638358333e-05, + "loss": 0.0672, + "step": 61610 + }, + { + "epoch": 4.031403336604514, + "grad_norm": 0.964516818523407, + "learning_rate": 7.840050408535002e-05, + "loss": 0.0765, + "step": 61620 + }, + { + "epoch": 4.032057572783775, + "grad_norm": 1.0085734128952026, + "learning_rate": 7.839294337627525e-05, + "loss": 0.0691, + "step": 61630 + }, + { + "epoch": 4.032711808963036, + "grad_norm": 0.9676178097724915, + "learning_rate": 7.838538170886419e-05, + "loss": 0.0619, + "step": 61640 + }, + { + "epoch": 4.033366045142296, + "grad_norm": 1.0016093254089355, + "learning_rate": 7.837781908337204e-05, + "loss": 0.0653, + "step": 61650 + }, + { + "epoch": 4.034020281321557, + "grad_norm": 0.9340269565582275, + "learning_rate": 7.837025550005408e-05, + "loss": 0.0682, + "step": 61660 + }, + { + "epoch": 4.034674517500818, + "grad_norm": 0.9540444612503052, + "learning_rate": 7.836269095916557e-05, + "loss": 0.0739, + "step": 61670 + }, + { + "epoch": 4.035328753680078, + "grad_norm": 0.7741513848304749, + "learning_rate": 7.835512546096188e-05, + "loss": 0.0704, + "step": 61680 + }, + { + "epoch": 4.035982989859339, + "grad_norm": 0.8669870495796204, + "learning_rate": 7.834755900569834e-05, + "loss": 0.0625, + "step": 61690 + }, + { + "epoch": 4.0366372260386, + "grad_norm": 0.8938657641410828, + "learning_rate": 7.833999159363035e-05, + "loss": 0.0677, + "step": 61700 + }, + { + "epoch": 4.037291462217861, + "grad_norm": 0.7298368215560913, + "learning_rate": 7.83324232250133e-05, + "loss": 0.0606, + "step": 61710 + }, + { + "epoch": 4.037945698397121, + "grad_norm": 0.8773128986358643, + "learning_rate": 7.832485390010266e-05, + "loss": 0.0631, + "step": 61720 + }, + { + "epoch": 4.038599934576382, + "grad_norm": 0.8813597559928894, + "learning_rate": 7.831728361915394e-05, + "loss": 0.0735, + "step": 61730 + }, + { + "epoch": 4.039254170755643, + "grad_norm": 0.7509139776229858, + "learning_rate": 7.830971238242261e-05, + "loss": 0.0666, + "step": 61740 + }, + { + "epoch": 4.039908406934903, + "grad_norm": 0.8333325386047363, + "learning_rate": 7.830214019016426e-05, + "loss": 0.0649, + "step": 61750 + }, + { + "epoch": 4.040562643114164, + "grad_norm": 0.9280720353126526, + "learning_rate": 7.829456704263442e-05, + "loss": 0.06, + "step": 61760 + }, + { + "epoch": 4.041216879293425, + "grad_norm": 0.6866008639335632, + "learning_rate": 7.828699294008877e-05, + "loss": 0.066, + "step": 61770 + }, + { + "epoch": 4.041871115472686, + "grad_norm": 0.8785973787307739, + "learning_rate": 7.827941788278292e-05, + "loss": 0.0582, + "step": 61780 + }, + { + "epoch": 4.042525351651946, + "grad_norm": 0.9217095971107483, + "learning_rate": 7.827184187097253e-05, + "loss": 0.0737, + "step": 61790 + }, + { + "epoch": 4.043179587831207, + "grad_norm": 0.7473239302635193, + "learning_rate": 7.826426490491335e-05, + "loss": 0.0639, + "step": 61800 + }, + { + "epoch": 4.043833824010468, + "grad_norm": 0.8309671878814697, + "learning_rate": 7.82566869848611e-05, + "loss": 0.0633, + "step": 61810 + }, + { + "epoch": 4.044488060189728, + "grad_norm": 1.0445795059204102, + "learning_rate": 7.824910811107156e-05, + "loss": 0.0683, + "step": 61820 + }, + { + "epoch": 4.045142296368989, + "grad_norm": 0.8129024505615234, + "learning_rate": 7.824152828380053e-05, + "loss": 0.0628, + "step": 61830 + }, + { + "epoch": 4.04579653254825, + "grad_norm": 0.8035739660263062, + "learning_rate": 7.823394750330387e-05, + "loss": 0.0664, + "step": 61840 + }, + { + "epoch": 4.04645076872751, + "grad_norm": 1.1228084564208984, + "learning_rate": 7.822636576983741e-05, + "loss": 0.0663, + "step": 61850 + }, + { + "epoch": 4.047105004906771, + "grad_norm": 0.9300438761711121, + "learning_rate": 7.821878308365708e-05, + "loss": 0.0679, + "step": 61860 + }, + { + "epoch": 4.047759241086032, + "grad_norm": 0.8193831443786621, + "learning_rate": 7.821119944501885e-05, + "loss": 0.063, + "step": 61870 + }, + { + "epoch": 4.048413477265293, + "grad_norm": 0.8168728351593018, + "learning_rate": 7.820361485417862e-05, + "loss": 0.0709, + "step": 61880 + }, + { + "epoch": 4.049067713444553, + "grad_norm": 1.0577526092529297, + "learning_rate": 7.819602931139243e-05, + "loss": 0.0641, + "step": 61890 + }, + { + "epoch": 4.049721949623814, + "grad_norm": 0.8535493612289429, + "learning_rate": 7.81884428169163e-05, + "loss": 0.057, + "step": 61900 + }, + { + "epoch": 4.050376185803075, + "grad_norm": 0.8765014410018921, + "learning_rate": 7.81808553710063e-05, + "loss": 0.0662, + "step": 61910 + }, + { + "epoch": 4.051030421982335, + "grad_norm": 0.9272754788398743, + "learning_rate": 7.817326697391853e-05, + "loss": 0.0657, + "step": 61920 + }, + { + "epoch": 4.051684658161596, + "grad_norm": 0.7854313254356384, + "learning_rate": 7.81656776259091e-05, + "loss": 0.068, + "step": 61930 + }, + { + "epoch": 4.052338894340857, + "grad_norm": 0.753021776676178, + "learning_rate": 7.81580873272342e-05, + "loss": 0.0648, + "step": 61940 + }, + { + "epoch": 4.052993130520118, + "grad_norm": 0.939609944820404, + "learning_rate": 7.815049607815e-05, + "loss": 0.0682, + "step": 61950 + }, + { + "epoch": 4.053647366699378, + "grad_norm": 0.9001626372337341, + "learning_rate": 7.814290387891271e-05, + "loss": 0.0608, + "step": 61960 + }, + { + "epoch": 4.054301602878639, + "grad_norm": 0.9824931621551514, + "learning_rate": 7.813531072977863e-05, + "loss": 0.069, + "step": 61970 + }, + { + "epoch": 4.0549558390579, + "grad_norm": 0.9642009139060974, + "learning_rate": 7.812771663100402e-05, + "loss": 0.0602, + "step": 61980 + }, + { + "epoch": 4.05561007523716, + "grad_norm": 0.7149823307991028, + "learning_rate": 7.812012158284521e-05, + "loss": 0.0667, + "step": 61990 + }, + { + "epoch": 4.056264311416421, + "grad_norm": 0.8310834765434265, + "learning_rate": 7.811252558555854e-05, + "loss": 0.0746, + "step": 62000 + }, + { + "epoch": 4.056918547595682, + "grad_norm": 0.7982217073440552, + "learning_rate": 7.810492863940041e-05, + "loss": 0.0852, + "step": 62010 + }, + { + "epoch": 4.057572783774943, + "grad_norm": 0.9088074564933777, + "learning_rate": 7.809733074462722e-05, + "loss": 0.064, + "step": 62020 + }, + { + "epoch": 4.058227019954203, + "grad_norm": 0.8191305994987488, + "learning_rate": 7.808973190149544e-05, + "loss": 0.0684, + "step": 62030 + }, + { + "epoch": 4.058881256133464, + "grad_norm": 0.8682653307914734, + "learning_rate": 7.808213211026153e-05, + "loss": 0.0623, + "step": 62040 + }, + { + "epoch": 4.059535492312725, + "grad_norm": 1.0228744745254517, + "learning_rate": 7.807453137118204e-05, + "loss": 0.0726, + "step": 62050 + }, + { + "epoch": 4.060189728491985, + "grad_norm": 0.8825092315673828, + "learning_rate": 7.806692968451346e-05, + "loss": 0.0669, + "step": 62060 + }, + { + "epoch": 4.060843964671246, + "grad_norm": 0.8798748254776001, + "learning_rate": 7.80593270505124e-05, + "loss": 0.0669, + "step": 62070 + }, + { + "epoch": 4.061498200850507, + "grad_norm": 0.8253244161605835, + "learning_rate": 7.805172346943547e-05, + "loss": 0.058, + "step": 62080 + }, + { + "epoch": 4.062152437029768, + "grad_norm": 0.9651762247085571, + "learning_rate": 7.804411894153932e-05, + "loss": 0.0709, + "step": 62090 + }, + { + "epoch": 4.062806673209028, + "grad_norm": 1.1336904764175415, + "learning_rate": 7.803651346708056e-05, + "loss": 0.072, + "step": 62100 + }, + { + "epoch": 4.063460909388289, + "grad_norm": 1.0221117734909058, + "learning_rate": 7.802890704631598e-05, + "loss": 0.0734, + "step": 62110 + }, + { + "epoch": 4.06411514556755, + "grad_norm": 0.8365046381950378, + "learning_rate": 7.802129967950227e-05, + "loss": 0.0662, + "step": 62120 + }, + { + "epoch": 4.06476938174681, + "grad_norm": 0.8922891020774841, + "learning_rate": 7.801369136689621e-05, + "loss": 0.065, + "step": 62130 + }, + { + "epoch": 4.065423617926071, + "grad_norm": 0.689914345741272, + "learning_rate": 7.80060821087546e-05, + "loss": 0.0674, + "step": 62140 + }, + { + "epoch": 4.066077854105332, + "grad_norm": 0.8463851809501648, + "learning_rate": 7.799847190533428e-05, + "loss": 0.0628, + "step": 62150 + }, + { + "epoch": 4.066732090284593, + "grad_norm": 0.7686126232147217, + "learning_rate": 7.799086075689208e-05, + "loss": 0.0665, + "step": 62160 + }, + { + "epoch": 4.067386326463853, + "grad_norm": 0.9086697697639465, + "learning_rate": 7.798324866368493e-05, + "loss": 0.0655, + "step": 62170 + }, + { + "epoch": 4.068040562643114, + "grad_norm": 0.8965178728103638, + "learning_rate": 7.797563562596974e-05, + "loss": 0.0722, + "step": 62180 + }, + { + "epoch": 4.068694798822375, + "grad_norm": 0.8882538676261902, + "learning_rate": 7.796802164400348e-05, + "loss": 0.0615, + "step": 62190 + }, + { + "epoch": 4.069349035001635, + "grad_norm": 0.8429491519927979, + "learning_rate": 7.796040671804316e-05, + "loss": 0.0628, + "step": 62200 + }, + { + "epoch": 4.070003271180896, + "grad_norm": 0.8130377531051636, + "learning_rate": 7.795279084834577e-05, + "loss": 0.0657, + "step": 62210 + }, + { + "epoch": 4.070657507360157, + "grad_norm": 0.9137871861457825, + "learning_rate": 7.794517403516838e-05, + "loss": 0.0606, + "step": 62220 + }, + { + "epoch": 4.071311743539418, + "grad_norm": 0.8658760190010071, + "learning_rate": 7.793755627876808e-05, + "loss": 0.0676, + "step": 62230 + }, + { + "epoch": 4.071965979718678, + "grad_norm": 0.9319032430648804, + "learning_rate": 7.7929937579402e-05, + "loss": 0.0693, + "step": 62240 + }, + { + "epoch": 4.072620215897939, + "grad_norm": 0.8796036839485168, + "learning_rate": 7.792231793732727e-05, + "loss": 0.0669, + "step": 62250 + }, + { + "epoch": 4.0732744520772, + "grad_norm": 0.7122048139572144, + "learning_rate": 7.791469735280106e-05, + "loss": 0.0639, + "step": 62260 + }, + { + "epoch": 4.07392868825646, + "grad_norm": 1.0055348873138428, + "learning_rate": 7.790707582608063e-05, + "loss": 0.0808, + "step": 62270 + }, + { + "epoch": 4.074582924435721, + "grad_norm": 0.8951319456100464, + "learning_rate": 7.78994533574232e-05, + "loss": 0.0677, + "step": 62280 + }, + { + "epoch": 4.075237160614982, + "grad_norm": 0.9522911906242371, + "learning_rate": 7.789182994708604e-05, + "loss": 0.0545, + "step": 62290 + }, + { + "epoch": 4.075891396794242, + "grad_norm": 0.8506065607070923, + "learning_rate": 7.788420559532646e-05, + "loss": 0.0595, + "step": 62300 + }, + { + "epoch": 4.076545632973503, + "grad_norm": 0.959356427192688, + "learning_rate": 7.787658030240183e-05, + "loss": 0.0713, + "step": 62310 + }, + { + "epoch": 4.077199869152764, + "grad_norm": 0.7769831418991089, + "learning_rate": 7.786895406856952e-05, + "loss": 0.0627, + "step": 62320 + }, + { + "epoch": 4.077854105332025, + "grad_norm": 0.6790773868560791, + "learning_rate": 7.786132689408688e-05, + "loss": 0.068, + "step": 62330 + }, + { + "epoch": 4.078508341511285, + "grad_norm": 0.7900551557540894, + "learning_rate": 7.78536987792114e-05, + "loss": 0.0749, + "step": 62340 + }, + { + "epoch": 4.079162577690546, + "grad_norm": 0.8323625922203064, + "learning_rate": 7.784606972420056e-05, + "loss": 0.0585, + "step": 62350 + }, + { + "epoch": 4.079816813869807, + "grad_norm": 1.0454132556915283, + "learning_rate": 7.783843972931184e-05, + "loss": 0.0613, + "step": 62360 + }, + { + "epoch": 4.080471050049067, + "grad_norm": 0.9331693053245544, + "learning_rate": 7.783080879480274e-05, + "loss": 0.0635, + "step": 62370 + }, + { + "epoch": 4.081125286228328, + "grad_norm": 0.9045494198799133, + "learning_rate": 7.782317692093088e-05, + "loss": 0.0703, + "step": 62380 + }, + { + "epoch": 4.081779522407589, + "grad_norm": 0.9631790518760681, + "learning_rate": 7.781554410795381e-05, + "loss": 0.0675, + "step": 62390 + }, + { + "epoch": 4.08243375858685, + "grad_norm": 0.9105682373046875, + "learning_rate": 7.78079103561292e-05, + "loss": 0.0779, + "step": 62400 + }, + { + "epoch": 4.08308799476611, + "grad_norm": 0.8074235320091248, + "learning_rate": 7.780027566571465e-05, + "loss": 0.07, + "step": 62410 + }, + { + "epoch": 4.083742230945371, + "grad_norm": 0.8759629726409912, + "learning_rate": 7.779264003696794e-05, + "loss": 0.0637, + "step": 62420 + }, + { + "epoch": 4.084396467124632, + "grad_norm": 0.8032673001289368, + "learning_rate": 7.77850034701467e-05, + "loss": 0.064, + "step": 62430 + }, + { + "epoch": 4.085050703303892, + "grad_norm": 0.8850411176681519, + "learning_rate": 7.777736596550874e-05, + "loss": 0.0641, + "step": 62440 + }, + { + "epoch": 4.085704939483153, + "grad_norm": 1.1167175769805908, + "learning_rate": 7.776972752331182e-05, + "loss": 0.0689, + "step": 62450 + }, + { + "epoch": 4.086359175662414, + "grad_norm": 0.9858927726745605, + "learning_rate": 7.776208814381379e-05, + "loss": 0.0725, + "step": 62460 + }, + { + "epoch": 4.087013411841675, + "grad_norm": 0.9899501204490662, + "learning_rate": 7.775444782727245e-05, + "loss": 0.0684, + "step": 62470 + }, + { + "epoch": 4.087667648020935, + "grad_norm": 0.7681850790977478, + "learning_rate": 7.77468065739457e-05, + "loss": 0.0633, + "step": 62480 + }, + { + "epoch": 4.088321884200196, + "grad_norm": 0.6893104314804077, + "learning_rate": 7.773916438409149e-05, + "loss": 0.0572, + "step": 62490 + }, + { + "epoch": 4.088976120379457, + "grad_norm": 0.7790807485580444, + "learning_rate": 7.773152125796772e-05, + "loss": 0.0634, + "step": 62500 + }, + { + "epoch": 4.089630356558717, + "grad_norm": 1.0509587526321411, + "learning_rate": 7.772387719583238e-05, + "loss": 0.063, + "step": 62510 + }, + { + "epoch": 4.090284592737978, + "grad_norm": 1.0617860555648804, + "learning_rate": 7.771623219794346e-05, + "loss": 0.061, + "step": 62520 + }, + { + "epoch": 4.090938828917239, + "grad_norm": 0.7485182881355286, + "learning_rate": 7.770858626455903e-05, + "loss": 0.0698, + "step": 62530 + }, + { + "epoch": 4.0915930650965, + "grad_norm": 0.7513982653617859, + "learning_rate": 7.770093939593716e-05, + "loss": 0.0646, + "step": 62540 + }, + { + "epoch": 4.09224730127576, + "grad_norm": 0.9014798998832703, + "learning_rate": 7.769329159233592e-05, + "loss": 0.0747, + "step": 62550 + }, + { + "epoch": 4.092901537455021, + "grad_norm": 1.199467658996582, + "learning_rate": 7.768564285401346e-05, + "loss": 0.0664, + "step": 62560 + }, + { + "epoch": 4.093555773634282, + "grad_norm": 0.9420590996742249, + "learning_rate": 7.767799318122794e-05, + "loss": 0.0718, + "step": 62570 + }, + { + "epoch": 4.094210009813542, + "grad_norm": 0.9552091360092163, + "learning_rate": 7.767034257423758e-05, + "loss": 0.0804, + "step": 62580 + }, + { + "epoch": 4.094864245992803, + "grad_norm": 1.0671602487564087, + "learning_rate": 7.766269103330057e-05, + "loss": 0.0742, + "step": 62590 + }, + { + "epoch": 4.095518482172064, + "grad_norm": 0.961463451385498, + "learning_rate": 7.76550385586752e-05, + "loss": 0.0706, + "step": 62600 + }, + { + "epoch": 4.096172718351325, + "grad_norm": 0.858124315738678, + "learning_rate": 7.764738515061975e-05, + "loss": 0.057, + "step": 62610 + }, + { + "epoch": 4.096826954530585, + "grad_norm": 0.830001950263977, + "learning_rate": 7.763973080939254e-05, + "loss": 0.0554, + "step": 62620 + }, + { + "epoch": 4.097481190709846, + "grad_norm": 0.9393772482872009, + "learning_rate": 7.763207553525193e-05, + "loss": 0.0708, + "step": 62630 + }, + { + "epoch": 4.098135426889107, + "grad_norm": 0.8235999941825867, + "learning_rate": 7.76244193284563e-05, + "loss": 0.0615, + "step": 62640 + }, + { + "epoch": 4.098789663068367, + "grad_norm": 1.2535772323608398, + "learning_rate": 7.761676218926408e-05, + "loss": 0.0678, + "step": 62650 + }, + { + "epoch": 4.099443899247628, + "grad_norm": 0.9056881070137024, + "learning_rate": 7.76091041179337e-05, + "loss": 0.0637, + "step": 62660 + }, + { + "epoch": 4.100098135426889, + "grad_norm": 1.1135672330856323, + "learning_rate": 7.760144511472365e-05, + "loss": 0.0627, + "step": 62670 + }, + { + "epoch": 4.10075237160615, + "grad_norm": 1.0625379085540771, + "learning_rate": 7.759378517989245e-05, + "loss": 0.0658, + "step": 62680 + }, + { + "epoch": 4.10140660778541, + "grad_norm": 0.947235107421875, + "learning_rate": 7.75861243136986e-05, + "loss": 0.0638, + "step": 62690 + }, + { + "epoch": 4.102060843964671, + "grad_norm": 0.715565025806427, + "learning_rate": 7.757846251640074e-05, + "loss": 0.0647, + "step": 62700 + }, + { + "epoch": 4.102715080143932, + "grad_norm": 0.9149342775344849, + "learning_rate": 7.757079978825744e-05, + "loss": 0.0605, + "step": 62710 + }, + { + "epoch": 4.103369316323192, + "grad_norm": 0.8739985227584839, + "learning_rate": 7.756313612952733e-05, + "loss": 0.0592, + "step": 62720 + }, + { + "epoch": 4.104023552502453, + "grad_norm": 1.1399884223937988, + "learning_rate": 7.755547154046908e-05, + "loss": 0.0712, + "step": 62730 + }, + { + "epoch": 4.104677788681714, + "grad_norm": 0.9350805878639221, + "learning_rate": 7.754780602134142e-05, + "loss": 0.063, + "step": 62740 + }, + { + "epoch": 4.105332024860974, + "grad_norm": 0.8487550616264343, + "learning_rate": 7.754013957240305e-05, + "loss": 0.0741, + "step": 62750 + }, + { + "epoch": 4.105986261040235, + "grad_norm": 0.7867029905319214, + "learning_rate": 7.753247219391273e-05, + "loss": 0.0604, + "step": 62760 + }, + { + "epoch": 4.106640497219496, + "grad_norm": 0.9640946388244629, + "learning_rate": 7.752480388612928e-05, + "loss": 0.063, + "step": 62770 + }, + { + "epoch": 4.107294733398757, + "grad_norm": 1.0482327938079834, + "learning_rate": 7.751713464931151e-05, + "loss": 0.0709, + "step": 62780 + }, + { + "epoch": 4.107948969578017, + "grad_norm": 0.8405677080154419, + "learning_rate": 7.75094644837183e-05, + "loss": 0.0572, + "step": 62790 + }, + { + "epoch": 4.108603205757278, + "grad_norm": 0.8008997440338135, + "learning_rate": 7.750179338960849e-05, + "loss": 0.0732, + "step": 62800 + }, + { + "epoch": 4.109257441936539, + "grad_norm": 0.8923006057739258, + "learning_rate": 7.749412136724103e-05, + "loss": 0.0666, + "step": 62810 + }, + { + "epoch": 4.109911678115799, + "grad_norm": 1.0257177352905273, + "learning_rate": 7.748644841687486e-05, + "loss": 0.0722, + "step": 62820 + }, + { + "epoch": 4.11056591429506, + "grad_norm": 0.7566149234771729, + "learning_rate": 7.747877453876901e-05, + "loss": 0.0667, + "step": 62830 + }, + { + "epoch": 4.111220150474321, + "grad_norm": 0.9833534955978394, + "learning_rate": 7.747109973318242e-05, + "loss": 0.0588, + "step": 62840 + }, + { + "epoch": 4.111874386653582, + "grad_norm": 0.9134366512298584, + "learning_rate": 7.746342400037417e-05, + "loss": 0.0701, + "step": 62850 + }, + { + "epoch": 4.112528622832842, + "grad_norm": 0.8957915902137756, + "learning_rate": 7.745574734060335e-05, + "loss": 0.0588, + "step": 62860 + }, + { + "epoch": 4.113182859012103, + "grad_norm": 0.7879095077514648, + "learning_rate": 7.744806975412904e-05, + "loss": 0.0669, + "step": 62870 + }, + { + "epoch": 4.113837095191364, + "grad_norm": 0.9840167760848999, + "learning_rate": 7.744039124121039e-05, + "loss": 0.0612, + "step": 62880 + }, + { + "epoch": 4.114491331370624, + "grad_norm": 0.656774640083313, + "learning_rate": 7.743271180210657e-05, + "loss": 0.0671, + "step": 62890 + }, + { + "epoch": 4.115145567549885, + "grad_norm": 0.6617786884307861, + "learning_rate": 7.742503143707679e-05, + "loss": 0.0633, + "step": 62900 + }, + { + "epoch": 4.115799803729146, + "grad_norm": 0.9550051093101501, + "learning_rate": 7.741735014638027e-05, + "loss": 0.0704, + "step": 62910 + }, + { + "epoch": 4.116454039908407, + "grad_norm": 0.7846367359161377, + "learning_rate": 7.740966793027626e-05, + "loss": 0.0664, + "step": 62920 + }, + { + "epoch": 4.117108276087667, + "grad_norm": 1.068386435508728, + "learning_rate": 7.740198478902409e-05, + "loss": 0.068, + "step": 62930 + }, + { + "epoch": 4.117762512266928, + "grad_norm": 1.1259160041809082, + "learning_rate": 7.739430072288309e-05, + "loss": 0.0667, + "step": 62940 + }, + { + "epoch": 4.118416748446189, + "grad_norm": 1.1424773931503296, + "learning_rate": 7.738661573211256e-05, + "loss": 0.0688, + "step": 62950 + }, + { + "epoch": 4.119070984625449, + "grad_norm": 0.8278184533119202, + "learning_rate": 7.737892981697194e-05, + "loss": 0.0603, + "step": 62960 + }, + { + "epoch": 4.11972522080471, + "grad_norm": 0.7246772646903992, + "learning_rate": 7.737124297772065e-05, + "loss": 0.0647, + "step": 62970 + }, + { + "epoch": 4.120379456983971, + "grad_norm": 0.9246161580085754, + "learning_rate": 7.736355521461811e-05, + "loss": 0.0655, + "step": 62980 + }, + { + "epoch": 4.121033693163232, + "grad_norm": 0.9616913199424744, + "learning_rate": 7.735586652792382e-05, + "loss": 0.0727, + "step": 62990 + }, + { + "epoch": 4.121687929342492, + "grad_norm": 0.8557519912719727, + "learning_rate": 7.734817691789729e-05, + "loss": 0.066, + "step": 63000 + }, + { + "epoch": 4.122342165521753, + "grad_norm": 0.8217513561248779, + "learning_rate": 7.734048638479807e-05, + "loss": 0.0749, + "step": 63010 + }, + { + "epoch": 4.122996401701014, + "grad_norm": 1.1255154609680176, + "learning_rate": 7.733279492888572e-05, + "loss": 0.0753, + "step": 63020 + }, + { + "epoch": 4.123650637880274, + "grad_norm": 0.9445163607597351, + "learning_rate": 7.732510255041985e-05, + "loss": 0.0624, + "step": 63030 + }, + { + "epoch": 4.124304874059535, + "grad_norm": 0.9506664276123047, + "learning_rate": 7.731740924966014e-05, + "loss": 0.067, + "step": 63040 + }, + { + "epoch": 4.124959110238796, + "grad_norm": 0.9376375079154968, + "learning_rate": 7.730971502686621e-05, + "loss": 0.0673, + "step": 63050 + }, + { + "epoch": 4.125613346418057, + "grad_norm": 0.8614791035652161, + "learning_rate": 7.730201988229777e-05, + "loss": 0.0703, + "step": 63060 + }, + { + "epoch": 4.126267582597317, + "grad_norm": 0.8429438471794128, + "learning_rate": 7.729432381621455e-05, + "loss": 0.0619, + "step": 63070 + }, + { + "epoch": 4.126921818776578, + "grad_norm": 1.0551830530166626, + "learning_rate": 7.728662682887633e-05, + "loss": 0.0709, + "step": 63080 + }, + { + "epoch": 4.127576054955839, + "grad_norm": 0.9198732376098633, + "learning_rate": 7.727892892054289e-05, + "loss": 0.0647, + "step": 63090 + }, + { + "epoch": 4.128230291135099, + "grad_norm": 0.8567238450050354, + "learning_rate": 7.727123009147406e-05, + "loss": 0.0637, + "step": 63100 + }, + { + "epoch": 4.12888452731436, + "grad_norm": 0.947485089302063, + "learning_rate": 7.72635303419297e-05, + "loss": 0.0737, + "step": 63110 + }, + { + "epoch": 4.129538763493621, + "grad_norm": 0.8637893199920654, + "learning_rate": 7.725582967216966e-05, + "loss": 0.0712, + "step": 63120 + }, + { + "epoch": 4.130192999672882, + "grad_norm": 1.1613445281982422, + "learning_rate": 7.724812808245392e-05, + "loss": 0.0756, + "step": 63130 + }, + { + "epoch": 4.130847235852142, + "grad_norm": 0.8602867722511292, + "learning_rate": 7.724042557304238e-05, + "loss": 0.067, + "step": 63140 + }, + { + "epoch": 4.131501472031403, + "grad_norm": 0.8593533635139465, + "learning_rate": 7.723272214419506e-05, + "loss": 0.0615, + "step": 63150 + }, + { + "epoch": 4.132155708210664, + "grad_norm": 0.9966337084770203, + "learning_rate": 7.722501779617193e-05, + "loss": 0.0593, + "step": 63160 + }, + { + "epoch": 4.132809944389924, + "grad_norm": 0.997571587562561, + "learning_rate": 7.721731252923305e-05, + "loss": 0.0681, + "step": 63170 + }, + { + "epoch": 4.133464180569185, + "grad_norm": 0.94794100522995, + "learning_rate": 7.720960634363848e-05, + "loss": 0.0818, + "step": 63180 + }, + { + "epoch": 4.134118416748446, + "grad_norm": 0.8688943982124329, + "learning_rate": 7.720189923964833e-05, + "loss": 0.0653, + "step": 63190 + }, + { + "epoch": 4.1347726529277065, + "grad_norm": 0.8499542474746704, + "learning_rate": 7.719419121752277e-05, + "loss": 0.0734, + "step": 63200 + }, + { + "epoch": 4.135426889106967, + "grad_norm": 0.8392613530158997, + "learning_rate": 7.718648227752192e-05, + "loss": 0.0738, + "step": 63210 + }, + { + "epoch": 4.136081125286228, + "grad_norm": 1.165153980255127, + "learning_rate": 7.7178772419906e-05, + "loss": 0.0717, + "step": 63220 + }, + { + "epoch": 4.136735361465489, + "grad_norm": 0.7674516439437866, + "learning_rate": 7.717106164493523e-05, + "loss": 0.0682, + "step": 63230 + }, + { + "epoch": 4.1373895976447495, + "grad_norm": 0.8776054978370667, + "learning_rate": 7.716334995286988e-05, + "loss": 0.0712, + "step": 63240 + }, + { + "epoch": 4.13804383382401, + "grad_norm": 1.0000853538513184, + "learning_rate": 7.715563734397022e-05, + "loss": 0.0649, + "step": 63250 + }, + { + "epoch": 4.138698070003271, + "grad_norm": 0.9027912616729736, + "learning_rate": 7.714792381849658e-05, + "loss": 0.0746, + "step": 63260 + }, + { + "epoch": 4.1393523061825315, + "grad_norm": 1.001630425453186, + "learning_rate": 7.714020937670931e-05, + "loss": 0.0703, + "step": 63270 + }, + { + "epoch": 4.140006542361792, + "grad_norm": 0.9271620512008667, + "learning_rate": 7.713249401886882e-05, + "loss": 0.0685, + "step": 63280 + }, + { + "epoch": 4.140660778541053, + "grad_norm": 0.795330822467804, + "learning_rate": 7.712477774523547e-05, + "loss": 0.0696, + "step": 63290 + }, + { + "epoch": 4.141315014720314, + "grad_norm": 0.7591599225997925, + "learning_rate": 7.711706055606975e-05, + "loss": 0.063, + "step": 63300 + }, + { + "epoch": 4.1419692508995745, + "grad_norm": 0.9598954319953918, + "learning_rate": 7.710934245163211e-05, + "loss": 0.0658, + "step": 63310 + }, + { + "epoch": 4.142623487078835, + "grad_norm": 0.6453430652618408, + "learning_rate": 7.710162343218307e-05, + "loss": 0.0666, + "step": 63320 + }, + { + "epoch": 4.143277723258096, + "grad_norm": 0.8312489986419678, + "learning_rate": 7.709390349798315e-05, + "loss": 0.0651, + "step": 63330 + }, + { + "epoch": 4.1439319594373565, + "grad_norm": 0.7793470025062561, + "learning_rate": 7.708618264929295e-05, + "loss": 0.0713, + "step": 63340 + }, + { + "epoch": 4.1445861956166175, + "grad_norm": 0.827540397644043, + "learning_rate": 7.707846088637305e-05, + "loss": 0.0641, + "step": 63350 + }, + { + "epoch": 4.145240431795878, + "grad_norm": 0.8500876426696777, + "learning_rate": 7.707073820948407e-05, + "loss": 0.0698, + "step": 63360 + }, + { + "epoch": 4.145894667975139, + "grad_norm": 0.7591197490692139, + "learning_rate": 7.706301461888667e-05, + "loss": 0.0623, + "step": 63370 + }, + { + "epoch": 4.1465489041543995, + "grad_norm": 0.7708221077919006, + "learning_rate": 7.705529011484159e-05, + "loss": 0.0595, + "step": 63380 + }, + { + "epoch": 4.14720314033366, + "grad_norm": 0.8088019490242004, + "learning_rate": 7.704756469760947e-05, + "loss": 0.0609, + "step": 63390 + }, + { + "epoch": 4.147857376512921, + "grad_norm": 0.8570855855941772, + "learning_rate": 7.703983836745112e-05, + "loss": 0.0717, + "step": 63400 + }, + { + "epoch": 4.1485116126921815, + "grad_norm": 0.7673255801200867, + "learning_rate": 7.703211112462731e-05, + "loss": 0.06, + "step": 63410 + }, + { + "epoch": 4.1491658488714425, + "grad_norm": 0.9685210585594177, + "learning_rate": 7.702438296939887e-05, + "loss": 0.0609, + "step": 63420 + }, + { + "epoch": 4.149820085050703, + "grad_norm": 0.7904552817344666, + "learning_rate": 7.701665390202661e-05, + "loss": 0.0636, + "step": 63430 + }, + { + "epoch": 4.150474321229964, + "grad_norm": 0.8309029936790466, + "learning_rate": 7.700892392277144e-05, + "loss": 0.0626, + "step": 63440 + }, + { + "epoch": 4.1511285574092245, + "grad_norm": 0.7211902141571045, + "learning_rate": 7.700119303189424e-05, + "loss": 0.0626, + "step": 63450 + }, + { + "epoch": 4.1517827935884855, + "grad_norm": 0.7313457131385803, + "learning_rate": 7.699346122965599e-05, + "loss": 0.0544, + "step": 63460 + }, + { + "epoch": 4.152437029767746, + "grad_norm": 0.9150106906890869, + "learning_rate": 7.698572851631761e-05, + "loss": 0.0679, + "step": 63470 + }, + { + "epoch": 4.1530912659470065, + "grad_norm": 1.2330702543258667, + "learning_rate": 7.69779948921401e-05, + "loss": 0.0696, + "step": 63480 + }, + { + "epoch": 4.1537455021262675, + "grad_norm": 0.9700013995170593, + "learning_rate": 7.697026035738454e-05, + "loss": 0.0679, + "step": 63490 + }, + { + "epoch": 4.1543997383055284, + "grad_norm": 0.8161741495132446, + "learning_rate": 7.696252491231197e-05, + "loss": 0.0617, + "step": 63500 + }, + { + "epoch": 4.155053974484789, + "grad_norm": 0.9676251411437988, + "learning_rate": 7.695478855718344e-05, + "loss": 0.0713, + "step": 63510 + }, + { + "epoch": 4.1557082106640495, + "grad_norm": 0.9095385074615479, + "learning_rate": 7.694705129226012e-05, + "loss": 0.063, + "step": 63520 + }, + { + "epoch": 4.1563624468433105, + "grad_norm": 0.7652897238731384, + "learning_rate": 7.693931311780315e-05, + "loss": 0.0679, + "step": 63530 + }, + { + "epoch": 4.157016683022571, + "grad_norm": 0.8652570247650146, + "learning_rate": 7.693157403407372e-05, + "loss": 0.0624, + "step": 63540 + }, + { + "epoch": 4.1576709192018315, + "grad_norm": 0.7441179752349854, + "learning_rate": 7.692383404133301e-05, + "loss": 0.0568, + "step": 63550 + }, + { + "epoch": 4.1583251553810925, + "grad_norm": 0.8349494338035583, + "learning_rate": 7.691609313984232e-05, + "loss": 0.0621, + "step": 63560 + }, + { + "epoch": 4.1589793915603535, + "grad_norm": 0.8901420831680298, + "learning_rate": 7.690835132986287e-05, + "loss": 0.0687, + "step": 63570 + }, + { + "epoch": 4.159633627739614, + "grad_norm": 0.8408117890357971, + "learning_rate": 7.690060861165601e-05, + "loss": 0.0641, + "step": 63580 + }, + { + "epoch": 4.1602878639188745, + "grad_norm": 0.8637434244155884, + "learning_rate": 7.689286498548304e-05, + "loss": 0.0681, + "step": 63590 + }, + { + "epoch": 4.1609421000981355, + "grad_norm": 0.8616120219230652, + "learning_rate": 7.688512045160538e-05, + "loss": 0.0649, + "step": 63600 + }, + { + "epoch": 4.1615963362773964, + "grad_norm": 0.8039658069610596, + "learning_rate": 7.687737501028438e-05, + "loss": 0.0656, + "step": 63610 + }, + { + "epoch": 4.1622505724566565, + "grad_norm": 0.9429184198379517, + "learning_rate": 7.686962866178147e-05, + "loss": 0.0668, + "step": 63620 + }, + { + "epoch": 4.1629048086359175, + "grad_norm": 0.9747768044471741, + "learning_rate": 7.686188140635815e-05, + "loss": 0.0672, + "step": 63630 + }, + { + "epoch": 4.1635590448151785, + "grad_norm": 1.0797864198684692, + "learning_rate": 7.685413324427588e-05, + "loss": 0.0729, + "step": 63640 + }, + { + "epoch": 4.1642132809944385, + "grad_norm": 1.0528181791305542, + "learning_rate": 7.684638417579617e-05, + "loss": 0.0679, + "step": 63650 + }, + { + "epoch": 4.1648675171736995, + "grad_norm": 1.1309136152267456, + "learning_rate": 7.68386342011806e-05, + "loss": 0.0744, + "step": 63660 + }, + { + "epoch": 4.1655217533529605, + "grad_norm": 1.1569007635116577, + "learning_rate": 7.683088332069073e-05, + "loss": 0.0766, + "step": 63670 + }, + { + "epoch": 4.1661759895322215, + "grad_norm": 0.8142968416213989, + "learning_rate": 7.682313153458817e-05, + "loss": 0.0634, + "step": 63680 + }, + { + "epoch": 4.1668302257114815, + "grad_norm": 1.0308458805084229, + "learning_rate": 7.68153788431346e-05, + "loss": 0.0671, + "step": 63690 + }, + { + "epoch": 4.1674844618907425, + "grad_norm": 0.8902773857116699, + "learning_rate": 7.680762524659167e-05, + "loss": 0.0575, + "step": 63700 + }, + { + "epoch": 4.1681386980700035, + "grad_norm": 1.0067930221557617, + "learning_rate": 7.679987074522107e-05, + "loss": 0.0612, + "step": 63710 + }, + { + "epoch": 4.168792934249264, + "grad_norm": 0.8824333548545837, + "learning_rate": 7.679211533928454e-05, + "loss": 0.0631, + "step": 63720 + }, + { + "epoch": 4.1694471704285245, + "grad_norm": 0.8123067021369934, + "learning_rate": 7.678435902904386e-05, + "loss": 0.0651, + "step": 63730 + }, + { + "epoch": 4.1701014066077855, + "grad_norm": 1.0830172300338745, + "learning_rate": 7.677660181476081e-05, + "loss": 0.0693, + "step": 63740 + }, + { + "epoch": 4.1707556427870465, + "grad_norm": 0.9839672446250916, + "learning_rate": 7.676884369669723e-05, + "loss": 0.0662, + "step": 63750 + }, + { + "epoch": 4.1714098789663066, + "grad_norm": 0.941388726234436, + "learning_rate": 7.676108467511498e-05, + "loss": 0.066, + "step": 63760 + }, + { + "epoch": 4.1720641151455675, + "grad_norm": 0.8638181090354919, + "learning_rate": 7.675332475027593e-05, + "loss": 0.0714, + "step": 63770 + }, + { + "epoch": 4.1727183513248285, + "grad_norm": 0.8280830979347229, + "learning_rate": 7.674556392244201e-05, + "loss": 0.0676, + "step": 63780 + }, + { + "epoch": 4.173372587504089, + "grad_norm": 0.7582976222038269, + "learning_rate": 7.673780219187518e-05, + "loss": 0.0591, + "step": 63790 + }, + { + "epoch": 4.1740268236833495, + "grad_norm": 0.7644749283790588, + "learning_rate": 7.673003955883737e-05, + "loss": 0.058, + "step": 63800 + }, + { + "epoch": 4.1746810598626105, + "grad_norm": 0.88578200340271, + "learning_rate": 7.672227602359064e-05, + "loss": 0.069, + "step": 63810 + }, + { + "epoch": 4.1753352960418715, + "grad_norm": 0.8269320726394653, + "learning_rate": 7.671451158639702e-05, + "loss": 0.0553, + "step": 63820 + }, + { + "epoch": 4.175989532221132, + "grad_norm": 0.791187584400177, + "learning_rate": 7.670674624751857e-05, + "loss": 0.0666, + "step": 63830 + }, + { + "epoch": 4.1766437684003925, + "grad_norm": 0.8734759092330933, + "learning_rate": 7.669898000721738e-05, + "loss": 0.0613, + "step": 63840 + }, + { + "epoch": 4.1772980045796535, + "grad_norm": 0.9316699504852295, + "learning_rate": 7.66912128657556e-05, + "loss": 0.0632, + "step": 63850 + }, + { + "epoch": 4.177952240758914, + "grad_norm": 0.9909458160400391, + "learning_rate": 7.668344482339539e-05, + "loss": 0.0668, + "step": 63860 + }, + { + "epoch": 4.1786064769381746, + "grad_norm": 1.1051509380340576, + "learning_rate": 7.667567588039895e-05, + "loss": 0.0661, + "step": 63870 + }, + { + "epoch": 4.1792607131174355, + "grad_norm": 0.8877078890800476, + "learning_rate": 7.666790603702846e-05, + "loss": 0.0608, + "step": 63880 + }, + { + "epoch": 4.1799149492966965, + "grad_norm": 0.7553628087043762, + "learning_rate": 7.666013529354621e-05, + "loss": 0.0662, + "step": 63890 + }, + { + "epoch": 4.180569185475957, + "grad_norm": 0.8676695823669434, + "learning_rate": 7.665236365021448e-05, + "loss": 0.0669, + "step": 63900 + }, + { + "epoch": 4.1812234216552175, + "grad_norm": 0.7713704109191895, + "learning_rate": 7.664459110729558e-05, + "loss": 0.0642, + "step": 63910 + }, + { + "epoch": 4.1818776578344785, + "grad_norm": 1.018075704574585, + "learning_rate": 7.663681766505187e-05, + "loss": 0.0665, + "step": 63920 + }, + { + "epoch": 4.182531894013739, + "grad_norm": 0.94621342420578, + "learning_rate": 7.662904332374567e-05, + "loss": 0.0613, + "step": 63930 + }, + { + "epoch": 4.183186130193, + "grad_norm": 0.9044004678726196, + "learning_rate": 7.662126808363946e-05, + "loss": 0.0602, + "step": 63940 + }, + { + "epoch": 4.1838403663722605, + "grad_norm": 0.8055490255355835, + "learning_rate": 7.661349194499561e-05, + "loss": 0.0643, + "step": 63950 + }, + { + "epoch": 4.1844946025515215, + "grad_norm": 0.9511623382568359, + "learning_rate": 7.660571490807662e-05, + "loss": 0.0748, + "step": 63960 + }, + { + "epoch": 4.185148838730782, + "grad_norm": 0.8675695061683655, + "learning_rate": 7.659793697314496e-05, + "loss": 0.0609, + "step": 63970 + }, + { + "epoch": 4.1858030749100426, + "grad_norm": 0.9198430180549622, + "learning_rate": 7.659015814046318e-05, + "loss": 0.0718, + "step": 63980 + }, + { + "epoch": 4.1864573110893035, + "grad_norm": 0.8027588129043579, + "learning_rate": 7.658237841029383e-05, + "loss": 0.0562, + "step": 63990 + }, + { + "epoch": 4.187111547268564, + "grad_norm": 0.7004244327545166, + "learning_rate": 7.657459778289949e-05, + "loss": 0.0602, + "step": 64000 + }, + { + "epoch": 4.187765783447825, + "grad_norm": 0.8982917070388794, + "learning_rate": 7.656681625854278e-05, + "loss": 0.0646, + "step": 64010 + }, + { + "epoch": 4.1884200196270855, + "grad_norm": 0.9718723297119141, + "learning_rate": 7.655903383748637e-05, + "loss": 0.0811, + "step": 64020 + }, + { + "epoch": 4.1890742558063465, + "grad_norm": 0.9556106925010681, + "learning_rate": 7.655125051999289e-05, + "loss": 0.0666, + "step": 64030 + }, + { + "epoch": 4.189728491985607, + "grad_norm": 0.8579035997390747, + "learning_rate": 7.654346630632507e-05, + "loss": 0.0681, + "step": 64040 + }, + { + "epoch": 4.190382728164868, + "grad_norm": 0.8351026177406311, + "learning_rate": 7.653568119674567e-05, + "loss": 0.0722, + "step": 64050 + }, + { + "epoch": 4.1910369643441285, + "grad_norm": 0.8565570116043091, + "learning_rate": 7.652789519151741e-05, + "loss": 0.0584, + "step": 64060 + }, + { + "epoch": 4.191691200523389, + "grad_norm": 0.9618107080459595, + "learning_rate": 7.652010829090312e-05, + "loss": 0.0592, + "step": 64070 + }, + { + "epoch": 4.19234543670265, + "grad_norm": 0.6823198199272156, + "learning_rate": 7.651232049516566e-05, + "loss": 0.0549, + "step": 64080 + }, + { + "epoch": 4.1929996728819106, + "grad_norm": 0.9959099292755127, + "learning_rate": 7.650453180456783e-05, + "loss": 0.068, + "step": 64090 + }, + { + "epoch": 4.193653909061171, + "grad_norm": 0.767525315284729, + "learning_rate": 7.649674221937252e-05, + "loss": 0.0582, + "step": 64100 + }, + { + "epoch": 4.194308145240432, + "grad_norm": 1.036657452583313, + "learning_rate": 7.64889517398427e-05, + "loss": 0.073, + "step": 64110 + }, + { + "epoch": 4.194962381419693, + "grad_norm": 1.0274145603179932, + "learning_rate": 7.648116036624126e-05, + "loss": 0.0697, + "step": 64120 + }, + { + "epoch": 4.1956166175989535, + "grad_norm": 0.8063427209854126, + "learning_rate": 7.647336809883124e-05, + "loss": 0.0559, + "step": 64130 + }, + { + "epoch": 4.196270853778214, + "grad_norm": 0.8468816876411438, + "learning_rate": 7.646557493787558e-05, + "loss": 0.0635, + "step": 64140 + }, + { + "epoch": 4.196925089957475, + "grad_norm": 0.9064197540283203, + "learning_rate": 7.645778088363738e-05, + "loss": 0.0647, + "step": 64150 + }, + { + "epoch": 4.197579326136736, + "grad_norm": 0.9117610454559326, + "learning_rate": 7.644998593637968e-05, + "loss": 0.0668, + "step": 64160 + }, + { + "epoch": 4.198233562315996, + "grad_norm": 0.6964724063873291, + "learning_rate": 7.64421900963656e-05, + "loss": 0.064, + "step": 64170 + }, + { + "epoch": 4.198887798495257, + "grad_norm": 0.8209680318832397, + "learning_rate": 7.643439336385824e-05, + "loss": 0.0636, + "step": 64180 + }, + { + "epoch": 4.199542034674518, + "grad_norm": 0.8586738705635071, + "learning_rate": 7.642659573912078e-05, + "loss": 0.0644, + "step": 64190 + }, + { + "epoch": 4.200196270853779, + "grad_norm": 0.8765914440155029, + "learning_rate": 7.641879722241643e-05, + "loss": 0.0718, + "step": 64200 + }, + { + "epoch": 4.200850507033039, + "grad_norm": 0.9443649649620056, + "learning_rate": 7.641099781400838e-05, + "loss": 0.0835, + "step": 64210 + }, + { + "epoch": 4.2015047432123, + "grad_norm": 1.0230027437210083, + "learning_rate": 7.640319751415987e-05, + "loss": 0.0697, + "step": 64220 + }, + { + "epoch": 4.202158979391561, + "grad_norm": 1.0767507553100586, + "learning_rate": 7.63953963231342e-05, + "loss": 0.0671, + "step": 64230 + }, + { + "epoch": 4.202813215570821, + "grad_norm": 0.9476938247680664, + "learning_rate": 7.63875942411947e-05, + "loss": 0.0624, + "step": 64240 + }, + { + "epoch": 4.203467451750082, + "grad_norm": 0.9079118967056274, + "learning_rate": 7.637979126860468e-05, + "loss": 0.0665, + "step": 64250 + }, + { + "epoch": 4.204121687929343, + "grad_norm": 0.7623441219329834, + "learning_rate": 7.637198740562752e-05, + "loss": 0.0564, + "step": 64260 + }, + { + "epoch": 4.204775924108604, + "grad_norm": 0.7987393736839294, + "learning_rate": 7.636418265252662e-05, + "loss": 0.0642, + "step": 64270 + }, + { + "epoch": 4.205430160287864, + "grad_norm": 0.7768983840942383, + "learning_rate": 7.635637700956542e-05, + "loss": 0.0689, + "step": 64280 + }, + { + "epoch": 4.206084396467125, + "grad_norm": 0.9263744354248047, + "learning_rate": 7.634857047700737e-05, + "loss": 0.0591, + "step": 64290 + }, + { + "epoch": 4.206738632646386, + "grad_norm": 0.9548198580741882, + "learning_rate": 7.634076305511598e-05, + "loss": 0.0626, + "step": 64300 + }, + { + "epoch": 4.207392868825646, + "grad_norm": 0.9324436783790588, + "learning_rate": 7.633295474415473e-05, + "loss": 0.062, + "step": 64310 + }, + { + "epoch": 4.208047105004907, + "grad_norm": 0.9058679342269897, + "learning_rate": 7.63251455443872e-05, + "loss": 0.0726, + "step": 64320 + }, + { + "epoch": 4.208701341184168, + "grad_norm": 0.6921377778053284, + "learning_rate": 7.631733545607697e-05, + "loss": 0.0582, + "step": 64330 + }, + { + "epoch": 4.209355577363429, + "grad_norm": 0.7973593473434448, + "learning_rate": 7.630952447948765e-05, + "loss": 0.0667, + "step": 64340 + }, + { + "epoch": 4.210009813542689, + "grad_norm": 0.8190951943397522, + "learning_rate": 7.630171261488289e-05, + "loss": 0.0559, + "step": 64350 + }, + { + "epoch": 4.21066404972195, + "grad_norm": 0.9391740560531616, + "learning_rate": 7.629389986252634e-05, + "loss": 0.0589, + "step": 64360 + }, + { + "epoch": 4.211318285901211, + "grad_norm": 0.8340182304382324, + "learning_rate": 7.628608622268171e-05, + "loss": 0.0663, + "step": 64370 + }, + { + "epoch": 4.211972522080471, + "grad_norm": 0.9111093878746033, + "learning_rate": 7.627827169561275e-05, + "loss": 0.0664, + "step": 64380 + }, + { + "epoch": 4.212626758259732, + "grad_norm": 0.8605757355690002, + "learning_rate": 7.627045628158318e-05, + "loss": 0.0681, + "step": 64390 + }, + { + "epoch": 4.213280994438993, + "grad_norm": 0.9816390872001648, + "learning_rate": 7.626263998085683e-05, + "loss": 0.0607, + "step": 64400 + }, + { + "epoch": 4.213935230618254, + "grad_norm": 0.8772796988487244, + "learning_rate": 7.625482279369749e-05, + "loss": 0.0548, + "step": 64410 + }, + { + "epoch": 4.214589466797514, + "grad_norm": 0.8973631858825684, + "learning_rate": 7.624700472036904e-05, + "loss": 0.0561, + "step": 64420 + }, + { + "epoch": 4.215243702976775, + "grad_norm": 1.0034244060516357, + "learning_rate": 7.623918576113533e-05, + "loss": 0.0648, + "step": 64430 + }, + { + "epoch": 4.215897939156036, + "grad_norm": 0.8082411885261536, + "learning_rate": 7.62313659162603e-05, + "loss": 0.0651, + "step": 64440 + }, + { + "epoch": 4.216552175335296, + "grad_norm": 0.8607401847839355, + "learning_rate": 7.622354518600786e-05, + "loss": 0.063, + "step": 64450 + }, + { + "epoch": 4.217206411514557, + "grad_norm": 1.0780029296875, + "learning_rate": 7.621572357064202e-05, + "loss": 0.0556, + "step": 64460 + }, + { + "epoch": 4.217860647693818, + "grad_norm": 0.8647873997688293, + "learning_rate": 7.620790107042674e-05, + "loss": 0.0654, + "step": 64470 + }, + { + "epoch": 4.218514883873079, + "grad_norm": 0.9308034181594849, + "learning_rate": 7.620007768562606e-05, + "loss": 0.0719, + "step": 64480 + }, + { + "epoch": 4.219169120052339, + "grad_norm": 1.1081721782684326, + "learning_rate": 7.619225341650404e-05, + "loss": 0.0765, + "step": 64490 + }, + { + "epoch": 4.2198233562316, + "grad_norm": 0.7883846163749695, + "learning_rate": 7.618442826332482e-05, + "loss": 0.0683, + "step": 64500 + }, + { + "epoch": 4.220477592410861, + "grad_norm": 0.6897490620613098, + "learning_rate": 7.617660222635243e-05, + "loss": 0.0579, + "step": 64510 + }, + { + "epoch": 4.221131828590121, + "grad_norm": 0.7844201326370239, + "learning_rate": 7.616877530585107e-05, + "loss": 0.0584, + "step": 64520 + }, + { + "epoch": 4.221786064769382, + "grad_norm": 0.8688981533050537, + "learning_rate": 7.616094750208493e-05, + "loss": 0.0738, + "step": 64530 + }, + { + "epoch": 4.222440300948643, + "grad_norm": 0.886189341545105, + "learning_rate": 7.61531188153182e-05, + "loss": 0.0587, + "step": 64540 + }, + { + "epoch": 4.223094537127903, + "grad_norm": 1.0094472169876099, + "learning_rate": 7.61452892458151e-05, + "loss": 0.0697, + "step": 64550 + }, + { + "epoch": 4.223748773307164, + "grad_norm": 1.0703370571136475, + "learning_rate": 7.613745879383995e-05, + "loss": 0.0695, + "step": 64560 + }, + { + "epoch": 4.224403009486425, + "grad_norm": 0.8577084541320801, + "learning_rate": 7.612962745965699e-05, + "loss": 0.0613, + "step": 64570 + }, + { + "epoch": 4.225057245665686, + "grad_norm": 0.6969327330589294, + "learning_rate": 7.612179524353058e-05, + "loss": 0.0734, + "step": 64580 + }, + { + "epoch": 4.225711481844946, + "grad_norm": 0.8476243019104004, + "learning_rate": 7.611396214572508e-05, + "loss": 0.0715, + "step": 64590 + }, + { + "epoch": 4.226365718024207, + "grad_norm": 0.8455610871315002, + "learning_rate": 7.610612816650488e-05, + "loss": 0.0605, + "step": 64600 + }, + { + "epoch": 4.227019954203468, + "grad_norm": 1.1364946365356445, + "learning_rate": 7.609829330613439e-05, + "loss": 0.0741, + "step": 64610 + }, + { + "epoch": 4.227674190382728, + "grad_norm": 1.1362121105194092, + "learning_rate": 7.609045756487805e-05, + "loss": 0.0667, + "step": 64620 + }, + { + "epoch": 4.228328426561989, + "grad_norm": 1.0451109409332275, + "learning_rate": 7.608262094300034e-05, + "loss": 0.0725, + "step": 64630 + }, + { + "epoch": 4.22898266274125, + "grad_norm": 0.9875683188438416, + "learning_rate": 7.607478344076577e-05, + "loss": 0.057, + "step": 64640 + }, + { + "epoch": 4.229636898920511, + "grad_norm": 1.0684120655059814, + "learning_rate": 7.606694505843887e-05, + "loss": 0.0645, + "step": 64650 + }, + { + "epoch": 4.230291135099771, + "grad_norm": 1.1099648475646973, + "learning_rate": 7.605910579628421e-05, + "loss": 0.0743, + "step": 64660 + }, + { + "epoch": 4.230945371279032, + "grad_norm": 1.0001264810562134, + "learning_rate": 7.60512656545664e-05, + "loss": 0.0664, + "step": 64670 + }, + { + "epoch": 4.231599607458293, + "grad_norm": 0.8117485046386719, + "learning_rate": 7.604342463355003e-05, + "loss": 0.0688, + "step": 64680 + }, + { + "epoch": 4.232253843637553, + "grad_norm": 0.9693712592124939, + "learning_rate": 7.60355827334998e-05, + "loss": 0.0747, + "step": 64690 + }, + { + "epoch": 4.232908079816814, + "grad_norm": 0.9077291488647461, + "learning_rate": 7.602773995468036e-05, + "loss": 0.0593, + "step": 64700 + }, + { + "epoch": 4.233562315996075, + "grad_norm": 1.1760419607162476, + "learning_rate": 7.601989629735643e-05, + "loss": 0.076, + "step": 64710 + }, + { + "epoch": 4.234216552175336, + "grad_norm": 0.9444860219955444, + "learning_rate": 7.601205176179279e-05, + "loss": 0.0658, + "step": 64720 + }, + { + "epoch": 4.234870788354596, + "grad_norm": 0.9393828511238098, + "learning_rate": 7.600420634825416e-05, + "loss": 0.0571, + "step": 64730 + }, + { + "epoch": 4.235525024533857, + "grad_norm": 1.003543496131897, + "learning_rate": 7.599636005700537e-05, + "loss": 0.0668, + "step": 64740 + }, + { + "epoch": 4.236179260713118, + "grad_norm": 0.8727512955665588, + "learning_rate": 7.598851288831124e-05, + "loss": 0.0624, + "step": 64750 + }, + { + "epoch": 4.236833496892378, + "grad_norm": 1.0206762552261353, + "learning_rate": 7.598066484243667e-05, + "loss": 0.0789, + "step": 64760 + }, + { + "epoch": 4.237487733071639, + "grad_norm": 1.1758549213409424, + "learning_rate": 7.597281591964649e-05, + "loss": 0.0747, + "step": 64770 + }, + { + "epoch": 4.2381419692509, + "grad_norm": 0.7732208371162415, + "learning_rate": 7.596496612020567e-05, + "loss": 0.0596, + "step": 64780 + }, + { + "epoch": 4.238796205430161, + "grad_norm": 0.8588206768035889, + "learning_rate": 7.595711544437917e-05, + "loss": 0.063, + "step": 64790 + }, + { + "epoch": 4.239450441609421, + "grad_norm": 0.811578094959259, + "learning_rate": 7.594926389243193e-05, + "loss": 0.0732, + "step": 64800 + }, + { + "epoch": 4.240104677788682, + "grad_norm": 0.7321622371673584, + "learning_rate": 7.594141146462897e-05, + "loss": 0.0645, + "step": 64810 + }, + { + "epoch": 4.240758913967943, + "grad_norm": 0.690522313117981, + "learning_rate": 7.593355816123535e-05, + "loss": 0.0696, + "step": 64820 + }, + { + "epoch": 4.241413150147203, + "grad_norm": 0.8633084297180176, + "learning_rate": 7.592570398251614e-05, + "loss": 0.0669, + "step": 64830 + }, + { + "epoch": 4.242067386326464, + "grad_norm": 0.8941366076469421, + "learning_rate": 7.591784892873642e-05, + "loss": 0.0604, + "step": 64840 + }, + { + "epoch": 4.242721622505725, + "grad_norm": 0.9090977311134338, + "learning_rate": 7.590999300016131e-05, + "loss": 0.0656, + "step": 64850 + }, + { + "epoch": 4.243375858684986, + "grad_norm": 0.734219491481781, + "learning_rate": 7.5902136197056e-05, + "loss": 0.0589, + "step": 64860 + }, + { + "epoch": 4.244030094864246, + "grad_norm": 0.9445002675056458, + "learning_rate": 7.589427851968567e-05, + "loss": 0.0696, + "step": 64870 + }, + { + "epoch": 4.244684331043507, + "grad_norm": 0.8005701899528503, + "learning_rate": 7.58864199683155e-05, + "loss": 0.0658, + "step": 64880 + }, + { + "epoch": 4.245338567222768, + "grad_norm": 1.0320810079574585, + "learning_rate": 7.58785605432108e-05, + "loss": 0.0689, + "step": 64890 + }, + { + "epoch": 4.245992803402028, + "grad_norm": 0.820569634437561, + "learning_rate": 7.58707002446368e-05, + "loss": 0.0665, + "step": 64900 + }, + { + "epoch": 4.246647039581289, + "grad_norm": 0.8624978065490723, + "learning_rate": 7.58628390728588e-05, + "loss": 0.0674, + "step": 64910 + }, + { + "epoch": 4.24730127576055, + "grad_norm": 0.9039542078971863, + "learning_rate": 7.58549770281422e-05, + "loss": 0.0871, + "step": 64920 + }, + { + "epoch": 4.247955511939811, + "grad_norm": 0.8915390968322754, + "learning_rate": 7.584711411075227e-05, + "loss": 0.0585, + "step": 64930 + }, + { + "epoch": 4.248609748119071, + "grad_norm": 0.9667330980300903, + "learning_rate": 7.583925032095447e-05, + "loss": 0.0598, + "step": 64940 + }, + { + "epoch": 4.249263984298332, + "grad_norm": 0.9445711970329285, + "learning_rate": 7.583138565901422e-05, + "loss": 0.0579, + "step": 64950 + }, + { + "epoch": 4.249918220477593, + "grad_norm": 1.0453894138336182, + "learning_rate": 7.582352012519694e-05, + "loss": 0.0719, + "step": 64960 + }, + { + "epoch": 4.250572456656853, + "grad_norm": 0.9736664891242981, + "learning_rate": 7.581565371976813e-05, + "loss": 0.063, + "step": 64970 + }, + { + "epoch": 4.251226692836114, + "grad_norm": 0.7672659158706665, + "learning_rate": 7.580778644299332e-05, + "loss": 0.0648, + "step": 64980 + }, + { + "epoch": 4.251880929015375, + "grad_norm": 0.7975163459777832, + "learning_rate": 7.579991829513802e-05, + "loss": 0.0604, + "step": 64990 + }, + { + "epoch": 4.252535165194635, + "grad_norm": 0.8647359013557434, + "learning_rate": 7.579204927646782e-05, + "loss": 0.0739, + "step": 65000 + }, + { + "epoch": 4.253189401373896, + "grad_norm": 0.7860682606697083, + "learning_rate": 7.57841793872483e-05, + "loss": 0.0621, + "step": 65010 + }, + { + "epoch": 4.253843637553157, + "grad_norm": 0.9216861128807068, + "learning_rate": 7.577630862774515e-05, + "loss": 0.0593, + "step": 65020 + }, + { + "epoch": 4.254497873732418, + "grad_norm": 0.9444064497947693, + "learning_rate": 7.576843699822394e-05, + "loss": 0.0583, + "step": 65030 + }, + { + "epoch": 4.255152109911678, + "grad_norm": 0.833625853061676, + "learning_rate": 7.57605644989504e-05, + "loss": 0.0653, + "step": 65040 + }, + { + "epoch": 4.255806346090939, + "grad_norm": 0.8485620617866516, + "learning_rate": 7.575269113019027e-05, + "loss": 0.0723, + "step": 65050 + }, + { + "epoch": 4.2564605822702, + "grad_norm": 0.9002768397331238, + "learning_rate": 7.574481689220926e-05, + "loss": 0.0667, + "step": 65060 + }, + { + "epoch": 4.257114818449461, + "grad_norm": 0.7625710964202881, + "learning_rate": 7.573694178527316e-05, + "loss": 0.0671, + "step": 65070 + }, + { + "epoch": 4.257769054628721, + "grad_norm": 0.912322998046875, + "learning_rate": 7.572906580964779e-05, + "loss": 0.0707, + "step": 65080 + }, + { + "epoch": 4.258423290807982, + "grad_norm": 0.8781670331954956, + "learning_rate": 7.572118896559896e-05, + "loss": 0.0731, + "step": 65090 + }, + { + "epoch": 4.259077526987243, + "grad_norm": 1.0701106786727905, + "learning_rate": 7.571331125339256e-05, + "loss": 0.062, + "step": 65100 + }, + { + "epoch": 4.259731763166503, + "grad_norm": 1.057981014251709, + "learning_rate": 7.570543267329446e-05, + "loss": 0.0619, + "step": 65110 + }, + { + "epoch": 4.260385999345764, + "grad_norm": 0.6521860361099243, + "learning_rate": 7.56975532255706e-05, + "loss": 0.0532, + "step": 65120 + }, + { + "epoch": 4.261040235525025, + "grad_norm": 0.7353628277778625, + "learning_rate": 7.568967291048692e-05, + "loss": 0.0675, + "step": 65130 + }, + { + "epoch": 4.261694471704285, + "grad_norm": 0.8143835067749023, + "learning_rate": 7.56817917283094e-05, + "loss": 0.0615, + "step": 65140 + }, + { + "epoch": 4.262348707883546, + "grad_norm": 0.7694095373153687, + "learning_rate": 7.567390967930406e-05, + "loss": 0.065, + "step": 65150 + }, + { + "epoch": 4.263002944062807, + "grad_norm": 1.1253188848495483, + "learning_rate": 7.566602676373694e-05, + "loss": 0.0745, + "step": 65160 + }, + { + "epoch": 4.263657180242068, + "grad_norm": 0.760852575302124, + "learning_rate": 7.56581429818741e-05, + "loss": 0.064, + "step": 65170 + }, + { + "epoch": 4.264311416421328, + "grad_norm": 0.808857262134552, + "learning_rate": 7.565025833398164e-05, + "loss": 0.0654, + "step": 65180 + }, + { + "epoch": 4.264965652600589, + "grad_norm": 0.7772423028945923, + "learning_rate": 7.56423728203257e-05, + "loss": 0.0645, + "step": 65190 + }, + { + "epoch": 4.26561988877985, + "grad_norm": 0.8490001559257507, + "learning_rate": 7.563448644117242e-05, + "loss": 0.0618, + "step": 65200 + }, + { + "epoch": 4.26627412495911, + "grad_norm": 0.776184618473053, + "learning_rate": 7.562659919678801e-05, + "loss": 0.0724, + "step": 65210 + }, + { + "epoch": 4.266928361138371, + "grad_norm": 0.765455424785614, + "learning_rate": 7.561871108743865e-05, + "loss": 0.0681, + "step": 65220 + }, + { + "epoch": 4.267582597317632, + "grad_norm": 0.8536463975906372, + "learning_rate": 7.561082211339062e-05, + "loss": 0.0693, + "step": 65230 + }, + { + "epoch": 4.268236833496893, + "grad_norm": 0.7764464616775513, + "learning_rate": 7.560293227491017e-05, + "loss": 0.0532, + "step": 65240 + }, + { + "epoch": 4.268891069676153, + "grad_norm": 0.8680564165115356, + "learning_rate": 7.55950415722636e-05, + "loss": 0.0577, + "step": 65250 + }, + { + "epoch": 4.269545305855414, + "grad_norm": 0.8063961267471313, + "learning_rate": 7.558715000571726e-05, + "loss": 0.0709, + "step": 65260 + }, + { + "epoch": 4.270199542034675, + "grad_norm": 0.8812915086746216, + "learning_rate": 7.55792575755375e-05, + "loss": 0.0586, + "step": 65270 + }, + { + "epoch": 4.270853778213935, + "grad_norm": 0.7072269320487976, + "learning_rate": 7.55713642819907e-05, + "loss": 0.071, + "step": 65280 + }, + { + "epoch": 4.271508014393196, + "grad_norm": 0.9915341138839722, + "learning_rate": 7.55634701253433e-05, + "loss": 0.0548, + "step": 65290 + }, + { + "epoch": 4.272162250572457, + "grad_norm": 0.9940981864929199, + "learning_rate": 7.555557510586175e-05, + "loss": 0.0664, + "step": 65300 + }, + { + "epoch": 4.272816486751718, + "grad_norm": 1.065885305404663, + "learning_rate": 7.554767922381253e-05, + "loss": 0.0663, + "step": 65310 + }, + { + "epoch": 4.273470722930978, + "grad_norm": 0.7399426102638245, + "learning_rate": 7.553978247946212e-05, + "loss": 0.0616, + "step": 65320 + }, + { + "epoch": 4.274124959110239, + "grad_norm": 0.7760506868362427, + "learning_rate": 7.553188487307705e-05, + "loss": 0.0592, + "step": 65330 + }, + { + "epoch": 4.2747791952895, + "grad_norm": 0.9584298133850098, + "learning_rate": 7.552398640492393e-05, + "loss": 0.0646, + "step": 65340 + }, + { + "epoch": 4.27543343146876, + "grad_norm": 0.807236909866333, + "learning_rate": 7.551608707526933e-05, + "loss": 0.0694, + "step": 65350 + }, + { + "epoch": 4.276087667648021, + "grad_norm": 0.9472173452377319, + "learning_rate": 7.550818688437986e-05, + "loss": 0.0608, + "step": 65360 + }, + { + "epoch": 4.276741903827282, + "grad_norm": 0.9590891003608704, + "learning_rate": 7.55002858325222e-05, + "loss": 0.0615, + "step": 65370 + }, + { + "epoch": 4.277396140006543, + "grad_norm": 0.8383166193962097, + "learning_rate": 7.549238391996302e-05, + "loss": 0.0632, + "step": 65380 + }, + { + "epoch": 4.278050376185803, + "grad_norm": 0.7799770832061768, + "learning_rate": 7.5484481146969e-05, + "loss": 0.0631, + "step": 65390 + }, + { + "epoch": 4.278704612365064, + "grad_norm": 1.1362422704696655, + "learning_rate": 7.547657751380694e-05, + "loss": 0.0738, + "step": 65400 + }, + { + "epoch": 4.279358848544325, + "grad_norm": 0.9604383111000061, + "learning_rate": 7.546867302074354e-05, + "loss": 0.0579, + "step": 65410 + }, + { + "epoch": 4.280013084723585, + "grad_norm": 0.9436408877372742, + "learning_rate": 7.546076766804567e-05, + "loss": 0.0669, + "step": 65420 + }, + { + "epoch": 4.280667320902846, + "grad_norm": 0.8412542939186096, + "learning_rate": 7.545286145598007e-05, + "loss": 0.0744, + "step": 65430 + }, + { + "epoch": 4.281321557082107, + "grad_norm": 0.7971706986427307, + "learning_rate": 7.544495438481367e-05, + "loss": 0.0613, + "step": 65440 + }, + { + "epoch": 4.281975793261367, + "grad_norm": 0.8335549235343933, + "learning_rate": 7.543704645481333e-05, + "loss": 0.0696, + "step": 65450 + }, + { + "epoch": 4.282630029440628, + "grad_norm": 0.7970999479293823, + "learning_rate": 7.542913766624596e-05, + "loss": 0.0631, + "step": 65460 + }, + { + "epoch": 4.283284265619889, + "grad_norm": 0.990964949131012, + "learning_rate": 7.542122801937849e-05, + "loss": 0.0677, + "step": 65470 + }, + { + "epoch": 4.28393850179915, + "grad_norm": 1.0644537210464478, + "learning_rate": 7.541331751447792e-05, + "loss": 0.0689, + "step": 65480 + }, + { + "epoch": 4.28459273797841, + "grad_norm": 0.9389525055885315, + "learning_rate": 7.540540615181123e-05, + "loss": 0.0643, + "step": 65490 + }, + { + "epoch": 4.285246974157671, + "grad_norm": 0.8526713252067566, + "learning_rate": 7.539749393164546e-05, + "loss": 0.0662, + "step": 65500 + }, + { + "epoch": 4.285901210336932, + "grad_norm": 0.9678772687911987, + "learning_rate": 7.538958085424765e-05, + "loss": 0.065, + "step": 65510 + }, + { + "epoch": 4.286555446516193, + "grad_norm": 0.8349334001541138, + "learning_rate": 7.53816669198849e-05, + "loss": 0.0737, + "step": 65520 + }, + { + "epoch": 4.287209682695453, + "grad_norm": 0.8528868556022644, + "learning_rate": 7.537375212882433e-05, + "loss": 0.0585, + "step": 65530 + }, + { + "epoch": 4.287863918874714, + "grad_norm": 0.7921110987663269, + "learning_rate": 7.536583648133311e-05, + "loss": 0.0621, + "step": 65540 + }, + { + "epoch": 4.288518155053975, + "grad_norm": 1.04022216796875, + "learning_rate": 7.535791997767834e-05, + "loss": 0.061, + "step": 65550 + }, + { + "epoch": 4.289172391233235, + "grad_norm": 0.7660422325134277, + "learning_rate": 7.535000261812729e-05, + "loss": 0.0618, + "step": 65560 + }, + { + "epoch": 4.289826627412496, + "grad_norm": 1.0147818326950073, + "learning_rate": 7.534208440294717e-05, + "loss": 0.0584, + "step": 65570 + }, + { + "epoch": 4.290480863591757, + "grad_norm": 0.7676869630813599, + "learning_rate": 7.533416533240523e-05, + "loss": 0.0664, + "step": 65580 + }, + { + "epoch": 4.291135099771017, + "grad_norm": 0.7164015173912048, + "learning_rate": 7.532624540676876e-05, + "loss": 0.0643, + "step": 65590 + }, + { + "epoch": 4.291789335950278, + "grad_norm": 0.7891098856925964, + "learning_rate": 7.53183246263051e-05, + "loss": 0.0799, + "step": 65600 + }, + { + "epoch": 4.292443572129539, + "grad_norm": 0.8349225521087646, + "learning_rate": 7.531040299128158e-05, + "loss": 0.0689, + "step": 65610 + }, + { + "epoch": 4.2930978083088, + "grad_norm": 0.9937372803688049, + "learning_rate": 7.530248050196557e-05, + "loss": 0.0657, + "step": 65620 + }, + { + "epoch": 4.29375204448806, + "grad_norm": 0.9106799960136414, + "learning_rate": 7.529455715862452e-05, + "loss": 0.0684, + "step": 65630 + }, + { + "epoch": 4.294406280667321, + "grad_norm": 0.9614897966384888, + "learning_rate": 7.52866329615258e-05, + "loss": 0.0642, + "step": 65640 + }, + { + "epoch": 4.295060516846582, + "grad_norm": 0.914214015007019, + "learning_rate": 7.527870791093691e-05, + "loss": 0.0648, + "step": 65650 + }, + { + "epoch": 4.295714753025842, + "grad_norm": 0.8048765063285828, + "learning_rate": 7.527078200712533e-05, + "loss": 0.0655, + "step": 65660 + }, + { + "epoch": 4.296368989205103, + "grad_norm": 0.9100319743156433, + "learning_rate": 7.526285525035858e-05, + "loss": 0.0678, + "step": 65670 + }, + { + "epoch": 4.297023225384364, + "grad_norm": 0.9633187055587769, + "learning_rate": 7.52549276409042e-05, + "loss": 0.0681, + "step": 65680 + }, + { + "epoch": 4.297677461563625, + "grad_norm": 0.7984033823013306, + "learning_rate": 7.52469991790298e-05, + "loss": 0.0609, + "step": 65690 + }, + { + "epoch": 4.298331697742885, + "grad_norm": 0.9760509729385376, + "learning_rate": 7.523906986500296e-05, + "loss": 0.0739, + "step": 65700 + }, + { + "epoch": 4.298985933922146, + "grad_norm": 1.10667085647583, + "learning_rate": 7.52311396990913e-05, + "loss": 0.0653, + "step": 65710 + }, + { + "epoch": 4.299640170101407, + "grad_norm": 1.0350356101989746, + "learning_rate": 7.522320868156253e-05, + "loss": 0.0656, + "step": 65720 + }, + { + "epoch": 4.300294406280667, + "grad_norm": 1.0190367698669434, + "learning_rate": 7.521527681268431e-05, + "loss": 0.0714, + "step": 65730 + }, + { + "epoch": 4.300948642459928, + "grad_norm": 0.7813534140586853, + "learning_rate": 7.520734409272437e-05, + "loss": 0.0592, + "step": 65740 + }, + { + "epoch": 4.301602878639189, + "grad_norm": 0.9878474473953247, + "learning_rate": 7.519941052195045e-05, + "loss": 0.0642, + "step": 65750 + }, + { + "epoch": 4.30225711481845, + "grad_norm": 0.7413721680641174, + "learning_rate": 7.519147610063035e-05, + "loss": 0.0593, + "step": 65760 + }, + { + "epoch": 4.30291135099771, + "grad_norm": 0.8811038732528687, + "learning_rate": 7.518354082903184e-05, + "loss": 0.0664, + "step": 65770 + }, + { + "epoch": 4.303565587176971, + "grad_norm": 0.9010792970657349, + "learning_rate": 7.517560470742279e-05, + "loss": 0.0669, + "step": 65780 + }, + { + "epoch": 4.304219823356232, + "grad_norm": 0.8902906775474548, + "learning_rate": 7.516766773607107e-05, + "loss": 0.068, + "step": 65790 + }, + { + "epoch": 4.304874059535492, + "grad_norm": 0.9187259674072266, + "learning_rate": 7.515972991524454e-05, + "loss": 0.0681, + "step": 65800 + }, + { + "epoch": 4.305528295714753, + "grad_norm": 0.7792700529098511, + "learning_rate": 7.515179124521116e-05, + "loss": 0.0599, + "step": 65810 + }, + { + "epoch": 4.306182531894014, + "grad_norm": 0.7929428815841675, + "learning_rate": 7.514385172623886e-05, + "loss": 0.0596, + "step": 65820 + }, + { + "epoch": 4.306836768073275, + "grad_norm": 0.9037280678749084, + "learning_rate": 7.513591135859561e-05, + "loss": 0.0637, + "step": 65830 + }, + { + "epoch": 4.307491004252535, + "grad_norm": 0.8752690553665161, + "learning_rate": 7.512797014254944e-05, + "loss": 0.0617, + "step": 65840 + }, + { + "epoch": 4.308145240431796, + "grad_norm": 0.9982262253761292, + "learning_rate": 7.512002807836838e-05, + "loss": 0.0617, + "step": 65850 + }, + { + "epoch": 4.308799476611057, + "grad_norm": 1.1322745084762573, + "learning_rate": 7.511208516632047e-05, + "loss": 0.0665, + "step": 65860 + }, + { + "epoch": 4.309453712790317, + "grad_norm": 0.8582587242126465, + "learning_rate": 7.510414140667385e-05, + "loss": 0.0717, + "step": 65870 + }, + { + "epoch": 4.310107948969578, + "grad_norm": 1.0602346658706665, + "learning_rate": 7.50961967996966e-05, + "loss": 0.0662, + "step": 65880 + }, + { + "epoch": 4.310762185148839, + "grad_norm": 0.9373438954353333, + "learning_rate": 7.508825134565692e-05, + "loss": 0.0596, + "step": 65890 + }, + { + "epoch": 4.311416421328099, + "grad_norm": 0.8777552843093872, + "learning_rate": 7.508030504482296e-05, + "loss": 0.0611, + "step": 65900 + }, + { + "epoch": 4.31207065750736, + "grad_norm": 0.8960667848587036, + "learning_rate": 7.50723578974629e-05, + "loss": 0.0631, + "step": 65910 + }, + { + "epoch": 4.312724893686621, + "grad_norm": 1.027182698249817, + "learning_rate": 7.506440990384502e-05, + "loss": 0.0668, + "step": 65920 + }, + { + "epoch": 4.313379129865882, + "grad_norm": 0.7670314311981201, + "learning_rate": 7.505646106423756e-05, + "loss": 0.0557, + "step": 65930 + }, + { + "epoch": 4.314033366045142, + "grad_norm": 0.9890926480293274, + "learning_rate": 7.504851137890885e-05, + "loss": 0.0718, + "step": 65940 + }, + { + "epoch": 4.314687602224403, + "grad_norm": 0.7140153050422668, + "learning_rate": 7.504056084812718e-05, + "loss": 0.0695, + "step": 65950 + }, + { + "epoch": 4.315341838403664, + "grad_norm": 0.8227149844169617, + "learning_rate": 7.50326094721609e-05, + "loss": 0.0566, + "step": 65960 + }, + { + "epoch": 4.315996074582925, + "grad_norm": 1.0063236951828003, + "learning_rate": 7.502465725127839e-05, + "loss": 0.0601, + "step": 65970 + }, + { + "epoch": 4.316650310762185, + "grad_norm": 0.8858514428138733, + "learning_rate": 7.501670418574808e-05, + "loss": 0.0561, + "step": 65980 + }, + { + "epoch": 4.317304546941446, + "grad_norm": 0.9521268606185913, + "learning_rate": 7.500875027583843e-05, + "loss": 0.0743, + "step": 65990 + }, + { + "epoch": 4.317958783120707, + "grad_norm": 0.860778272151947, + "learning_rate": 7.500079552181782e-05, + "loss": 0.056, + "step": 66000 + }, + { + "epoch": 4.318613019299967, + "grad_norm": 0.6561046838760376, + "learning_rate": 7.499283992395483e-05, + "loss": 0.0579, + "step": 66010 + }, + { + "epoch": 4.319267255479228, + "grad_norm": 0.9803190231323242, + "learning_rate": 7.498488348251794e-05, + "loss": 0.0664, + "step": 66020 + }, + { + "epoch": 4.319921491658489, + "grad_norm": 0.782440185546875, + "learning_rate": 7.497692619777568e-05, + "loss": 0.067, + "step": 66030 + }, + { + "epoch": 4.320575727837749, + "grad_norm": 0.900099515914917, + "learning_rate": 7.496896806999667e-05, + "loss": 0.0708, + "step": 66040 + }, + { + "epoch": 4.32122996401701, + "grad_norm": 0.7509415745735168, + "learning_rate": 7.496100909944952e-05, + "loss": 0.0639, + "step": 66050 + }, + { + "epoch": 4.321884200196271, + "grad_norm": 0.8425498008728027, + "learning_rate": 7.495304928640284e-05, + "loss": 0.0697, + "step": 66060 + }, + { + "epoch": 4.322538436375532, + "grad_norm": 0.8149582147598267, + "learning_rate": 7.494508863112529e-05, + "loss": 0.0615, + "step": 66070 + }, + { + "epoch": 4.323192672554792, + "grad_norm": 0.8617916703224182, + "learning_rate": 7.49371271338856e-05, + "loss": 0.0663, + "step": 66080 + }, + { + "epoch": 4.323846908734053, + "grad_norm": 0.7951449155807495, + "learning_rate": 7.492916479495246e-05, + "loss": 0.061, + "step": 66090 + }, + { + "epoch": 4.324501144913314, + "grad_norm": 0.9237793684005737, + "learning_rate": 7.492120161459463e-05, + "loss": 0.0655, + "step": 66100 + }, + { + "epoch": 4.325155381092574, + "grad_norm": 0.8202549815177917, + "learning_rate": 7.491323759308089e-05, + "loss": 0.0588, + "step": 66110 + }, + { + "epoch": 4.325809617271835, + "grad_norm": 0.8115940690040588, + "learning_rate": 7.490527273068003e-05, + "loss": 0.0609, + "step": 66120 + }, + { + "epoch": 4.326463853451096, + "grad_norm": 0.8623285889625549, + "learning_rate": 7.489730702766092e-05, + "loss": 0.0694, + "step": 66130 + }, + { + "epoch": 4.327118089630357, + "grad_norm": 0.9182894825935364, + "learning_rate": 7.488934048429239e-05, + "loss": 0.0687, + "step": 66140 + }, + { + "epoch": 4.327772325809617, + "grad_norm": 0.8562094569206238, + "learning_rate": 7.488137310084334e-05, + "loss": 0.0567, + "step": 66150 + }, + { + "epoch": 4.328426561988878, + "grad_norm": 0.6775592565536499, + "learning_rate": 7.487340487758271e-05, + "loss": 0.0616, + "step": 66160 + }, + { + "epoch": 4.329080798168139, + "grad_norm": 0.9210445880889893, + "learning_rate": 7.486543581477942e-05, + "loss": 0.066, + "step": 66170 + }, + { + "epoch": 4.329735034347399, + "grad_norm": 0.8640944957733154, + "learning_rate": 7.485746591270247e-05, + "loss": 0.0679, + "step": 66180 + }, + { + "epoch": 4.33038927052666, + "grad_norm": 0.9306260943412781, + "learning_rate": 7.484949517162083e-05, + "loss": 0.0677, + "step": 66190 + }, + { + "epoch": 4.331043506705921, + "grad_norm": 0.9359939098358154, + "learning_rate": 7.484152359180358e-05, + "loss": 0.063, + "step": 66200 + }, + { + "epoch": 4.331697742885182, + "grad_norm": 0.6757792830467224, + "learning_rate": 7.483355117351975e-05, + "loss": 0.0645, + "step": 66210 + }, + { + "epoch": 4.332351979064442, + "grad_norm": 1.1225383281707764, + "learning_rate": 7.482557791703843e-05, + "loss": 0.0551, + "step": 66220 + }, + { + "epoch": 4.333006215243703, + "grad_norm": 0.8238813877105713, + "learning_rate": 7.481760382262876e-05, + "loss": 0.0648, + "step": 66230 + }, + { + "epoch": 4.333660451422964, + "grad_norm": 0.8821508288383484, + "learning_rate": 7.480962889055989e-05, + "loss": 0.064, + "step": 66240 + }, + { + "epoch": 4.334314687602224, + "grad_norm": 0.8771295547485352, + "learning_rate": 7.480165312110096e-05, + "loss": 0.0546, + "step": 66250 + }, + { + "epoch": 4.334968923781485, + "grad_norm": 0.9171880483627319, + "learning_rate": 7.479367651452119e-05, + "loss": 0.0613, + "step": 66260 + }, + { + "epoch": 4.335623159960746, + "grad_norm": 0.8392135500907898, + "learning_rate": 7.478569907108983e-05, + "loss": 0.0664, + "step": 66270 + }, + { + "epoch": 4.336277396140007, + "grad_norm": 0.8099778890609741, + "learning_rate": 7.477772079107612e-05, + "loss": 0.0602, + "step": 66280 + }, + { + "epoch": 4.336931632319267, + "grad_norm": 0.8127948641777039, + "learning_rate": 7.476974167474934e-05, + "loss": 0.0604, + "step": 66290 + }, + { + "epoch": 4.337585868498528, + "grad_norm": 0.9666029214859009, + "learning_rate": 7.476176172237883e-05, + "loss": 0.061, + "step": 66300 + }, + { + "epoch": 4.338240104677789, + "grad_norm": 0.9391130208969116, + "learning_rate": 7.475378093423391e-05, + "loss": 0.0647, + "step": 66310 + }, + { + "epoch": 4.338894340857049, + "grad_norm": 0.7683467864990234, + "learning_rate": 7.474579931058397e-05, + "loss": 0.0622, + "step": 66320 + }, + { + "epoch": 4.33954857703631, + "grad_norm": 0.8079342246055603, + "learning_rate": 7.47378168516984e-05, + "loss": 0.0572, + "step": 66330 + }, + { + "epoch": 4.340202813215571, + "grad_norm": 1.2922800779342651, + "learning_rate": 7.472983355784664e-05, + "loss": 0.0781, + "step": 66340 + }, + { + "epoch": 4.340857049394831, + "grad_norm": 0.7486710548400879, + "learning_rate": 7.472184942929815e-05, + "loss": 0.0631, + "step": 66350 + }, + { + "epoch": 4.341511285574092, + "grad_norm": 0.7846554517745972, + "learning_rate": 7.471386446632238e-05, + "loss": 0.063, + "step": 66360 + }, + { + "epoch": 4.342165521753353, + "grad_norm": 0.6966447234153748, + "learning_rate": 7.470587866918889e-05, + "loss": 0.0614, + "step": 66370 + }, + { + "epoch": 4.342819757932614, + "grad_norm": 0.8768433928489685, + "learning_rate": 7.469789203816719e-05, + "loss": 0.0598, + "step": 66380 + }, + { + "epoch": 4.343473994111874, + "grad_norm": 1.1041473150253296, + "learning_rate": 7.468990457352687e-05, + "loss": 0.0684, + "step": 66390 + }, + { + "epoch": 4.344128230291135, + "grad_norm": 0.9726035594940186, + "learning_rate": 7.468191627553753e-05, + "loss": 0.0635, + "step": 66400 + }, + { + "epoch": 4.344782466470396, + "grad_norm": 0.9272595643997192, + "learning_rate": 7.467392714446876e-05, + "loss": 0.0695, + "step": 66410 + }, + { + "epoch": 4.345436702649657, + "grad_norm": 0.8459830284118652, + "learning_rate": 7.466593718059026e-05, + "loss": 0.0527, + "step": 66420 + }, + { + "epoch": 4.346090938828917, + "grad_norm": 0.9491352438926697, + "learning_rate": 7.465794638417167e-05, + "loss": 0.0611, + "step": 66430 + }, + { + "epoch": 4.346745175008178, + "grad_norm": 0.9533461928367615, + "learning_rate": 7.464995475548275e-05, + "loss": 0.0687, + "step": 66440 + }, + { + "epoch": 4.347399411187439, + "grad_norm": 0.8215408325195312, + "learning_rate": 7.464196229479317e-05, + "loss": 0.0617, + "step": 66450 + }, + { + "epoch": 4.348053647366699, + "grad_norm": 0.7235555052757263, + "learning_rate": 7.463396900237277e-05, + "loss": 0.058, + "step": 66460 + }, + { + "epoch": 4.34870788354596, + "grad_norm": 0.8705189824104309, + "learning_rate": 7.462597487849131e-05, + "loss": 0.0595, + "step": 66470 + }, + { + "epoch": 4.349362119725221, + "grad_norm": 0.906433641910553, + "learning_rate": 7.461797992341861e-05, + "loss": 0.0626, + "step": 66480 + }, + { + "epoch": 4.350016355904481, + "grad_norm": 0.7737306356430054, + "learning_rate": 7.460998413742451e-05, + "loss": 0.0704, + "step": 66490 + }, + { + "epoch": 4.350670592083742, + "grad_norm": 0.782434344291687, + "learning_rate": 7.460198752077892e-05, + "loss": 0.0603, + "step": 66500 + }, + { + "epoch": 4.351324828263003, + "grad_norm": 0.7962082028388977, + "learning_rate": 7.459399007375172e-05, + "loss": 0.0656, + "step": 66510 + }, + { + "epoch": 4.351979064442264, + "grad_norm": 0.8921648263931274, + "learning_rate": 7.458599179661286e-05, + "loss": 0.0634, + "step": 66520 + }, + { + "epoch": 4.352633300621524, + "grad_norm": 0.868022084236145, + "learning_rate": 7.45779926896323e-05, + "loss": 0.0594, + "step": 66530 + }, + { + "epoch": 4.353287536800785, + "grad_norm": 1.0638127326965332, + "learning_rate": 7.456999275308002e-05, + "loss": 0.0655, + "step": 66540 + }, + { + "epoch": 4.353941772980046, + "grad_norm": 0.8913347721099854, + "learning_rate": 7.456199198722604e-05, + "loss": 0.056, + "step": 66550 + }, + { + "epoch": 4.354596009159306, + "grad_norm": 0.9430878162384033, + "learning_rate": 7.455399039234043e-05, + "loss": 0.0648, + "step": 66560 + }, + { + "epoch": 4.355250245338567, + "grad_norm": 1.181059718132019, + "learning_rate": 7.454598796869325e-05, + "loss": 0.0701, + "step": 66570 + }, + { + "epoch": 4.355904481517828, + "grad_norm": 0.9202277064323425, + "learning_rate": 7.45379847165546e-05, + "loss": 0.0633, + "step": 66580 + }, + { + "epoch": 4.356558717697089, + "grad_norm": 0.9049434661865234, + "learning_rate": 7.45299806361946e-05, + "loss": 0.0625, + "step": 66590 + }, + { + "epoch": 4.357212953876349, + "grad_norm": 1.0947158336639404, + "learning_rate": 7.452197572788345e-05, + "loss": 0.0639, + "step": 66600 + }, + { + "epoch": 4.35786719005561, + "grad_norm": 0.9060896635055542, + "learning_rate": 7.451396999189129e-05, + "loss": 0.0599, + "step": 66610 + }, + { + "epoch": 4.358521426234871, + "grad_norm": 0.9136892557144165, + "learning_rate": 7.450596342848835e-05, + "loss": 0.0671, + "step": 66620 + }, + { + "epoch": 4.359175662414131, + "grad_norm": 0.7498853802680969, + "learning_rate": 7.449795603794487e-05, + "loss": 0.0585, + "step": 66630 + }, + { + "epoch": 4.359829898593392, + "grad_norm": 0.9020270705223083, + "learning_rate": 7.448994782053114e-05, + "loss": 0.0678, + "step": 66640 + }, + { + "epoch": 4.360484134772653, + "grad_norm": 0.9565746188163757, + "learning_rate": 7.448193877651743e-05, + "loss": 0.0662, + "step": 66650 + }, + { + "epoch": 4.361138370951914, + "grad_norm": 0.9026892185211182, + "learning_rate": 7.447392890617408e-05, + "loss": 0.0647, + "step": 66660 + }, + { + "epoch": 4.361792607131174, + "grad_norm": 1.156490683555603, + "learning_rate": 7.446591820977144e-05, + "loss": 0.0648, + "step": 66670 + }, + { + "epoch": 4.362446843310435, + "grad_norm": 0.8447253704071045, + "learning_rate": 7.445790668757992e-05, + "loss": 0.0662, + "step": 66680 + }, + { + "epoch": 4.363101079489696, + "grad_norm": 0.8401336073875427, + "learning_rate": 7.44498943398699e-05, + "loss": 0.0621, + "step": 66690 + }, + { + "epoch": 4.363755315668956, + "grad_norm": 0.88148033618927, + "learning_rate": 7.44418811669118e-05, + "loss": 0.0688, + "step": 66700 + }, + { + "epoch": 4.364409551848217, + "grad_norm": 1.0117405652999878, + "learning_rate": 7.443386716897614e-05, + "loss": 0.0547, + "step": 66710 + }, + { + "epoch": 4.365063788027478, + "grad_norm": 0.9008330702781677, + "learning_rate": 7.442585234633337e-05, + "loss": 0.0676, + "step": 66720 + }, + { + "epoch": 4.365718024206739, + "grad_norm": 0.8661233186721802, + "learning_rate": 7.441783669925402e-05, + "loss": 0.0613, + "step": 66730 + }, + { + "epoch": 4.366372260385999, + "grad_norm": 0.849505603313446, + "learning_rate": 7.440982022800864e-05, + "loss": 0.0652, + "step": 66740 + }, + { + "epoch": 4.36702649656526, + "grad_norm": 0.7633360028266907, + "learning_rate": 7.440180293286783e-05, + "loss": 0.0601, + "step": 66750 + }, + { + "epoch": 4.367680732744521, + "grad_norm": 0.9463858008384705, + "learning_rate": 7.439378481410215e-05, + "loss": 0.0651, + "step": 66760 + }, + { + "epoch": 4.368334968923781, + "grad_norm": 0.9335405826568604, + "learning_rate": 7.438576587198228e-05, + "loss": 0.0639, + "step": 66770 + }, + { + "epoch": 4.368989205103042, + "grad_norm": 0.8489439487457275, + "learning_rate": 7.437774610677884e-05, + "loss": 0.057, + "step": 66780 + }, + { + "epoch": 4.369643441282303, + "grad_norm": 0.9657695889472961, + "learning_rate": 7.436972551876255e-05, + "loss": 0.0632, + "step": 66790 + }, + { + "epoch": 4.370297677461563, + "grad_norm": 0.8305515050888062, + "learning_rate": 7.43617041082041e-05, + "loss": 0.0664, + "step": 66800 + }, + { + "epoch": 4.370951913640824, + "grad_norm": 1.0466861724853516, + "learning_rate": 7.435368187537424e-05, + "loss": 0.057, + "step": 66810 + }, + { + "epoch": 4.371606149820085, + "grad_norm": 1.0870552062988281, + "learning_rate": 7.434565882054377e-05, + "loss": 0.0614, + "step": 66820 + }, + { + "epoch": 4.372260385999346, + "grad_norm": 0.9726265072822571, + "learning_rate": 7.433763494398345e-05, + "loss": 0.0579, + "step": 66830 + }, + { + "epoch": 4.372914622178606, + "grad_norm": 0.8406111001968384, + "learning_rate": 7.432961024596413e-05, + "loss": 0.0625, + "step": 66840 + }, + { + "epoch": 4.373568858357867, + "grad_norm": 0.9154638648033142, + "learning_rate": 7.432158472675665e-05, + "loss": 0.0695, + "step": 66850 + }, + { + "epoch": 4.374223094537128, + "grad_norm": 0.8148864507675171, + "learning_rate": 7.43135583866319e-05, + "loss": 0.0657, + "step": 66860 + }, + { + "epoch": 4.374877330716389, + "grad_norm": 1.0831327438354492, + "learning_rate": 7.430553122586079e-05, + "loss": 0.0759, + "step": 66870 + }, + { + "epoch": 4.375531566895649, + "grad_norm": 0.7323132157325745, + "learning_rate": 7.429750324471425e-05, + "loss": 0.0575, + "step": 66880 + }, + { + "epoch": 4.37618580307491, + "grad_norm": 0.7715146541595459, + "learning_rate": 7.428947444346327e-05, + "loss": 0.0591, + "step": 66890 + }, + { + "epoch": 4.376840039254171, + "grad_norm": 0.7162055969238281, + "learning_rate": 7.428144482237882e-05, + "loss": 0.0633, + "step": 66900 + }, + { + "epoch": 4.377494275433431, + "grad_norm": 0.9126048684120178, + "learning_rate": 7.427341438173192e-05, + "loss": 0.0738, + "step": 66910 + }, + { + "epoch": 4.378148511612692, + "grad_norm": 0.8502604365348816, + "learning_rate": 7.426538312179364e-05, + "loss": 0.0591, + "step": 66920 + }, + { + "epoch": 4.378802747791953, + "grad_norm": 1.0491002798080444, + "learning_rate": 7.425735104283502e-05, + "loss": 0.0642, + "step": 66930 + }, + { + "epoch": 4.379456983971213, + "grad_norm": 1.0379705429077148, + "learning_rate": 7.424931814512721e-05, + "loss": 0.0692, + "step": 66940 + }, + { + "epoch": 4.380111220150474, + "grad_norm": 0.9952113032341003, + "learning_rate": 7.42412844289413e-05, + "loss": 0.075, + "step": 66950 + }, + { + "epoch": 4.380765456329735, + "grad_norm": 0.9746718406677246, + "learning_rate": 7.423324989454847e-05, + "loss": 0.0598, + "step": 66960 + }, + { + "epoch": 4.381419692508996, + "grad_norm": 0.8205549716949463, + "learning_rate": 7.42252145422199e-05, + "loss": 0.0565, + "step": 66970 + }, + { + "epoch": 4.382073928688256, + "grad_norm": 0.9923220276832581, + "learning_rate": 7.421717837222682e-05, + "loss": 0.0717, + "step": 66980 + }, + { + "epoch": 4.382728164867517, + "grad_norm": 0.804681658744812, + "learning_rate": 7.420914138484045e-05, + "loss": 0.0645, + "step": 66990 + }, + { + "epoch": 4.383382401046778, + "grad_norm": 0.7509680390357971, + "learning_rate": 7.420110358033205e-05, + "loss": 0.0627, + "step": 67000 + }, + { + "epoch": 4.384036637226038, + "grad_norm": 0.7687981128692627, + "learning_rate": 7.419306495897295e-05, + "loss": 0.069, + "step": 67010 + }, + { + "epoch": 4.384690873405299, + "grad_norm": 0.8498753309249878, + "learning_rate": 7.418502552103446e-05, + "loss": 0.0653, + "step": 67020 + }, + { + "epoch": 4.38534510958456, + "grad_norm": 0.9675426483154297, + "learning_rate": 7.417698526678792e-05, + "loss": 0.0728, + "step": 67030 + }, + { + "epoch": 4.385999345763821, + "grad_norm": 0.9091965556144714, + "learning_rate": 7.416894419650473e-05, + "loss": 0.0641, + "step": 67040 + }, + { + "epoch": 4.386653581943081, + "grad_norm": 0.8700924515724182, + "learning_rate": 7.416090231045629e-05, + "loss": 0.057, + "step": 67050 + }, + { + "epoch": 4.387307818122342, + "grad_norm": 0.7336810231208801, + "learning_rate": 7.415285960891403e-05, + "loss": 0.0683, + "step": 67060 + }, + { + "epoch": 4.387962054301603, + "grad_norm": 0.9728383421897888, + "learning_rate": 7.414481609214941e-05, + "loss": 0.0596, + "step": 67070 + }, + { + "epoch": 4.388616290480863, + "grad_norm": 0.6819276809692383, + "learning_rate": 7.413677176043393e-05, + "loss": 0.0561, + "step": 67080 + }, + { + "epoch": 4.389270526660124, + "grad_norm": 0.9143689870834351, + "learning_rate": 7.41287266140391e-05, + "loss": 0.0592, + "step": 67090 + }, + { + "epoch": 4.389924762839385, + "grad_norm": 0.8440172672271729, + "learning_rate": 7.412068065323648e-05, + "loss": 0.0643, + "step": 67100 + }, + { + "epoch": 4.390578999018646, + "grad_norm": 0.8751739859580994, + "learning_rate": 7.411263387829761e-05, + "loss": 0.0652, + "step": 67110 + }, + { + "epoch": 4.391233235197906, + "grad_norm": 1.0761669874191284, + "learning_rate": 7.41045862894941e-05, + "loss": 0.0728, + "step": 67120 + }, + { + "epoch": 4.391887471377167, + "grad_norm": 0.7881830334663391, + "learning_rate": 7.409653788709762e-05, + "loss": 0.0655, + "step": 67130 + }, + { + "epoch": 4.392541707556428, + "grad_norm": 0.9639447331428528, + "learning_rate": 7.408848867137977e-05, + "loss": 0.0584, + "step": 67140 + }, + { + "epoch": 4.393195943735688, + "grad_norm": 0.8875806331634521, + "learning_rate": 7.408043864261225e-05, + "loss": 0.0745, + "step": 67150 + }, + { + "epoch": 4.393850179914949, + "grad_norm": 0.7383980751037598, + "learning_rate": 7.407238780106679e-05, + "loss": 0.0572, + "step": 67160 + }, + { + "epoch": 4.39450441609421, + "grad_norm": 0.848702609539032, + "learning_rate": 7.406433614701509e-05, + "loss": 0.0632, + "step": 67170 + }, + { + "epoch": 4.395158652273471, + "grad_norm": 0.7714468240737915, + "learning_rate": 7.405628368072894e-05, + "loss": 0.0595, + "step": 67180 + }, + { + "epoch": 4.395812888452731, + "grad_norm": 0.775048017501831, + "learning_rate": 7.404823040248013e-05, + "loss": 0.0612, + "step": 67190 + }, + { + "epoch": 4.396467124631992, + "grad_norm": 0.6783040165901184, + "learning_rate": 7.404017631254047e-05, + "loss": 0.0576, + "step": 67200 + }, + { + "epoch": 4.397121360811253, + "grad_norm": 0.9950321912765503, + "learning_rate": 7.403212141118182e-05, + "loss": 0.0645, + "step": 67210 + }, + { + "epoch": 4.397775596990513, + "grad_norm": 0.9404067993164062, + "learning_rate": 7.402406569867604e-05, + "loss": 0.0632, + "step": 67220 + }, + { + "epoch": 4.398429833169774, + "grad_norm": 0.8246940970420837, + "learning_rate": 7.401600917529504e-05, + "loss": 0.0574, + "step": 67230 + }, + { + "epoch": 4.399084069349035, + "grad_norm": 0.8735194802284241, + "learning_rate": 7.400795184131075e-05, + "loss": 0.0571, + "step": 67240 + }, + { + "epoch": 4.399738305528295, + "grad_norm": 0.8854332566261292, + "learning_rate": 7.399989369699512e-05, + "loss": 0.0599, + "step": 67250 + }, + { + "epoch": 4.400392541707556, + "grad_norm": 1.0150554180145264, + "learning_rate": 7.399183474262011e-05, + "loss": 0.0703, + "step": 67260 + }, + { + "epoch": 4.401046777886817, + "grad_norm": 1.0136165618896484, + "learning_rate": 7.398377497845779e-05, + "loss": 0.0587, + "step": 67270 + }, + { + "epoch": 4.401701014066078, + "grad_norm": 0.7560604214668274, + "learning_rate": 7.397571440478015e-05, + "loss": 0.0743, + "step": 67280 + }, + { + "epoch": 4.402355250245338, + "grad_norm": 0.8288975358009338, + "learning_rate": 7.396765302185928e-05, + "loss": 0.0663, + "step": 67290 + }, + { + "epoch": 4.403009486424599, + "grad_norm": 1.1244462728500366, + "learning_rate": 7.395959082996725e-05, + "loss": 0.0719, + "step": 67300 + }, + { + "epoch": 4.40366372260386, + "grad_norm": 0.8826910257339478, + "learning_rate": 7.39515278293762e-05, + "loss": 0.0693, + "step": 67310 + }, + { + "epoch": 4.404317958783121, + "grad_norm": 0.8879522681236267, + "learning_rate": 7.394346402035828e-05, + "loss": 0.0762, + "step": 67320 + }, + { + "epoch": 4.404972194962381, + "grad_norm": 0.8239995241165161, + "learning_rate": 7.393539940318563e-05, + "loss": 0.0548, + "step": 67330 + }, + { + "epoch": 4.405626431141642, + "grad_norm": 1.0602549314498901, + "learning_rate": 7.39273339781305e-05, + "loss": 0.0598, + "step": 67340 + }, + { + "epoch": 4.406280667320903, + "grad_norm": 1.0400123596191406, + "learning_rate": 7.391926774546509e-05, + "loss": 0.0723, + "step": 67350 + }, + { + "epoch": 4.406934903500163, + "grad_norm": 0.9202961325645447, + "learning_rate": 7.391120070546165e-05, + "loss": 0.0636, + "step": 67360 + }, + { + "epoch": 4.407589139679424, + "grad_norm": 0.8202659487724304, + "learning_rate": 7.39031328583925e-05, + "loss": 0.0651, + "step": 67370 + }, + { + "epoch": 4.408243375858685, + "grad_norm": 0.8003469109535217, + "learning_rate": 7.389506420452991e-05, + "loss": 0.0572, + "step": 67380 + }, + { + "epoch": 4.408897612037945, + "grad_norm": 0.8167177438735962, + "learning_rate": 7.388699474414624e-05, + "loss": 0.0746, + "step": 67390 + }, + { + "epoch": 4.409551848217206, + "grad_norm": 0.949032187461853, + "learning_rate": 7.387892447751387e-05, + "loss": 0.0654, + "step": 67400 + }, + { + "epoch": 4.410206084396467, + "grad_norm": 0.9453916549682617, + "learning_rate": 7.387085340490514e-05, + "loss": 0.0617, + "step": 67410 + }, + { + "epoch": 4.410860320575728, + "grad_norm": 1.0809295177459717, + "learning_rate": 7.386278152659254e-05, + "loss": 0.0699, + "step": 67420 + }, + { + "epoch": 4.411514556754988, + "grad_norm": 1.0627373456954956, + "learning_rate": 7.385470884284845e-05, + "loss": 0.0636, + "step": 67430 + }, + { + "epoch": 4.412168792934249, + "grad_norm": 0.8991785645484924, + "learning_rate": 7.384663535394541e-05, + "loss": 0.064, + "step": 67440 + }, + { + "epoch": 4.41282302911351, + "grad_norm": 0.9950599074363708, + "learning_rate": 7.383856106015585e-05, + "loss": 0.0611, + "step": 67450 + }, + { + "epoch": 4.41347726529277, + "grad_norm": 0.7597075700759888, + "learning_rate": 7.383048596175236e-05, + "loss": 0.0645, + "step": 67460 + }, + { + "epoch": 4.414131501472031, + "grad_norm": 0.9785023927688599, + "learning_rate": 7.382241005900745e-05, + "loss": 0.062, + "step": 67470 + }, + { + "epoch": 4.414785737651292, + "grad_norm": 0.8805351257324219, + "learning_rate": 7.381433335219374e-05, + "loss": 0.0663, + "step": 67480 + }, + { + "epoch": 4.415439973830553, + "grad_norm": 0.7586386203765869, + "learning_rate": 7.38062558415838e-05, + "loss": 0.0644, + "step": 67490 + }, + { + "epoch": 4.416094210009813, + "grad_norm": 0.8569243550300598, + "learning_rate": 7.379817752745033e-05, + "loss": 0.0602, + "step": 67500 + }, + { + "epoch": 4.416748446189074, + "grad_norm": 0.8760682344436646, + "learning_rate": 7.379009841006593e-05, + "loss": 0.0596, + "step": 67510 + }, + { + "epoch": 4.417402682368335, + "grad_norm": 0.9443554878234863, + "learning_rate": 7.378201848970332e-05, + "loss": 0.0648, + "step": 67520 + }, + { + "epoch": 4.418056918547595, + "grad_norm": 0.9215699434280396, + "learning_rate": 7.377393776663523e-05, + "loss": 0.0693, + "step": 67530 + }, + { + "epoch": 4.418711154726856, + "grad_norm": 1.1914721727371216, + "learning_rate": 7.376585624113437e-05, + "loss": 0.0702, + "step": 67540 + }, + { + "epoch": 4.419365390906117, + "grad_norm": 0.8695497512817383, + "learning_rate": 7.375777391347355e-05, + "loss": 0.0529, + "step": 67550 + }, + { + "epoch": 4.420019627085378, + "grad_norm": 0.8049673438072205, + "learning_rate": 7.374969078392555e-05, + "loss": 0.0646, + "step": 67560 + }, + { + "epoch": 4.420673863264638, + "grad_norm": 1.0911378860473633, + "learning_rate": 7.37416068527632e-05, + "loss": 0.0677, + "step": 67570 + }, + { + "epoch": 4.421328099443899, + "grad_norm": 0.8425750136375427, + "learning_rate": 7.373352212025935e-05, + "loss": 0.0604, + "step": 67580 + }, + { + "epoch": 4.42198233562316, + "grad_norm": 1.0606712102890015, + "learning_rate": 7.372543658668688e-05, + "loss": 0.0569, + "step": 67590 + }, + { + "epoch": 4.42263657180242, + "grad_norm": 0.8173533082008362, + "learning_rate": 7.371735025231871e-05, + "loss": 0.0595, + "step": 67600 + }, + { + "epoch": 4.423290807981681, + "grad_norm": 1.046075701713562, + "learning_rate": 7.370926311742776e-05, + "loss": 0.0632, + "step": 67610 + }, + { + "epoch": 4.423945044160942, + "grad_norm": 0.6317776441574097, + "learning_rate": 7.3701175182287e-05, + "loss": 0.0688, + "step": 67620 + }, + { + "epoch": 4.424599280340203, + "grad_norm": 0.8393598198890686, + "learning_rate": 7.369308644716944e-05, + "loss": 0.0633, + "step": 67630 + }, + { + "epoch": 4.425253516519463, + "grad_norm": 0.9631373286247253, + "learning_rate": 7.368499691234806e-05, + "loss": 0.0578, + "step": 67640 + }, + { + "epoch": 4.425907752698724, + "grad_norm": 0.829046905040741, + "learning_rate": 7.367690657809592e-05, + "loss": 0.0647, + "step": 67650 + }, + { + "epoch": 4.426561988877985, + "grad_norm": 0.9944875836372375, + "learning_rate": 7.366881544468609e-05, + "loss": 0.0598, + "step": 67660 + }, + { + "epoch": 4.427216225057245, + "grad_norm": 0.9320645928382874, + "learning_rate": 7.366072351239165e-05, + "loss": 0.0583, + "step": 67670 + }, + { + "epoch": 4.427870461236506, + "grad_norm": 1.0392950773239136, + "learning_rate": 7.365263078148575e-05, + "loss": 0.0623, + "step": 67680 + }, + { + "epoch": 4.428524697415767, + "grad_norm": 0.7760281562805176, + "learning_rate": 7.364453725224154e-05, + "loss": 0.0646, + "step": 67690 + }, + { + "epoch": 4.429178933595027, + "grad_norm": 0.8868238925933838, + "learning_rate": 7.363644292493218e-05, + "loss": 0.0664, + "step": 67700 + }, + { + "epoch": 4.429833169774288, + "grad_norm": 0.9134417772293091, + "learning_rate": 7.362834779983087e-05, + "loss": 0.0597, + "step": 67710 + }, + { + "epoch": 4.430487405953549, + "grad_norm": 0.7500336170196533, + "learning_rate": 7.362025187721086e-05, + "loss": 0.0573, + "step": 67720 + }, + { + "epoch": 4.43114164213281, + "grad_norm": 0.777053713798523, + "learning_rate": 7.361215515734541e-05, + "loss": 0.0696, + "step": 67730 + }, + { + "epoch": 4.43179587831207, + "grad_norm": 0.8513823747634888, + "learning_rate": 7.36040576405078e-05, + "loss": 0.0611, + "step": 67740 + }, + { + "epoch": 4.432450114491331, + "grad_norm": 0.7951275706291199, + "learning_rate": 7.359595932697134e-05, + "loss": 0.06, + "step": 67750 + }, + { + "epoch": 4.433104350670592, + "grad_norm": 0.9466954469680786, + "learning_rate": 7.358786021700936e-05, + "loss": 0.0645, + "step": 67760 + }, + { + "epoch": 4.433758586849853, + "grad_norm": 0.8108925819396973, + "learning_rate": 7.357976031089524e-05, + "loss": 0.0666, + "step": 67770 + }, + { + "epoch": 4.434412823029113, + "grad_norm": 0.8179001212120056, + "learning_rate": 7.357165960890237e-05, + "loss": 0.069, + "step": 67780 + }, + { + "epoch": 4.435067059208374, + "grad_norm": 0.8625308275222778, + "learning_rate": 7.356355811130419e-05, + "loss": 0.0601, + "step": 67790 + }, + { + "epoch": 4.435721295387635, + "grad_norm": 1.066897988319397, + "learning_rate": 7.35554558183741e-05, + "loss": 0.069, + "step": 67800 + }, + { + "epoch": 4.436375531566895, + "grad_norm": 0.8988897800445557, + "learning_rate": 7.35473527303856e-05, + "loss": 0.0809, + "step": 67810 + }, + { + "epoch": 4.437029767746156, + "grad_norm": 0.8011250495910645, + "learning_rate": 7.35392488476122e-05, + "loss": 0.0633, + "step": 67820 + }, + { + "epoch": 4.437684003925417, + "grad_norm": 0.9169591665267944, + "learning_rate": 7.353114417032742e-05, + "loss": 0.057, + "step": 67830 + }, + { + "epoch": 4.438338240104677, + "grad_norm": 0.9640666246414185, + "learning_rate": 7.35230386988048e-05, + "loss": 0.0624, + "step": 67840 + }, + { + "epoch": 4.438992476283938, + "grad_norm": 0.7338112592697144, + "learning_rate": 7.351493243331794e-05, + "loss": 0.0615, + "step": 67850 + }, + { + "epoch": 4.439646712463199, + "grad_norm": 0.8081340789794922, + "learning_rate": 7.350682537414044e-05, + "loss": 0.0662, + "step": 67860 + }, + { + "epoch": 4.44030094864246, + "grad_norm": 1.210657000541687, + "learning_rate": 7.349871752154593e-05, + "loss": 0.0784, + "step": 67870 + }, + { + "epoch": 4.44095518482172, + "grad_norm": 0.9144163727760315, + "learning_rate": 7.349060887580808e-05, + "loss": 0.0635, + "step": 67880 + }, + { + "epoch": 4.441609421000981, + "grad_norm": 0.9125617742538452, + "learning_rate": 7.348249943720058e-05, + "loss": 0.0627, + "step": 67890 + }, + { + "epoch": 4.442263657180242, + "grad_norm": 0.8639421463012695, + "learning_rate": 7.347438920599712e-05, + "loss": 0.0649, + "step": 67900 + }, + { + "epoch": 4.442917893359502, + "grad_norm": 0.795691728591919, + "learning_rate": 7.346627818247149e-05, + "loss": 0.0674, + "step": 67910 + }, + { + "epoch": 4.443572129538763, + "grad_norm": 0.9141677618026733, + "learning_rate": 7.34581663668974e-05, + "loss": 0.0657, + "step": 67920 + }, + { + "epoch": 4.444226365718024, + "grad_norm": 0.9653276205062866, + "learning_rate": 7.345005375954869e-05, + "loss": 0.0506, + "step": 67930 + }, + { + "epoch": 4.444880601897285, + "grad_norm": 1.0340138673782349, + "learning_rate": 7.344194036069916e-05, + "loss": 0.0768, + "step": 67940 + }, + { + "epoch": 4.445534838076545, + "grad_norm": 0.7893163561820984, + "learning_rate": 7.343382617062266e-05, + "loss": 0.0591, + "step": 67950 + }, + { + "epoch": 4.446189074255806, + "grad_norm": 0.840461015701294, + "learning_rate": 7.342571118959307e-05, + "loss": 0.059, + "step": 67960 + }, + { + "epoch": 4.446843310435067, + "grad_norm": 1.0385587215423584, + "learning_rate": 7.34175954178843e-05, + "loss": 0.0784, + "step": 67970 + }, + { + "epoch": 4.447497546614327, + "grad_norm": 0.9671325087547302, + "learning_rate": 7.340947885577028e-05, + "loss": 0.0583, + "step": 67980 + }, + { + "epoch": 4.448151782793588, + "grad_norm": 0.8433911800384521, + "learning_rate": 7.340136150352492e-05, + "loss": 0.0611, + "step": 67990 + }, + { + "epoch": 4.448806018972849, + "grad_norm": 0.8698466420173645, + "learning_rate": 7.339324336142226e-05, + "loss": 0.0668, + "step": 68000 + }, + { + "epoch": 4.44946025515211, + "grad_norm": 0.8934153318405151, + "learning_rate": 7.338512442973628e-05, + "loss": 0.0628, + "step": 68010 + }, + { + "epoch": 4.45011449133137, + "grad_norm": 0.9508333206176758, + "learning_rate": 7.337700470874103e-05, + "loss": 0.0582, + "step": 68020 + }, + { + "epoch": 4.450768727510631, + "grad_norm": 0.8118358254432678, + "learning_rate": 7.336888419871055e-05, + "loss": 0.0571, + "step": 68030 + }, + { + "epoch": 4.451422963689892, + "grad_norm": 0.7559012174606323, + "learning_rate": 7.336076289991895e-05, + "loss": 0.0645, + "step": 68040 + }, + { + "epoch": 4.452077199869152, + "grad_norm": 1.2024433612823486, + "learning_rate": 7.335264081264035e-05, + "loss": 0.0652, + "step": 68050 + }, + { + "epoch": 4.452731436048413, + "grad_norm": 0.8405249714851379, + "learning_rate": 7.334451793714885e-05, + "loss": 0.0679, + "step": 68060 + }, + { + "epoch": 4.453385672227674, + "grad_norm": 0.9205291271209717, + "learning_rate": 7.333639427371866e-05, + "loss": 0.0597, + "step": 68070 + }, + { + "epoch": 4.454039908406935, + "grad_norm": 1.0684325695037842, + "learning_rate": 7.332826982262395e-05, + "loss": 0.0664, + "step": 68080 + }, + { + "epoch": 4.454694144586195, + "grad_norm": 0.9805346727371216, + "learning_rate": 7.332014458413897e-05, + "loss": 0.0621, + "step": 68090 + }, + { + "epoch": 4.455348380765456, + "grad_norm": 0.7872211337089539, + "learning_rate": 7.331201855853794e-05, + "loss": 0.073, + "step": 68100 + }, + { + "epoch": 4.456002616944717, + "grad_norm": 0.7325934767723083, + "learning_rate": 7.330389174609515e-05, + "loss": 0.0638, + "step": 68110 + }, + { + "epoch": 4.456656853123977, + "grad_norm": 0.7671561241149902, + "learning_rate": 7.32957641470849e-05, + "loss": 0.0569, + "step": 68120 + }, + { + "epoch": 4.457311089303238, + "grad_norm": 0.9188522100448608, + "learning_rate": 7.328763576178151e-05, + "loss": 0.0645, + "step": 68130 + }, + { + "epoch": 4.457965325482499, + "grad_norm": 1.0198373794555664, + "learning_rate": 7.327950659045935e-05, + "loss": 0.0603, + "step": 68140 + }, + { + "epoch": 4.458619561661759, + "grad_norm": 0.9594274759292603, + "learning_rate": 7.327137663339276e-05, + "loss": 0.0574, + "step": 68150 + }, + { + "epoch": 4.45927379784102, + "grad_norm": 1.0565496683120728, + "learning_rate": 7.32632458908562e-05, + "loss": 0.0609, + "step": 68160 + }, + { + "epoch": 4.459928034020281, + "grad_norm": 0.9333295226097107, + "learning_rate": 7.325511436312408e-05, + "loss": 0.0769, + "step": 68170 + }, + { + "epoch": 4.460582270199542, + "grad_norm": 0.9997411966323853, + "learning_rate": 7.324698205047087e-05, + "loss": 0.0607, + "step": 68180 + }, + { + "epoch": 4.461236506378802, + "grad_norm": 1.1432136297225952, + "learning_rate": 7.323884895317102e-05, + "loss": 0.0573, + "step": 68190 + }, + { + "epoch": 4.461890742558063, + "grad_norm": 0.8411798477172852, + "learning_rate": 7.32307150714991e-05, + "loss": 0.0577, + "step": 68200 + }, + { + "epoch": 4.462544978737324, + "grad_norm": 0.8566927313804626, + "learning_rate": 7.32225804057296e-05, + "loss": 0.0617, + "step": 68210 + }, + { + "epoch": 4.463199214916585, + "grad_norm": 0.9064366221427917, + "learning_rate": 7.321444495613712e-05, + "loss": 0.0702, + "step": 68220 + }, + { + "epoch": 4.463853451095845, + "grad_norm": 0.6811468005180359, + "learning_rate": 7.320630872299624e-05, + "loss": 0.0588, + "step": 68230 + }, + { + "epoch": 4.464507687275106, + "grad_norm": 0.8559892177581787, + "learning_rate": 7.319817170658158e-05, + "loss": 0.0576, + "step": 68240 + }, + { + "epoch": 4.465161923454367, + "grad_norm": 0.7884718775749207, + "learning_rate": 7.319003390716779e-05, + "loss": 0.0656, + "step": 68250 + }, + { + "epoch": 4.465816159633627, + "grad_norm": 0.9382063150405884, + "learning_rate": 7.318189532502953e-05, + "loss": 0.0734, + "step": 68260 + }, + { + "epoch": 4.466470395812888, + "grad_norm": 0.9593857526779175, + "learning_rate": 7.317375596044152e-05, + "loss": 0.0686, + "step": 68270 + }, + { + "epoch": 4.467124631992149, + "grad_norm": 0.887768030166626, + "learning_rate": 7.316561581367845e-05, + "loss": 0.0603, + "step": 68280 + }, + { + "epoch": 4.4677788681714095, + "grad_norm": 0.8870118260383606, + "learning_rate": 7.315747488501509e-05, + "loss": 0.0611, + "step": 68290 + }, + { + "epoch": 4.46843310435067, + "grad_norm": 0.7766203284263611, + "learning_rate": 7.314933317472624e-05, + "loss": 0.0577, + "step": 68300 + }, + { + "epoch": 4.469087340529931, + "grad_norm": 0.6949660181999207, + "learning_rate": 7.314119068308668e-05, + "loss": 0.0602, + "step": 68310 + }, + { + "epoch": 4.469741576709192, + "grad_norm": 0.8689270615577698, + "learning_rate": 7.313304741037124e-05, + "loss": 0.0587, + "step": 68320 + }, + { + "epoch": 4.4703958128884524, + "grad_norm": 0.9906384348869324, + "learning_rate": 7.312490335685477e-05, + "loss": 0.0614, + "step": 68330 + }, + { + "epoch": 4.471050049067713, + "grad_norm": 0.8280085325241089, + "learning_rate": 7.311675852281218e-05, + "loss": 0.0673, + "step": 68340 + }, + { + "epoch": 4.471704285246974, + "grad_norm": 0.6465350985527039, + "learning_rate": 7.310861290851836e-05, + "loss": 0.0626, + "step": 68350 + }, + { + "epoch": 4.4723585214262345, + "grad_norm": 0.8401268720626831, + "learning_rate": 7.310046651424824e-05, + "loss": 0.0602, + "step": 68360 + }, + { + "epoch": 4.473012757605495, + "grad_norm": 0.9460334777832031, + "learning_rate": 7.309231934027681e-05, + "loss": 0.0631, + "step": 68370 + }, + { + "epoch": 4.473666993784756, + "grad_norm": 0.826092004776001, + "learning_rate": 7.308417138687902e-05, + "loss": 0.0529, + "step": 68380 + }, + { + "epoch": 4.474321229964017, + "grad_norm": 0.9380882978439331, + "learning_rate": 7.307602265432993e-05, + "loss": 0.0649, + "step": 68390 + }, + { + "epoch": 4.4749754661432775, + "grad_norm": 0.720436155796051, + "learning_rate": 7.306787314290455e-05, + "loss": 0.0636, + "step": 68400 + }, + { + "epoch": 4.475629702322538, + "grad_norm": 0.8422783017158508, + "learning_rate": 7.305972285287793e-05, + "loss": 0.0531, + "step": 68410 + }, + { + "epoch": 4.476283938501799, + "grad_norm": 0.8465076684951782, + "learning_rate": 7.30515717845252e-05, + "loss": 0.0689, + "step": 68420 + }, + { + "epoch": 4.4769381746810595, + "grad_norm": 0.7343236804008484, + "learning_rate": 7.304341993812149e-05, + "loss": 0.0649, + "step": 68430 + }, + { + "epoch": 4.4775924108603204, + "grad_norm": 1.0546813011169434, + "learning_rate": 7.30352673139419e-05, + "loss": 0.0678, + "step": 68440 + }, + { + "epoch": 4.478246647039581, + "grad_norm": 0.815166711807251, + "learning_rate": 7.302711391226163e-05, + "loss": 0.0632, + "step": 68450 + }, + { + "epoch": 4.478900883218842, + "grad_norm": 0.7304500341415405, + "learning_rate": 7.301895973335587e-05, + "loss": 0.0643, + "step": 68460 + }, + { + "epoch": 4.4795551193981025, + "grad_norm": 0.8881242871284485, + "learning_rate": 7.301080477749987e-05, + "loss": 0.0757, + "step": 68470 + }, + { + "epoch": 4.480209355577363, + "grad_norm": 0.8304605484008789, + "learning_rate": 7.300264904496883e-05, + "loss": 0.0634, + "step": 68480 + }, + { + "epoch": 4.480863591756624, + "grad_norm": 0.691174328327179, + "learning_rate": 7.299449253603808e-05, + "loss": 0.0649, + "step": 68490 + }, + { + "epoch": 4.4815178279358845, + "grad_norm": 0.9316902756690979, + "learning_rate": 7.29863352509829e-05, + "loss": 0.0646, + "step": 68500 + }, + { + "epoch": 4.4821720641151455, + "grad_norm": 0.9124321937561035, + "learning_rate": 7.297817719007861e-05, + "loss": 0.067, + "step": 68510 + }, + { + "epoch": 4.482826300294406, + "grad_norm": 1.0439180135726929, + "learning_rate": 7.297001835360058e-05, + "loss": 0.0721, + "step": 68520 + }, + { + "epoch": 4.483480536473667, + "grad_norm": 0.8428361415863037, + "learning_rate": 7.296185874182421e-05, + "loss": 0.0645, + "step": 68530 + }, + { + "epoch": 4.4841347726529275, + "grad_norm": 0.8357476592063904, + "learning_rate": 7.295369835502485e-05, + "loss": 0.0679, + "step": 68540 + }, + { + "epoch": 4.4847890088321885, + "grad_norm": 0.942311704158783, + "learning_rate": 7.2945537193478e-05, + "loss": 0.0604, + "step": 68550 + }, + { + "epoch": 4.485443245011449, + "grad_norm": 0.8047347068786621, + "learning_rate": 7.293737525745908e-05, + "loss": 0.0733, + "step": 68560 + }, + { + "epoch": 4.4860974811907095, + "grad_norm": 0.9718091487884521, + "learning_rate": 7.29292125472436e-05, + "loss": 0.0591, + "step": 68570 + }, + { + "epoch": 4.4867517173699705, + "grad_norm": 0.843663215637207, + "learning_rate": 7.292104906310707e-05, + "loss": 0.0692, + "step": 68580 + }, + { + "epoch": 4.487405953549231, + "grad_norm": 0.8390454649925232, + "learning_rate": 7.2912884805325e-05, + "loss": 0.0613, + "step": 68590 + }, + { + "epoch": 4.4880601897284915, + "grad_norm": 0.7371008396148682, + "learning_rate": 7.2904719774173e-05, + "loss": 0.057, + "step": 68600 + }, + { + "epoch": 4.4887144259077525, + "grad_norm": 1.1576838493347168, + "learning_rate": 7.289655396992661e-05, + "loss": 0.0655, + "step": 68610 + }, + { + "epoch": 4.4893686620870135, + "grad_norm": 1.0340911149978638, + "learning_rate": 7.28883873928615e-05, + "loss": 0.0626, + "step": 68620 + }, + { + "epoch": 4.490022898266274, + "grad_norm": 0.9780303835868835, + "learning_rate": 7.288022004325327e-05, + "loss": 0.0646, + "step": 68630 + }, + { + "epoch": 4.4906771344455345, + "grad_norm": 1.1134583950042725, + "learning_rate": 7.287205192137763e-05, + "loss": 0.0665, + "step": 68640 + }, + { + "epoch": 4.4913313706247955, + "grad_norm": 0.9058104157447815, + "learning_rate": 7.286388302751023e-05, + "loss": 0.0627, + "step": 68650 + }, + { + "epoch": 4.4919856068040565, + "grad_norm": 0.9129322171211243, + "learning_rate": 7.285571336192683e-05, + "loss": 0.0526, + "step": 68660 + }, + { + "epoch": 4.492639842983317, + "grad_norm": 0.7618893384933472, + "learning_rate": 7.284754292490314e-05, + "loss": 0.0584, + "step": 68670 + }, + { + "epoch": 4.4932940791625775, + "grad_norm": 0.9606322050094604, + "learning_rate": 7.283937171671498e-05, + "loss": 0.0634, + "step": 68680 + }, + { + "epoch": 4.4939483153418385, + "grad_norm": 0.9516724348068237, + "learning_rate": 7.283119973763813e-05, + "loss": 0.0665, + "step": 68690 + }, + { + "epoch": 4.494602551521099, + "grad_norm": 0.8869428038597107, + "learning_rate": 7.282302698794838e-05, + "loss": 0.0613, + "step": 68700 + }, + { + "epoch": 4.4952567877003595, + "grad_norm": 0.7321935892105103, + "learning_rate": 7.281485346792165e-05, + "loss": 0.0611, + "step": 68710 + }, + { + "epoch": 4.4959110238796205, + "grad_norm": 0.9069811701774597, + "learning_rate": 7.280667917783376e-05, + "loss": 0.0624, + "step": 68720 + }, + { + "epoch": 4.4965652600588815, + "grad_norm": 0.7614008188247681, + "learning_rate": 7.279850411796065e-05, + "loss": 0.0561, + "step": 68730 + }, + { + "epoch": 4.4972194962381415, + "grad_norm": 1.1257866621017456, + "learning_rate": 7.279032828857822e-05, + "loss": 0.0602, + "step": 68740 + }, + { + "epoch": 4.4978737324174025, + "grad_norm": 0.8876655101776123, + "learning_rate": 7.278215168996245e-05, + "loss": 0.0628, + "step": 68750 + }, + { + "epoch": 4.4985279685966635, + "grad_norm": 1.048601746559143, + "learning_rate": 7.27739743223893e-05, + "loss": 0.0645, + "step": 68760 + }, + { + "epoch": 4.4991822047759245, + "grad_norm": 0.7537106871604919, + "learning_rate": 7.27657961861348e-05, + "loss": 0.0627, + "step": 68770 + }, + { + "epoch": 4.4998364409551845, + "grad_norm": 0.9030201435089111, + "learning_rate": 7.275761728147497e-05, + "loss": 0.0707, + "step": 68780 + }, + { + "epoch": 4.5004906771344455, + "grad_norm": 0.799105703830719, + "learning_rate": 7.274943760868589e-05, + "loss": 0.0571, + "step": 68790 + }, + { + "epoch": 4.5011449133137065, + "grad_norm": 0.8551787734031677, + "learning_rate": 7.27412571680436e-05, + "loss": 0.0625, + "step": 68800 + }, + { + "epoch": 4.5017991494929674, + "grad_norm": 0.8719650506973267, + "learning_rate": 7.273307595982424e-05, + "loss": 0.0596, + "step": 68810 + }, + { + "epoch": 4.5024533856722275, + "grad_norm": 0.7927690148353577, + "learning_rate": 7.272489398430397e-05, + "loss": 0.0626, + "step": 68820 + }, + { + "epoch": 4.5031076218514885, + "grad_norm": 0.9607967138290405, + "learning_rate": 7.271671124175893e-05, + "loss": 0.0632, + "step": 68830 + }, + { + "epoch": 4.5037618580307495, + "grad_norm": 0.8309448957443237, + "learning_rate": 7.270852773246528e-05, + "loss": 0.0614, + "step": 68840 + }, + { + "epoch": 4.5044160942100095, + "grad_norm": 0.9089359045028687, + "learning_rate": 7.27003434566993e-05, + "loss": 0.0693, + "step": 68850 + }, + { + "epoch": 4.5050703303892705, + "grad_norm": 0.9338735342025757, + "learning_rate": 7.269215841473717e-05, + "loss": 0.0612, + "step": 68860 + }, + { + "epoch": 4.5057245665685315, + "grad_norm": 0.8226944804191589, + "learning_rate": 7.268397260685518e-05, + "loss": 0.068, + "step": 68870 + }, + { + "epoch": 4.506378802747792, + "grad_norm": 0.8187761902809143, + "learning_rate": 7.267578603332963e-05, + "loss": 0.0563, + "step": 68880 + }, + { + "epoch": 4.5070330389270525, + "grad_norm": 0.8850693702697754, + "learning_rate": 7.266759869443683e-05, + "loss": 0.0683, + "step": 68890 + }, + { + "epoch": 4.5076872751063135, + "grad_norm": 0.8132352232933044, + "learning_rate": 7.265941059045314e-05, + "loss": 0.0595, + "step": 68900 + }, + { + "epoch": 4.5083415112855745, + "grad_norm": 0.8258399367332458, + "learning_rate": 7.265122172165489e-05, + "loss": 0.0637, + "step": 68910 + }, + { + "epoch": 4.508995747464835, + "grad_norm": 0.7450704574584961, + "learning_rate": 7.264303208831854e-05, + "loss": 0.0644, + "step": 68920 + }, + { + "epoch": 4.5096499836440955, + "grad_norm": 0.7500464916229248, + "learning_rate": 7.263484169072044e-05, + "loss": 0.0666, + "step": 68930 + }, + { + "epoch": 4.5103042198233565, + "grad_norm": 0.8899851441383362, + "learning_rate": 7.262665052913707e-05, + "loss": 0.0559, + "step": 68940 + }, + { + "epoch": 4.510958456002617, + "grad_norm": 0.9518905282020569, + "learning_rate": 7.261845860384492e-05, + "loss": 0.0637, + "step": 68950 + }, + { + "epoch": 4.5116126921818775, + "grad_norm": 1.0154138803482056, + "learning_rate": 7.261026591512047e-05, + "loss": 0.0598, + "step": 68960 + }, + { + "epoch": 4.5122669283611385, + "grad_norm": 0.7621309757232666, + "learning_rate": 7.260207246324024e-05, + "loss": 0.0729, + "step": 68970 + }, + { + "epoch": 4.5129211645403995, + "grad_norm": 0.8984786868095398, + "learning_rate": 7.25938782484808e-05, + "loss": 0.0629, + "step": 68980 + }, + { + "epoch": 4.51357540071966, + "grad_norm": 0.9162548780441284, + "learning_rate": 7.25856832711187e-05, + "loss": 0.0639, + "step": 68990 + }, + { + "epoch": 4.5142296368989205, + "grad_norm": 0.844033420085907, + "learning_rate": 7.257748753143057e-05, + "loss": 0.0614, + "step": 69000 + }, + { + "epoch": 4.5148838730781815, + "grad_norm": 0.8666905760765076, + "learning_rate": 7.256929102969302e-05, + "loss": 0.0638, + "step": 69010 + }, + { + "epoch": 4.515538109257442, + "grad_norm": 0.9945257902145386, + "learning_rate": 7.256109376618271e-05, + "loss": 0.0602, + "step": 69020 + }, + { + "epoch": 4.516192345436703, + "grad_norm": 1.0383166074752808, + "learning_rate": 7.25528957411763e-05, + "loss": 0.0587, + "step": 69030 + }, + { + "epoch": 4.5168465816159635, + "grad_norm": 0.9124101400375366, + "learning_rate": 7.254469695495054e-05, + "loss": 0.0667, + "step": 69040 + }, + { + "epoch": 4.517500817795224, + "grad_norm": 0.7469764947891235, + "learning_rate": 7.253649740778212e-05, + "loss": 0.0641, + "step": 69050 + }, + { + "epoch": 4.518155053974485, + "grad_norm": 0.8720923662185669, + "learning_rate": 7.25282970999478e-05, + "loss": 0.0733, + "step": 69060 + }, + { + "epoch": 4.5188092901537456, + "grad_norm": 0.8793687224388123, + "learning_rate": 7.252009603172436e-05, + "loss": 0.0716, + "step": 69070 + }, + { + "epoch": 4.5194635263330065, + "grad_norm": 0.8005584478378296, + "learning_rate": 7.251189420338865e-05, + "loss": 0.0605, + "step": 69080 + }, + { + "epoch": 4.520117762512267, + "grad_norm": 0.868381142616272, + "learning_rate": 7.250369161521746e-05, + "loss": 0.0644, + "step": 69090 + }, + { + "epoch": 4.520771998691528, + "grad_norm": 0.8739510774612427, + "learning_rate": 7.249548826748764e-05, + "loss": 0.067, + "step": 69100 + }, + { + "epoch": 4.5214262348707885, + "grad_norm": 0.9072023034095764, + "learning_rate": 7.248728416047611e-05, + "loss": 0.0612, + "step": 69110 + }, + { + "epoch": 4.5220804710500495, + "grad_norm": 0.9014410972595215, + "learning_rate": 7.247907929445976e-05, + "loss": 0.0686, + "step": 69120 + }, + { + "epoch": 4.52273470722931, + "grad_norm": 0.883929967880249, + "learning_rate": 7.247087366971554e-05, + "loss": 0.054, + "step": 69130 + }, + { + "epoch": 4.523388943408571, + "grad_norm": 1.133589744567871, + "learning_rate": 7.24626672865204e-05, + "loss": 0.0698, + "step": 69140 + }, + { + "epoch": 4.5240431795878315, + "grad_norm": 0.7994188666343689, + "learning_rate": 7.245446014515132e-05, + "loss": 0.0606, + "step": 69150 + }, + { + "epoch": 4.524697415767092, + "grad_norm": 1.077908992767334, + "learning_rate": 7.244625224588533e-05, + "loss": 0.0638, + "step": 69160 + }, + { + "epoch": 4.525351651946353, + "grad_norm": 0.7507327795028687, + "learning_rate": 7.243804358899943e-05, + "loss": 0.0579, + "step": 69170 + }, + { + "epoch": 4.5260058881256136, + "grad_norm": 0.8101299405097961, + "learning_rate": 7.242983417477076e-05, + "loss": 0.0637, + "step": 69180 + }, + { + "epoch": 4.526660124304874, + "grad_norm": 0.8456845879554749, + "learning_rate": 7.242162400347634e-05, + "loss": 0.0586, + "step": 69190 + }, + { + "epoch": 4.527314360484135, + "grad_norm": 0.812188982963562, + "learning_rate": 7.24134130753933e-05, + "loss": 0.0627, + "step": 69200 + }, + { + "epoch": 4.527968596663396, + "grad_norm": 1.0019590854644775, + "learning_rate": 7.24052013907988e-05, + "loss": 0.0598, + "step": 69210 + }, + { + "epoch": 4.5286228328426565, + "grad_norm": 1.0545828342437744, + "learning_rate": 7.239698894996997e-05, + "loss": 0.062, + "step": 69220 + }, + { + "epoch": 4.529277069021917, + "grad_norm": 0.7775808572769165, + "learning_rate": 7.238877575318405e-05, + "loss": 0.0578, + "step": 69230 + }, + { + "epoch": 4.529931305201178, + "grad_norm": 0.9661263227462769, + "learning_rate": 7.238056180071823e-05, + "loss": 0.0685, + "step": 69240 + }, + { + "epoch": 4.530585541380439, + "grad_norm": 0.8217857480049133, + "learning_rate": 7.237234709284975e-05, + "loss": 0.0544, + "step": 69250 + }, + { + "epoch": 4.5312397775596995, + "grad_norm": 0.7686980962753296, + "learning_rate": 7.236413162985587e-05, + "loss": 0.06, + "step": 69260 + }, + { + "epoch": 4.53189401373896, + "grad_norm": 0.7001199722290039, + "learning_rate": 7.235591541201391e-05, + "loss": 0.0599, + "step": 69270 + }, + { + "epoch": 4.532548249918221, + "grad_norm": 0.6666156649589539, + "learning_rate": 7.234769843960116e-05, + "loss": 0.0668, + "step": 69280 + }, + { + "epoch": 4.5332024860974816, + "grad_norm": 0.9138648509979248, + "learning_rate": 7.233948071289499e-05, + "loss": 0.06, + "step": 69290 + }, + { + "epoch": 4.533856722276742, + "grad_norm": 1.0828460454940796, + "learning_rate": 7.233126223217275e-05, + "loss": 0.0742, + "step": 69300 + }, + { + "epoch": 4.534510958456003, + "grad_norm": 0.9059333801269531, + "learning_rate": 7.232304299771187e-05, + "loss": 0.0619, + "step": 69310 + }, + { + "epoch": 4.535165194635264, + "grad_norm": 0.7670040130615234, + "learning_rate": 7.231482300978971e-05, + "loss": 0.0635, + "step": 69320 + }, + { + "epoch": 4.535819430814524, + "grad_norm": 0.7467994689941406, + "learning_rate": 7.230660226868376e-05, + "loss": 0.0586, + "step": 69330 + }, + { + "epoch": 4.536473666993785, + "grad_norm": 0.8463179469108582, + "learning_rate": 7.22983807746715e-05, + "loss": 0.0564, + "step": 69340 + }, + { + "epoch": 4.537127903173046, + "grad_norm": 0.6910958290100098, + "learning_rate": 7.22901585280304e-05, + "loss": 0.0637, + "step": 69350 + }, + { + "epoch": 4.537782139352307, + "grad_norm": 0.78616863489151, + "learning_rate": 7.228193552903798e-05, + "loss": 0.059, + "step": 69360 + }, + { + "epoch": 4.538436375531567, + "grad_norm": 0.934597373008728, + "learning_rate": 7.227371177797181e-05, + "loss": 0.0633, + "step": 69370 + }, + { + "epoch": 4.539090611710828, + "grad_norm": 0.9315169453620911, + "learning_rate": 7.226548727510945e-05, + "loss": 0.0573, + "step": 69380 + }, + { + "epoch": 4.539744847890089, + "grad_norm": 1.0546493530273438, + "learning_rate": 7.22572620207285e-05, + "loss": 0.069, + "step": 69390 + }, + { + "epoch": 4.540399084069349, + "grad_norm": 0.7650620937347412, + "learning_rate": 7.224903601510658e-05, + "loss": 0.0564, + "step": 69400 + }, + { + "epoch": 4.54105332024861, + "grad_norm": 0.7123848795890808, + "learning_rate": 7.224080925852136e-05, + "loss": 0.065, + "step": 69410 + }, + { + "epoch": 4.541707556427871, + "grad_norm": 0.8295034766197205, + "learning_rate": 7.22325817512505e-05, + "loss": 0.0546, + "step": 69420 + }, + { + "epoch": 4.542361792607132, + "grad_norm": 0.9228529930114746, + "learning_rate": 7.222435349357169e-05, + "loss": 0.0676, + "step": 69430 + }, + { + "epoch": 4.543016028786392, + "grad_norm": 0.7449051141738892, + "learning_rate": 7.221612448576266e-05, + "loss": 0.0575, + "step": 69440 + }, + { + "epoch": 4.543670264965653, + "grad_norm": 0.832432210445404, + "learning_rate": 7.220789472810115e-05, + "loss": 0.0595, + "step": 69450 + }, + { + "epoch": 4.544324501144914, + "grad_norm": 1.0171115398406982, + "learning_rate": 7.219966422086497e-05, + "loss": 0.0561, + "step": 69460 + }, + { + "epoch": 4.544978737324174, + "grad_norm": 0.8047521710395813, + "learning_rate": 7.219143296433191e-05, + "loss": 0.0636, + "step": 69470 + }, + { + "epoch": 4.545632973503435, + "grad_norm": 0.7108994126319885, + "learning_rate": 7.218320095877976e-05, + "loss": 0.0626, + "step": 69480 + }, + { + "epoch": 4.546287209682696, + "grad_norm": 0.9195765256881714, + "learning_rate": 7.217496820448642e-05, + "loss": 0.0578, + "step": 69490 + }, + { + "epoch": 4.546941445861956, + "grad_norm": 1.0596858263015747, + "learning_rate": 7.216673470172975e-05, + "loss": 0.0603, + "step": 69500 + }, + { + "epoch": 4.547595682041217, + "grad_norm": 0.7993507385253906, + "learning_rate": 7.215850045078765e-05, + "loss": 0.0695, + "step": 69510 + }, + { + "epoch": 4.548249918220478, + "grad_norm": 1.125396728515625, + "learning_rate": 7.215026545193802e-05, + "loss": 0.0666, + "step": 69520 + }, + { + "epoch": 4.548904154399739, + "grad_norm": 0.8783840537071228, + "learning_rate": 7.214202970545888e-05, + "loss": 0.0567, + "step": 69530 + }, + { + "epoch": 4.549558390578999, + "grad_norm": 0.7901387214660645, + "learning_rate": 7.213379321162814e-05, + "loss": 0.0585, + "step": 69540 + }, + { + "epoch": 4.55021262675826, + "grad_norm": 0.8331219553947449, + "learning_rate": 7.212555597072384e-05, + "loss": 0.0596, + "step": 69550 + }, + { + "epoch": 4.550866862937521, + "grad_norm": 0.8178802132606506, + "learning_rate": 7.2117317983024e-05, + "loss": 0.0582, + "step": 69560 + }, + { + "epoch": 4.551521099116782, + "grad_norm": 0.7966588139533997, + "learning_rate": 7.210907924880668e-05, + "loss": 0.0586, + "step": 69570 + }, + { + "epoch": 4.552175335296042, + "grad_norm": 0.7240191698074341, + "learning_rate": 7.210083976834994e-05, + "loss": 0.0554, + "step": 69580 + }, + { + "epoch": 4.552829571475303, + "grad_norm": 0.6991345882415771, + "learning_rate": 7.20925995419319e-05, + "loss": 0.0614, + "step": 69590 + }, + { + "epoch": 4.553483807654564, + "grad_norm": 0.9733833074569702, + "learning_rate": 7.208435856983068e-05, + "loss": 0.0631, + "step": 69600 + }, + { + "epoch": 4.554138043833824, + "grad_norm": 0.8843761086463928, + "learning_rate": 7.207611685232447e-05, + "loss": 0.0647, + "step": 69610 + }, + { + "epoch": 4.554792280013085, + "grad_norm": 1.0091403722763062, + "learning_rate": 7.206787438969138e-05, + "loss": 0.0581, + "step": 69620 + }, + { + "epoch": 4.555446516192346, + "grad_norm": 1.0898354053497314, + "learning_rate": 7.205963118220967e-05, + "loss": 0.0744, + "step": 69630 + }, + { + "epoch": 4.556100752371606, + "grad_norm": 0.7941951155662537, + "learning_rate": 7.205138723015756e-05, + "loss": 0.0728, + "step": 69640 + }, + { + "epoch": 4.556754988550867, + "grad_norm": 1.1205252408981323, + "learning_rate": 7.204314253381329e-05, + "loss": 0.0603, + "step": 69650 + }, + { + "epoch": 4.557409224730128, + "grad_norm": 0.9061139225959778, + "learning_rate": 7.203489709345515e-05, + "loss": 0.0613, + "step": 69660 + }, + { + "epoch": 4.558063460909389, + "grad_norm": 0.958292543888092, + "learning_rate": 7.202665090936145e-05, + "loss": 0.0667, + "step": 69670 + }, + { + "epoch": 4.558717697088649, + "grad_norm": 0.6560031771659851, + "learning_rate": 7.201840398181052e-05, + "loss": 0.0674, + "step": 69680 + }, + { + "epoch": 4.55937193326791, + "grad_norm": 0.8264176845550537, + "learning_rate": 7.201015631108071e-05, + "loss": 0.0682, + "step": 69690 + }, + { + "epoch": 4.560026169447171, + "grad_norm": 1.0350055694580078, + "learning_rate": 7.200190789745038e-05, + "loss": 0.061, + "step": 69700 + }, + { + "epoch": 4.560680405626432, + "grad_norm": 0.8044286370277405, + "learning_rate": 7.199365874119796e-05, + "loss": 0.0655, + "step": 69710 + }, + { + "epoch": 4.561334641805692, + "grad_norm": 0.9388670325279236, + "learning_rate": 7.198540884260189e-05, + "loss": 0.06, + "step": 69720 + }, + { + "epoch": 4.561988877984953, + "grad_norm": 0.7581605315208435, + "learning_rate": 7.197715820194062e-05, + "loss": 0.0535, + "step": 69730 + }, + { + "epoch": 4.562643114164214, + "grad_norm": 1.0746386051177979, + "learning_rate": 7.19689068194926e-05, + "loss": 0.0575, + "step": 69740 + }, + { + "epoch": 4.563297350343474, + "grad_norm": 0.7626873254776001, + "learning_rate": 7.196065469553637e-05, + "loss": 0.0616, + "step": 69750 + }, + { + "epoch": 4.563951586522735, + "grad_norm": 0.9289671182632446, + "learning_rate": 7.195240183035045e-05, + "loss": 0.0586, + "step": 69760 + }, + { + "epoch": 4.564605822701996, + "grad_norm": 0.8049091100692749, + "learning_rate": 7.194414822421341e-05, + "loss": 0.0527, + "step": 69770 + }, + { + "epoch": 4.565260058881256, + "grad_norm": 0.7008090615272522, + "learning_rate": 7.19358938774038e-05, + "loss": 0.0763, + "step": 69780 + }, + { + "epoch": 4.565914295060517, + "grad_norm": 1.1009507179260254, + "learning_rate": 7.192763879020025e-05, + "loss": 0.0732, + "step": 69790 + }, + { + "epoch": 4.566568531239778, + "grad_norm": 0.9115166068077087, + "learning_rate": 7.191938296288138e-05, + "loss": 0.0635, + "step": 69800 + }, + { + "epoch": 4.567222767419039, + "grad_norm": 0.9855472445487976, + "learning_rate": 7.191112639572585e-05, + "loss": 0.0594, + "step": 69810 + }, + { + "epoch": 4.567877003598299, + "grad_norm": 1.0158358812332153, + "learning_rate": 7.190286908901234e-05, + "loss": 0.0598, + "step": 69820 + }, + { + "epoch": 4.56853123977756, + "grad_norm": 0.7461121082305908, + "learning_rate": 7.189461104301955e-05, + "loss": 0.0578, + "step": 69830 + }, + { + "epoch": 4.569185475956821, + "grad_norm": 0.8417521119117737, + "learning_rate": 7.188635225802622e-05, + "loss": 0.0617, + "step": 69840 + }, + { + "epoch": 4.569839712136081, + "grad_norm": 0.9156501293182373, + "learning_rate": 7.187809273431112e-05, + "loss": 0.0567, + "step": 69850 + }, + { + "epoch": 4.570493948315342, + "grad_norm": 1.0383937358856201, + "learning_rate": 7.186983247215299e-05, + "loss": 0.0659, + "step": 69860 + }, + { + "epoch": 4.571148184494603, + "grad_norm": 0.6928499937057495, + "learning_rate": 7.186157147183067e-05, + "loss": 0.066, + "step": 69870 + }, + { + "epoch": 4.571802420673864, + "grad_norm": 0.7794655561447144, + "learning_rate": 7.185330973362298e-05, + "loss": 0.0602, + "step": 69880 + }, + { + "epoch": 4.572456656853124, + "grad_norm": 0.9522107839584351, + "learning_rate": 7.184504725780876e-05, + "loss": 0.0602, + "step": 69890 + }, + { + "epoch": 4.573110893032385, + "grad_norm": 0.9727455377578735, + "learning_rate": 7.183678404466691e-05, + "loss": 0.061, + "step": 69900 + }, + { + "epoch": 4.573765129211646, + "grad_norm": 0.8245759606361389, + "learning_rate": 7.182852009447633e-05, + "loss": 0.07, + "step": 69910 + }, + { + "epoch": 4.574419365390906, + "grad_norm": 0.9080676436424255, + "learning_rate": 7.182025540751595e-05, + "loss": 0.0647, + "step": 69920 + }, + { + "epoch": 4.575073601570167, + "grad_norm": 0.8655606508255005, + "learning_rate": 7.181198998406473e-05, + "loss": 0.0543, + "step": 69930 + }, + { + "epoch": 4.575727837749428, + "grad_norm": 1.0954169034957886, + "learning_rate": 7.180372382440164e-05, + "loss": 0.064, + "step": 69940 + }, + { + "epoch": 4.576382073928688, + "grad_norm": 0.817754328250885, + "learning_rate": 7.179545692880569e-05, + "loss": 0.0651, + "step": 69950 + }, + { + "epoch": 4.577036310107949, + "grad_norm": 0.6452881693840027, + "learning_rate": 7.17871892975559e-05, + "loss": 0.0624, + "step": 69960 + }, + { + "epoch": 4.57769054628721, + "grad_norm": 0.8822810053825378, + "learning_rate": 7.177892093093134e-05, + "loss": 0.0512, + "step": 69970 + }, + { + "epoch": 4.578344782466471, + "grad_norm": 0.7924016714096069, + "learning_rate": 7.17706518292111e-05, + "loss": 0.0663, + "step": 69980 + }, + { + "epoch": 4.578999018645731, + "grad_norm": 1.0157910585403442, + "learning_rate": 7.176238199267424e-05, + "loss": 0.0615, + "step": 69990 + }, + { + "epoch": 4.579653254824992, + "grad_norm": 0.6557690501213074, + "learning_rate": 7.175411142159991e-05, + "loss": 0.0521, + "step": 70000 + }, + { + "epoch": 4.580307491004253, + "grad_norm": 0.7803752422332764, + "learning_rate": 7.174584011626728e-05, + "loss": 0.0611, + "step": 70010 + }, + { + "epoch": 4.580961727183514, + "grad_norm": 0.889769971370697, + "learning_rate": 7.17375680769555e-05, + "loss": 0.0615, + "step": 70020 + }, + { + "epoch": 4.581615963362774, + "grad_norm": 0.706065833568573, + "learning_rate": 7.17292953039438e-05, + "loss": 0.0589, + "step": 70030 + }, + { + "epoch": 4.582270199542035, + "grad_norm": 0.9106086492538452, + "learning_rate": 7.172102179751141e-05, + "loss": 0.0638, + "step": 70040 + }, + { + "epoch": 4.582924435721296, + "grad_norm": 0.7716435790061951, + "learning_rate": 7.171274755793756e-05, + "loss": 0.0571, + "step": 70050 + }, + { + "epoch": 4.583578671900556, + "grad_norm": 0.8700406551361084, + "learning_rate": 7.170447258550152e-05, + "loss": 0.0659, + "step": 70060 + }, + { + "epoch": 4.584232908079817, + "grad_norm": 0.902850329875946, + "learning_rate": 7.169619688048262e-05, + "loss": 0.06, + "step": 70070 + }, + { + "epoch": 4.584887144259078, + "grad_norm": 1.30494225025177, + "learning_rate": 7.168792044316017e-05, + "loss": 0.0595, + "step": 70080 + }, + { + "epoch": 4.585541380438338, + "grad_norm": 0.878594160079956, + "learning_rate": 7.167964327381355e-05, + "loss": 0.0602, + "step": 70090 + }, + { + "epoch": 4.586195616617599, + "grad_norm": 1.009494662284851, + "learning_rate": 7.167136537272208e-05, + "loss": 0.0622, + "step": 70100 + }, + { + "epoch": 4.58684985279686, + "grad_norm": 0.858677864074707, + "learning_rate": 7.166308674016522e-05, + "loss": 0.0642, + "step": 70110 + }, + { + "epoch": 4.587504088976121, + "grad_norm": 0.8348826169967651, + "learning_rate": 7.165480737642234e-05, + "loss": 0.0596, + "step": 70120 + }, + { + "epoch": 4.588158325155381, + "grad_norm": 0.9198652505874634, + "learning_rate": 7.164652728177294e-05, + "loss": 0.0584, + "step": 70130 + }, + { + "epoch": 4.588812561334642, + "grad_norm": 0.8578175902366638, + "learning_rate": 7.163824645649648e-05, + "loss": 0.0597, + "step": 70140 + }, + { + "epoch": 4.589466797513903, + "grad_norm": 0.8815035223960876, + "learning_rate": 7.162996490087243e-05, + "loss": 0.0548, + "step": 70150 + }, + { + "epoch": 4.590121033693164, + "grad_norm": 0.8675861954689026, + "learning_rate": 7.162168261518036e-05, + "loss": 0.0571, + "step": 70160 + }, + { + "epoch": 4.590775269872424, + "grad_norm": 0.9907076358795166, + "learning_rate": 7.161339959969979e-05, + "loss": 0.0618, + "step": 70170 + }, + { + "epoch": 4.591429506051685, + "grad_norm": 0.8220086693763733, + "learning_rate": 7.160511585471031e-05, + "loss": 0.0592, + "step": 70180 + }, + { + "epoch": 4.592083742230946, + "grad_norm": 0.9038228988647461, + "learning_rate": 7.159683138049148e-05, + "loss": 0.0595, + "step": 70190 + }, + { + "epoch": 4.592737978410206, + "grad_norm": 0.9181106686592102, + "learning_rate": 7.158854617732297e-05, + "loss": 0.0556, + "step": 70200 + }, + { + "epoch": 4.593392214589467, + "grad_norm": 0.8551499843597412, + "learning_rate": 7.158026024548441e-05, + "loss": 0.0594, + "step": 70210 + }, + { + "epoch": 4.594046450768728, + "grad_norm": 0.9677573442459106, + "learning_rate": 7.157197358525546e-05, + "loss": 0.06, + "step": 70220 + }, + { + "epoch": 4.594700686947988, + "grad_norm": 0.8928607702255249, + "learning_rate": 7.156368619691582e-05, + "loss": 0.0647, + "step": 70230 + }, + { + "epoch": 4.595354923127249, + "grad_norm": 0.8647032976150513, + "learning_rate": 7.155539808074525e-05, + "loss": 0.0641, + "step": 70240 + }, + { + "epoch": 4.59600915930651, + "grad_norm": 0.9325335621833801, + "learning_rate": 7.154710923702345e-05, + "loss": 0.0613, + "step": 70250 + }, + { + "epoch": 4.596663395485771, + "grad_norm": 0.8028609156608582, + "learning_rate": 7.153881966603019e-05, + "loss": 0.0595, + "step": 70260 + }, + { + "epoch": 4.597317631665031, + "grad_norm": 0.9629660248756409, + "learning_rate": 7.153052936804529e-05, + "loss": 0.0555, + "step": 70270 + }, + { + "epoch": 4.597971867844292, + "grad_norm": 0.9992688298225403, + "learning_rate": 7.152223834334855e-05, + "loss": 0.0691, + "step": 70280 + }, + { + "epoch": 4.598626104023553, + "grad_norm": 0.7620879411697388, + "learning_rate": 7.151394659221984e-05, + "loss": 0.0601, + "step": 70290 + }, + { + "epoch": 4.599280340202813, + "grad_norm": 0.883135199546814, + "learning_rate": 7.150565411493899e-05, + "loss": 0.0554, + "step": 70300 + }, + { + "epoch": 4.599934576382074, + "grad_norm": 0.8866992592811584, + "learning_rate": 7.149736091178593e-05, + "loss": 0.0674, + "step": 70310 + }, + { + "epoch": 4.600588812561335, + "grad_norm": 1.0410959720611572, + "learning_rate": 7.148906698304054e-05, + "loss": 0.0634, + "step": 70320 + }, + { + "epoch": 4.601243048740596, + "grad_norm": 0.7747849225997925, + "learning_rate": 7.14807723289828e-05, + "loss": 0.0543, + "step": 70330 + }, + { + "epoch": 4.601897284919856, + "grad_norm": 0.9914830923080444, + "learning_rate": 7.147247694989265e-05, + "loss": 0.058, + "step": 70340 + }, + { + "epoch": 4.602551521099117, + "grad_norm": 0.8402534127235413, + "learning_rate": 7.146418084605008e-05, + "loss": 0.0634, + "step": 70350 + }, + { + "epoch": 4.603205757278378, + "grad_norm": 0.840990424156189, + "learning_rate": 7.145588401773513e-05, + "loss": 0.0559, + "step": 70360 + }, + { + "epoch": 4.603859993457638, + "grad_norm": 0.8721504807472229, + "learning_rate": 7.144758646522782e-05, + "loss": 0.0638, + "step": 70370 + }, + { + "epoch": 4.604514229636899, + "grad_norm": 0.8527781963348389, + "learning_rate": 7.143928818880823e-05, + "loss": 0.0532, + "step": 70380 + }, + { + "epoch": 4.60516846581616, + "grad_norm": 1.0726641416549683, + "learning_rate": 7.143098918875643e-05, + "loss": 0.0602, + "step": 70390 + }, + { + "epoch": 4.60582270199542, + "grad_norm": 1.0144989490509033, + "learning_rate": 7.142268946535254e-05, + "loss": 0.0642, + "step": 70400 + }, + { + "epoch": 4.606476938174681, + "grad_norm": 1.0396263599395752, + "learning_rate": 7.141438901887669e-05, + "loss": 0.0704, + "step": 70410 + }, + { + "epoch": 4.607131174353942, + "grad_norm": 0.7926657199859619, + "learning_rate": 7.140608784960904e-05, + "loss": 0.0692, + "step": 70420 + }, + { + "epoch": 4.607785410533203, + "grad_norm": 1.1654900312423706, + "learning_rate": 7.13977859578298e-05, + "loss": 0.0588, + "step": 70430 + }, + { + "epoch": 4.608439646712463, + "grad_norm": 0.9347232580184937, + "learning_rate": 7.138948334381917e-05, + "loss": 0.0604, + "step": 70440 + }, + { + "epoch": 4.609093882891724, + "grad_norm": 1.0925201177597046, + "learning_rate": 7.138118000785736e-05, + "loss": 0.0708, + "step": 70450 + }, + { + "epoch": 4.609748119070985, + "grad_norm": 0.9445613026618958, + "learning_rate": 7.137287595022467e-05, + "loss": 0.0523, + "step": 70460 + }, + { + "epoch": 4.610402355250246, + "grad_norm": 0.795276939868927, + "learning_rate": 7.136457117120136e-05, + "loss": 0.0655, + "step": 70470 + }, + { + "epoch": 4.611056591429506, + "grad_norm": 0.9454329609870911, + "learning_rate": 7.135626567106775e-05, + "loss": 0.0694, + "step": 70480 + }, + { + "epoch": 4.611710827608767, + "grad_norm": 0.8667651414871216, + "learning_rate": 7.134795945010416e-05, + "loss": 0.0673, + "step": 70490 + }, + { + "epoch": 4.612365063788028, + "grad_norm": 0.7597100734710693, + "learning_rate": 7.133965250859094e-05, + "loss": 0.0647, + "step": 70500 + }, + { + "epoch": 4.613019299967288, + "grad_norm": 1.0633947849273682, + "learning_rate": 7.13313448468085e-05, + "loss": 0.0676, + "step": 70510 + }, + { + "epoch": 4.613673536146549, + "grad_norm": 0.9230862259864807, + "learning_rate": 7.13230364650372e-05, + "loss": 0.0718, + "step": 70520 + }, + { + "epoch": 4.61432777232581, + "grad_norm": 0.8642282485961914, + "learning_rate": 7.131472736355754e-05, + "loss": 0.0678, + "step": 70530 + }, + { + "epoch": 4.61498200850507, + "grad_norm": 0.8637971878051758, + "learning_rate": 7.130641754264991e-05, + "loss": 0.0718, + "step": 70540 + }, + { + "epoch": 4.615636244684331, + "grad_norm": 0.943467915058136, + "learning_rate": 7.12981070025948e-05, + "loss": 0.0527, + "step": 70550 + }, + { + "epoch": 4.616290480863592, + "grad_norm": 0.843879222869873, + "learning_rate": 7.128979574367272e-05, + "loss": 0.0649, + "step": 70560 + }, + { + "epoch": 4.616944717042853, + "grad_norm": 0.9107741713523865, + "learning_rate": 7.128148376616422e-05, + "loss": 0.0617, + "step": 70570 + }, + { + "epoch": 4.617598953222113, + "grad_norm": 0.8617720007896423, + "learning_rate": 7.127317107034981e-05, + "loss": 0.061, + "step": 70580 + }, + { + "epoch": 4.618253189401374, + "grad_norm": 0.8260530829429626, + "learning_rate": 7.12648576565101e-05, + "loss": 0.0536, + "step": 70590 + }, + { + "epoch": 4.618907425580635, + "grad_norm": 0.9773645401000977, + "learning_rate": 7.125654352492567e-05, + "loss": 0.0531, + "step": 70600 + }, + { + "epoch": 4.619561661759896, + "grad_norm": 1.027761459350586, + "learning_rate": 7.124822867587715e-05, + "loss": 0.0566, + "step": 70610 + }, + { + "epoch": 4.620215897939156, + "grad_norm": 1.0088636875152588, + "learning_rate": 7.123991310964519e-05, + "loss": 0.0577, + "step": 70620 + }, + { + "epoch": 4.620870134118417, + "grad_norm": 0.9433162212371826, + "learning_rate": 7.123159682651045e-05, + "loss": 0.06, + "step": 70630 + }, + { + "epoch": 4.621524370297678, + "grad_norm": 1.0811787843704224, + "learning_rate": 7.122327982675363e-05, + "loss": 0.0597, + "step": 70640 + }, + { + "epoch": 4.622178606476938, + "grad_norm": 0.8905490636825562, + "learning_rate": 7.121496211065547e-05, + "loss": 0.0581, + "step": 70650 + }, + { + "epoch": 4.622832842656199, + "grad_norm": 0.801537811756134, + "learning_rate": 7.12066436784967e-05, + "loss": 0.0565, + "step": 70660 + }, + { + "epoch": 4.62348707883546, + "grad_norm": 0.9896419048309326, + "learning_rate": 7.119832453055809e-05, + "loss": 0.0645, + "step": 70670 + }, + { + "epoch": 4.62414131501472, + "grad_norm": 0.9551672339439392, + "learning_rate": 7.119000466712042e-05, + "loss": 0.058, + "step": 70680 + }, + { + "epoch": 4.624795551193981, + "grad_norm": 0.8658273220062256, + "learning_rate": 7.118168408846454e-05, + "loss": 0.0601, + "step": 70690 + }, + { + "epoch": 4.625449787373242, + "grad_norm": 0.7730889916419983, + "learning_rate": 7.117336279487124e-05, + "loss": 0.0589, + "step": 70700 + }, + { + "epoch": 4.626104023552503, + "grad_norm": 1.2071642875671387, + "learning_rate": 7.116504078662144e-05, + "loss": 0.0607, + "step": 70710 + }, + { + "epoch": 4.626758259731763, + "grad_norm": 0.6655055284500122, + "learning_rate": 7.1156718063996e-05, + "loss": 0.0672, + "step": 70720 + }, + { + "epoch": 4.627412495911024, + "grad_norm": 0.765287458896637, + "learning_rate": 7.114839462727585e-05, + "loss": 0.0657, + "step": 70730 + }, + { + "epoch": 4.628066732090285, + "grad_norm": 0.9906727075576782, + "learning_rate": 7.114007047674189e-05, + "loss": 0.0612, + "step": 70740 + }, + { + "epoch": 4.628720968269545, + "grad_norm": 0.8197884559631348, + "learning_rate": 7.113174561267514e-05, + "loss": 0.0638, + "step": 70750 + }, + { + "epoch": 4.629375204448806, + "grad_norm": 0.8256412744522095, + "learning_rate": 7.112342003535654e-05, + "loss": 0.062, + "step": 70760 + }, + { + "epoch": 4.630029440628067, + "grad_norm": 1.0937832593917847, + "learning_rate": 7.111509374506711e-05, + "loss": 0.0612, + "step": 70770 + }, + { + "epoch": 4.630683676807328, + "grad_norm": 0.8414881825447083, + "learning_rate": 7.11067667420879e-05, + "loss": 0.0588, + "step": 70780 + }, + { + "epoch": 4.631337912986588, + "grad_norm": 1.1070261001586914, + "learning_rate": 7.109843902669997e-05, + "loss": 0.0754, + "step": 70790 + }, + { + "epoch": 4.631992149165849, + "grad_norm": 1.002337098121643, + "learning_rate": 7.109011059918438e-05, + "loss": 0.062, + "step": 70800 + }, + { + "epoch": 4.63264638534511, + "grad_norm": 0.7594942450523376, + "learning_rate": 7.108178145982223e-05, + "loss": 0.06, + "step": 70810 + }, + { + "epoch": 4.63330062152437, + "grad_norm": 0.8567973375320435, + "learning_rate": 7.107345160889469e-05, + "loss": 0.0627, + "step": 70820 + }, + { + "epoch": 4.633954857703631, + "grad_norm": 0.919387936592102, + "learning_rate": 7.106512104668287e-05, + "loss": 0.0654, + "step": 70830 + }, + { + "epoch": 4.634609093882892, + "grad_norm": 0.9777571558952332, + "learning_rate": 7.1056789773468e-05, + "loss": 0.0661, + "step": 70840 + }, + { + "epoch": 4.635263330062152, + "grad_norm": 0.7844616174697876, + "learning_rate": 7.104845778953122e-05, + "loss": 0.0597, + "step": 70850 + }, + { + "epoch": 4.635917566241413, + "grad_norm": 0.8564948439598083, + "learning_rate": 7.10401250951538e-05, + "loss": 0.0679, + "step": 70860 + }, + { + "epoch": 4.636571802420674, + "grad_norm": 0.8882778286933899, + "learning_rate": 7.103179169061697e-05, + "loss": 0.0637, + "step": 70870 + }, + { + "epoch": 4.637226038599935, + "grad_norm": 0.8899109363555908, + "learning_rate": 7.102345757620204e-05, + "loss": 0.0639, + "step": 70880 + }, + { + "epoch": 4.637880274779195, + "grad_norm": 0.7381958961486816, + "learning_rate": 7.101512275219026e-05, + "loss": 0.0618, + "step": 70890 + }, + { + "epoch": 4.638534510958456, + "grad_norm": 1.1825381517410278, + "learning_rate": 7.100678721886296e-05, + "loss": 0.069, + "step": 70900 + }, + { + "epoch": 4.639188747137717, + "grad_norm": 0.8245072960853577, + "learning_rate": 7.099845097650152e-05, + "loss": 0.0648, + "step": 70910 + }, + { + "epoch": 4.639842983316978, + "grad_norm": 1.0069905519485474, + "learning_rate": 7.099011402538729e-05, + "loss": 0.0565, + "step": 70920 + }, + { + "epoch": 4.640497219496238, + "grad_norm": 0.9455946087837219, + "learning_rate": 7.098177636580165e-05, + "loss": 0.0644, + "step": 70930 + }, + { + "epoch": 4.641151455675499, + "grad_norm": 1.0115948915481567, + "learning_rate": 7.097343799802603e-05, + "loss": 0.0598, + "step": 70940 + }, + { + "epoch": 4.64180569185476, + "grad_norm": 0.910192608833313, + "learning_rate": 7.096509892234188e-05, + "loss": 0.062, + "step": 70950 + }, + { + "epoch": 4.64245992803402, + "grad_norm": 1.005014419555664, + "learning_rate": 7.095675913903067e-05, + "loss": 0.0573, + "step": 70960 + }, + { + "epoch": 4.643114164213281, + "grad_norm": 0.8233394622802734, + "learning_rate": 7.094841864837385e-05, + "loss": 0.0607, + "step": 70970 + }, + { + "epoch": 4.643768400392542, + "grad_norm": 0.84469074010849, + "learning_rate": 7.094007745065298e-05, + "loss": 0.0547, + "step": 70980 + }, + { + "epoch": 4.644422636571802, + "grad_norm": 0.846990168094635, + "learning_rate": 7.093173554614958e-05, + "loss": 0.0596, + "step": 70990 + }, + { + "epoch": 4.645076872751063, + "grad_norm": 1.1320626735687256, + "learning_rate": 7.092339293514521e-05, + "loss": 0.0595, + "step": 71000 + }, + { + "epoch": 4.645731108930324, + "grad_norm": 0.9012898206710815, + "learning_rate": 7.091504961792145e-05, + "loss": 0.0565, + "step": 71010 + }, + { + "epoch": 4.646385345109585, + "grad_norm": 0.9785043001174927, + "learning_rate": 7.090670559475991e-05, + "loss": 0.0636, + "step": 71020 + }, + { + "epoch": 4.647039581288845, + "grad_norm": 0.9479057788848877, + "learning_rate": 7.089836086594223e-05, + "loss": 0.063, + "step": 71030 + }, + { + "epoch": 4.647693817468106, + "grad_norm": 0.8791054487228394, + "learning_rate": 7.089001543175007e-05, + "loss": 0.0666, + "step": 71040 + }, + { + "epoch": 4.648348053647367, + "grad_norm": 1.0434389114379883, + "learning_rate": 7.088166929246509e-05, + "loss": 0.068, + "step": 71050 + }, + { + "epoch": 4.649002289826628, + "grad_norm": 0.8566122651100159, + "learning_rate": 7.087332244836901e-05, + "loss": 0.0675, + "step": 71060 + }, + { + "epoch": 4.649656526005888, + "grad_norm": 0.9243730902671814, + "learning_rate": 7.086497489974355e-05, + "loss": 0.0639, + "step": 71070 + }, + { + "epoch": 4.650310762185149, + "grad_norm": 0.8859608173370361, + "learning_rate": 7.085662664687049e-05, + "loss": 0.06, + "step": 71080 + }, + { + "epoch": 4.65096499836441, + "grad_norm": 1.0583763122558594, + "learning_rate": 7.084827769003157e-05, + "loss": 0.0573, + "step": 71090 + }, + { + "epoch": 4.65161923454367, + "grad_norm": 0.8830341100692749, + "learning_rate": 7.083992802950859e-05, + "loss": 0.0557, + "step": 71100 + }, + { + "epoch": 4.652273470722931, + "grad_norm": 0.8435945510864258, + "learning_rate": 7.08315776655834e-05, + "loss": 0.0535, + "step": 71110 + }, + { + "epoch": 4.652927706902192, + "grad_norm": 0.9333560466766357, + "learning_rate": 7.082322659853782e-05, + "loss": 0.065, + "step": 71120 + }, + { + "epoch": 4.653581943081452, + "grad_norm": 0.8376295566558838, + "learning_rate": 7.081487482865375e-05, + "loss": 0.0518, + "step": 71130 + }, + { + "epoch": 4.654236179260713, + "grad_norm": 0.7499420642852783, + "learning_rate": 7.080652235621304e-05, + "loss": 0.0536, + "step": 71140 + }, + { + "epoch": 4.654890415439974, + "grad_norm": 0.7663356065750122, + "learning_rate": 7.079816918149764e-05, + "loss": 0.0598, + "step": 71150 + }, + { + "epoch": 4.655544651619235, + "grad_norm": 1.0369300842285156, + "learning_rate": 7.07898153047895e-05, + "loss": 0.0584, + "step": 71160 + }, + { + "epoch": 4.656198887798495, + "grad_norm": 0.8538703322410583, + "learning_rate": 7.078146072637055e-05, + "loss": 0.0608, + "step": 71170 + }, + { + "epoch": 4.656853123977756, + "grad_norm": 0.9272082448005676, + "learning_rate": 7.077310544652282e-05, + "loss": 0.0666, + "step": 71180 + }, + { + "epoch": 4.657507360157017, + "grad_norm": 0.8066675066947937, + "learning_rate": 7.076474946552828e-05, + "loss": 0.0551, + "step": 71190 + }, + { + "epoch": 4.658161596336277, + "grad_norm": 0.9657180905342102, + "learning_rate": 7.0756392783669e-05, + "loss": 0.067, + "step": 71200 + }, + { + "epoch": 4.658815832515538, + "grad_norm": 0.9008907675743103, + "learning_rate": 7.074803540122703e-05, + "loss": 0.0601, + "step": 71210 + }, + { + "epoch": 4.659470068694799, + "grad_norm": 0.7514058947563171, + "learning_rate": 7.073967731848445e-05, + "loss": 0.0573, + "step": 71220 + }, + { + "epoch": 4.66012430487406, + "grad_norm": 0.8084816336631775, + "learning_rate": 7.073131853572335e-05, + "loss": 0.0641, + "step": 71230 + }, + { + "epoch": 4.66077854105332, + "grad_norm": 0.7935675978660583, + "learning_rate": 7.072295905322592e-05, + "loss": 0.0567, + "step": 71240 + }, + { + "epoch": 4.661432777232581, + "grad_norm": 0.9092238545417786, + "learning_rate": 7.071459887127424e-05, + "loss": 0.0674, + "step": 71250 + }, + { + "epoch": 4.662087013411842, + "grad_norm": 0.9651187062263489, + "learning_rate": 7.070623799015052e-05, + "loss": 0.0626, + "step": 71260 + }, + { + "epoch": 4.662741249591102, + "grad_norm": 0.7449125051498413, + "learning_rate": 7.069787641013699e-05, + "loss": 0.0598, + "step": 71270 + }, + { + "epoch": 4.663395485770363, + "grad_norm": 0.8150971531867981, + "learning_rate": 7.068951413151583e-05, + "loss": 0.0704, + "step": 71280 + }, + { + "epoch": 4.664049721949624, + "grad_norm": 1.1093558073043823, + "learning_rate": 7.06811511545693e-05, + "loss": 0.0529, + "step": 71290 + }, + { + "epoch": 4.664703958128884, + "grad_norm": 0.9424728155136108, + "learning_rate": 7.06727874795797e-05, + "loss": 0.0607, + "step": 71300 + }, + { + "epoch": 4.665358194308145, + "grad_norm": 0.9972444772720337, + "learning_rate": 7.06644231068293e-05, + "loss": 0.0585, + "step": 71310 + }, + { + "epoch": 4.666012430487406, + "grad_norm": 0.8003326654434204, + "learning_rate": 7.065605803660042e-05, + "loss": 0.0623, + "step": 71320 + }, + { + "epoch": 4.666666666666667, + "grad_norm": 0.8432341814041138, + "learning_rate": 7.064769226917543e-05, + "loss": 0.0526, + "step": 71330 + }, + { + "epoch": 4.667320902845927, + "grad_norm": 0.828727126121521, + "learning_rate": 7.063932580483665e-05, + "loss": 0.0643, + "step": 71340 + }, + { + "epoch": 4.667975139025188, + "grad_norm": 0.9466299414634705, + "learning_rate": 7.063095864386651e-05, + "loss": 0.057, + "step": 71350 + }, + { + "epoch": 4.668629375204449, + "grad_norm": 0.6953588128089905, + "learning_rate": 7.06225907865474e-05, + "loss": 0.0522, + "step": 71360 + }, + { + "epoch": 4.66928361138371, + "grad_norm": 0.8251481056213379, + "learning_rate": 7.061422223316176e-05, + "loss": 0.06, + "step": 71370 + }, + { + "epoch": 4.66993784756297, + "grad_norm": 1.085668683052063, + "learning_rate": 7.060585298399207e-05, + "loss": 0.0653, + "step": 71380 + }, + { + "epoch": 4.670592083742231, + "grad_norm": 1.0009715557098389, + "learning_rate": 7.05974830393208e-05, + "loss": 0.065, + "step": 71390 + }, + { + "epoch": 4.671246319921492, + "grad_norm": 0.8641042709350586, + "learning_rate": 7.058911239943046e-05, + "loss": 0.0589, + "step": 71400 + }, + { + "epoch": 4.671900556100752, + "grad_norm": 0.8176802396774292, + "learning_rate": 7.058074106460357e-05, + "loss": 0.058, + "step": 71410 + }, + { + "epoch": 4.672554792280013, + "grad_norm": 0.8100340366363525, + "learning_rate": 7.057236903512269e-05, + "loss": 0.0636, + "step": 71420 + }, + { + "epoch": 4.673209028459274, + "grad_norm": 0.9662539958953857, + "learning_rate": 7.056399631127043e-05, + "loss": 0.0568, + "step": 71430 + }, + { + "epoch": 4.673863264638534, + "grad_norm": 1.049534559249878, + "learning_rate": 7.055562289332934e-05, + "loss": 0.0701, + "step": 71440 + }, + { + "epoch": 4.674517500817795, + "grad_norm": 0.670726478099823, + "learning_rate": 7.054724878158206e-05, + "loss": 0.0551, + "step": 71450 + }, + { + "epoch": 4.675171736997056, + "grad_norm": 1.3592555522918701, + "learning_rate": 7.053887397631127e-05, + "loss": 0.0645, + "step": 71460 + }, + { + "epoch": 4.675825973176317, + "grad_norm": 0.7808588147163391, + "learning_rate": 7.053049847779961e-05, + "loss": 0.0649, + "step": 71470 + }, + { + "epoch": 4.676480209355577, + "grad_norm": 0.8169131875038147, + "learning_rate": 7.052212228632977e-05, + "loss": 0.0547, + "step": 71480 + }, + { + "epoch": 4.677134445534838, + "grad_norm": 0.8290903568267822, + "learning_rate": 7.051374540218451e-05, + "loss": 0.065, + "step": 71490 + }, + { + "epoch": 4.677788681714099, + "grad_norm": 1.1689528226852417, + "learning_rate": 7.050536782564653e-05, + "loss": 0.0707, + "step": 71500 + }, + { + "epoch": 4.67844291789336, + "grad_norm": 0.7976114749908447, + "learning_rate": 7.04969895569986e-05, + "loss": 0.0652, + "step": 71510 + }, + { + "epoch": 4.67909715407262, + "grad_norm": 0.9616780281066895, + "learning_rate": 7.04886105965235e-05, + "loss": 0.061, + "step": 71520 + }, + { + "epoch": 4.679751390251881, + "grad_norm": 0.9938467741012573, + "learning_rate": 7.048023094450411e-05, + "loss": 0.062, + "step": 71530 + }, + { + "epoch": 4.680405626431142, + "grad_norm": 0.8165993690490723, + "learning_rate": 7.047185060122317e-05, + "loss": 0.0564, + "step": 71540 + }, + { + "epoch": 4.681059862610402, + "grad_norm": 0.8592141270637512, + "learning_rate": 7.046346956696359e-05, + "loss": 0.0621, + "step": 71550 + }, + { + "epoch": 4.681714098789663, + "grad_norm": 0.957463800907135, + "learning_rate": 7.045508784200826e-05, + "loss": 0.0575, + "step": 71560 + }, + { + "epoch": 4.682368334968924, + "grad_norm": 0.7497161626815796, + "learning_rate": 7.044670542664007e-05, + "loss": 0.0598, + "step": 71570 + }, + { + "epoch": 4.683022571148184, + "grad_norm": 0.8132786750793457, + "learning_rate": 7.043832232114194e-05, + "loss": 0.0562, + "step": 71580 + }, + { + "epoch": 4.683676807327445, + "grad_norm": 0.8646707534790039, + "learning_rate": 7.042993852579683e-05, + "loss": 0.056, + "step": 71590 + }, + { + "epoch": 4.684331043506706, + "grad_norm": 0.82687908411026, + "learning_rate": 7.042155404088772e-05, + "loss": 0.0574, + "step": 71600 + }, + { + "epoch": 4.684985279685967, + "grad_norm": 0.9083144664764404, + "learning_rate": 7.04131688666976e-05, + "loss": 0.0564, + "step": 71610 + }, + { + "epoch": 4.685639515865227, + "grad_norm": 0.7439766526222229, + "learning_rate": 7.04047830035095e-05, + "loss": 0.06, + "step": 71620 + }, + { + "epoch": 4.686293752044488, + "grad_norm": 0.8073269128799438, + "learning_rate": 7.039639645160646e-05, + "loss": 0.064, + "step": 71630 + }, + { + "epoch": 4.686947988223749, + "grad_norm": 1.0741745233535767, + "learning_rate": 7.038800921127152e-05, + "loss": 0.0638, + "step": 71640 + }, + { + "epoch": 4.687602224403009, + "grad_norm": 0.9227171540260315, + "learning_rate": 7.037962128278783e-05, + "loss": 0.0577, + "step": 71650 + }, + { + "epoch": 4.68825646058227, + "grad_norm": 1.138973355293274, + "learning_rate": 7.037123266643846e-05, + "loss": 0.0701, + "step": 71660 + }, + { + "epoch": 4.688910696761531, + "grad_norm": 1.039119839668274, + "learning_rate": 7.036284336250658e-05, + "loss": 0.0623, + "step": 71670 + }, + { + "epoch": 4.689564932940792, + "grad_norm": 1.0590921640396118, + "learning_rate": 7.035445337127532e-05, + "loss": 0.0638, + "step": 71680 + }, + { + "epoch": 4.690219169120052, + "grad_norm": 0.8854411244392395, + "learning_rate": 7.034606269302789e-05, + "loss": 0.0663, + "step": 71690 + }, + { + "epoch": 4.690873405299313, + "grad_norm": 0.7692389488220215, + "learning_rate": 7.033767132804747e-05, + "loss": 0.0529, + "step": 71700 + }, + { + "epoch": 4.691527641478574, + "grad_norm": 0.6598808765411377, + "learning_rate": 7.03292792766173e-05, + "loss": 0.0584, + "step": 71710 + }, + { + "epoch": 4.692181877657834, + "grad_norm": 0.8510991334915161, + "learning_rate": 7.032088653902067e-05, + "loss": 0.0621, + "step": 71720 + }, + { + "epoch": 4.692836113837095, + "grad_norm": 0.9420493245124817, + "learning_rate": 7.031249311554079e-05, + "loss": 0.0635, + "step": 71730 + }, + { + "epoch": 4.693490350016356, + "grad_norm": 0.8198224306106567, + "learning_rate": 7.0304099006461e-05, + "loss": 0.0673, + "step": 71740 + }, + { + "epoch": 4.694144586195616, + "grad_norm": 1.035709023475647, + "learning_rate": 7.029570421206464e-05, + "loss": 0.064, + "step": 71750 + }, + { + "epoch": 4.694798822374877, + "grad_norm": 0.8997196555137634, + "learning_rate": 7.028730873263502e-05, + "loss": 0.0555, + "step": 71760 + }, + { + "epoch": 4.695453058554138, + "grad_norm": 0.9662396907806396, + "learning_rate": 7.027891256845553e-05, + "loss": 0.0634, + "step": 71770 + }, + { + "epoch": 4.696107294733399, + "grad_norm": 0.7257537245750427, + "learning_rate": 7.027051571980957e-05, + "loss": 0.0614, + "step": 71780 + }, + { + "epoch": 4.696761530912659, + "grad_norm": 0.9215990900993347, + "learning_rate": 7.026211818698053e-05, + "loss": 0.0698, + "step": 71790 + }, + { + "epoch": 4.69741576709192, + "grad_norm": 0.7259654402732849, + "learning_rate": 7.025371997025185e-05, + "loss": 0.0678, + "step": 71800 + }, + { + "epoch": 4.698070003271181, + "grad_norm": 0.9841349124908447, + "learning_rate": 7.0245321069907e-05, + "loss": 0.0576, + "step": 71810 + }, + { + "epoch": 4.698724239450442, + "grad_norm": 0.9428102374076843, + "learning_rate": 7.02369214862295e-05, + "loss": 0.0601, + "step": 71820 + }, + { + "epoch": 4.699378475629702, + "grad_norm": 0.7501752376556396, + "learning_rate": 7.022852121950281e-05, + "loss": 0.0564, + "step": 71830 + }, + { + "epoch": 4.700032711808963, + "grad_norm": 0.705059289932251, + "learning_rate": 7.022012027001048e-05, + "loss": 0.0499, + "step": 71840 + }, + { + "epoch": 4.700686947988224, + "grad_norm": 0.7624207735061646, + "learning_rate": 7.021171863803606e-05, + "loss": 0.072, + "step": 71850 + }, + { + "epoch": 4.701341184167484, + "grad_norm": 0.7773285508155823, + "learning_rate": 7.020331632386312e-05, + "loss": 0.0545, + "step": 71860 + }, + { + "epoch": 4.701995420346745, + "grad_norm": 1.0297824144363403, + "learning_rate": 7.019491332777528e-05, + "loss": 0.0549, + "step": 71870 + }, + { + "epoch": 4.702649656526006, + "grad_norm": 0.9911094307899475, + "learning_rate": 7.018650965005616e-05, + "loss": 0.0671, + "step": 71880 + }, + { + "epoch": 4.703303892705266, + "grad_norm": 0.960195779800415, + "learning_rate": 7.017810529098938e-05, + "loss": 0.0664, + "step": 71890 + }, + { + "epoch": 4.703958128884527, + "grad_norm": 0.7779977321624756, + "learning_rate": 7.016970025085864e-05, + "loss": 0.0593, + "step": 71900 + }, + { + "epoch": 4.704612365063788, + "grad_norm": 0.8631134629249573, + "learning_rate": 7.016129452994761e-05, + "loss": 0.06, + "step": 71910 + }, + { + "epoch": 4.705266601243049, + "grad_norm": 0.854272186756134, + "learning_rate": 7.015288812854003e-05, + "loss": 0.0593, + "step": 71920 + }, + { + "epoch": 4.705920837422309, + "grad_norm": 1.105404257774353, + "learning_rate": 7.01444810469196e-05, + "loss": 0.0675, + "step": 71930 + }, + { + "epoch": 4.70657507360157, + "grad_norm": 0.8611233234405518, + "learning_rate": 7.013607328537012e-05, + "loss": 0.0567, + "step": 71940 + }, + { + "epoch": 4.707229309780831, + "grad_norm": 0.8065164089202881, + "learning_rate": 7.012766484417536e-05, + "loss": 0.065, + "step": 71950 + }, + { + "epoch": 4.707883545960092, + "grad_norm": 0.853921115398407, + "learning_rate": 7.011925572361912e-05, + "loss": 0.0626, + "step": 71960 + }, + { + "epoch": 4.708537782139352, + "grad_norm": 0.850261926651001, + "learning_rate": 7.011084592398523e-05, + "loss": 0.0757, + "step": 71970 + }, + { + "epoch": 4.709192018318613, + "grad_norm": 1.0401116609573364, + "learning_rate": 7.010243544555756e-05, + "loss": 0.0597, + "step": 71980 + }, + { + "epoch": 4.709846254497874, + "grad_norm": 1.0012506246566772, + "learning_rate": 7.009402428861995e-05, + "loss": 0.0637, + "step": 71990 + }, + { + "epoch": 4.710500490677134, + "grad_norm": 0.854714572429657, + "learning_rate": 7.008561245345634e-05, + "loss": 0.0605, + "step": 72000 + }, + { + "epoch": 4.711154726856395, + "grad_norm": 1.0572654008865356, + "learning_rate": 7.007719994035063e-05, + "loss": 0.0718, + "step": 72010 + }, + { + "epoch": 4.711808963035656, + "grad_norm": 0.9308390021324158, + "learning_rate": 7.006878674958676e-05, + "loss": 0.0705, + "step": 72020 + }, + { + "epoch": 4.712463199214916, + "grad_norm": 0.945894181728363, + "learning_rate": 7.006037288144872e-05, + "loss": 0.0676, + "step": 72030 + }, + { + "epoch": 4.713117435394177, + "grad_norm": 1.6244274377822876, + "learning_rate": 7.005195833622048e-05, + "loss": 0.059, + "step": 72040 + }, + { + "epoch": 4.713771671573438, + "grad_norm": 0.9291044473648071, + "learning_rate": 7.004354311418606e-05, + "loss": 0.0635, + "step": 72050 + }, + { + "epoch": 4.714425907752699, + "grad_norm": 0.9611480832099915, + "learning_rate": 7.003512721562949e-05, + "loss": 0.0566, + "step": 72060 + }, + { + "epoch": 4.715080143931959, + "grad_norm": 0.7825486660003662, + "learning_rate": 7.002671064083482e-05, + "loss": 0.0575, + "step": 72070 + }, + { + "epoch": 4.71573438011122, + "grad_norm": 0.8755865693092346, + "learning_rate": 7.001829339008616e-05, + "loss": 0.065, + "step": 72080 + }, + { + "epoch": 4.716388616290481, + "grad_norm": 0.8769670724868774, + "learning_rate": 7.000987546366758e-05, + "loss": 0.068, + "step": 72090 + }, + { + "epoch": 4.717042852469741, + "grad_norm": 0.7498029470443726, + "learning_rate": 7.000145686186324e-05, + "loss": 0.0537, + "step": 72100 + }, + { + "epoch": 4.717697088649002, + "grad_norm": 1.0412774085998535, + "learning_rate": 6.999303758495727e-05, + "loss": 0.0519, + "step": 72110 + }, + { + "epoch": 4.718351324828263, + "grad_norm": 0.8896167278289795, + "learning_rate": 6.998461763323385e-05, + "loss": 0.0576, + "step": 72120 + }, + { + "epoch": 4.719005561007524, + "grad_norm": 1.0666289329528809, + "learning_rate": 6.997619700697719e-05, + "loss": 0.0625, + "step": 72130 + }, + { + "epoch": 4.719659797186784, + "grad_norm": 1.330458164215088, + "learning_rate": 6.996777570647147e-05, + "loss": 0.07, + "step": 72140 + }, + { + "epoch": 4.720314033366045, + "grad_norm": 1.0409564971923828, + "learning_rate": 6.995935373200095e-05, + "loss": 0.0543, + "step": 72150 + }, + { + "epoch": 4.720968269545306, + "grad_norm": 0.8398634195327759, + "learning_rate": 6.995093108384992e-05, + "loss": 0.0582, + "step": 72160 + }, + { + "epoch": 4.721622505724566, + "grad_norm": 0.9149483442306519, + "learning_rate": 6.994250776230262e-05, + "loss": 0.0605, + "step": 72170 + }, + { + "epoch": 4.722276741903827, + "grad_norm": 0.8937086462974548, + "learning_rate": 6.993408376764339e-05, + "loss": 0.0579, + "step": 72180 + }, + { + "epoch": 4.722930978083088, + "grad_norm": 0.8356661796569824, + "learning_rate": 6.992565910015655e-05, + "loss": 0.0569, + "step": 72190 + }, + { + "epoch": 4.723585214262348, + "grad_norm": 1.0383862257003784, + "learning_rate": 6.991723376012646e-05, + "loss": 0.0665, + "step": 72200 + }, + { + "epoch": 4.724239450441609, + "grad_norm": 0.9304238557815552, + "learning_rate": 6.99088077478375e-05, + "loss": 0.054, + "step": 72210 + }, + { + "epoch": 4.72489368662087, + "grad_norm": 1.0787382125854492, + "learning_rate": 6.990038106357407e-05, + "loss": 0.059, + "step": 72220 + }, + { + "epoch": 4.725547922800131, + "grad_norm": 0.9750184416770935, + "learning_rate": 6.989195370762057e-05, + "loss": 0.0621, + "step": 72230 + }, + { + "epoch": 4.726202158979391, + "grad_norm": 0.9108086824417114, + "learning_rate": 6.988352568026148e-05, + "loss": 0.0576, + "step": 72240 + }, + { + "epoch": 4.726856395158652, + "grad_norm": 1.0215849876403809, + "learning_rate": 6.987509698178125e-05, + "loss": 0.0619, + "step": 72250 + }, + { + "epoch": 4.727510631337913, + "grad_norm": 1.3526380062103271, + "learning_rate": 6.986666761246436e-05, + "loss": 0.0573, + "step": 72260 + }, + { + "epoch": 4.728164867517174, + "grad_norm": 0.9454095363616943, + "learning_rate": 6.985823757259535e-05, + "loss": 0.0601, + "step": 72270 + }, + { + "epoch": 4.728819103696434, + "grad_norm": 1.032045841217041, + "learning_rate": 6.984980686245874e-05, + "loss": 0.0577, + "step": 72280 + }, + { + "epoch": 4.729473339875695, + "grad_norm": 0.8855615854263306, + "learning_rate": 6.98413754823391e-05, + "loss": 0.0683, + "step": 72290 + }, + { + "epoch": 4.730127576054956, + "grad_norm": 0.9899473786354065, + "learning_rate": 6.983294343252098e-05, + "loss": 0.0705, + "step": 72300 + }, + { + "epoch": 4.730781812234216, + "grad_norm": 0.9037162065505981, + "learning_rate": 6.982451071328902e-05, + "loss": 0.0659, + "step": 72310 + }, + { + "epoch": 4.731436048413477, + "grad_norm": 0.9520037174224854, + "learning_rate": 6.98160773249278e-05, + "loss": 0.0558, + "step": 72320 + }, + { + "epoch": 4.732090284592738, + "grad_norm": 0.6964289546012878, + "learning_rate": 6.980764326772204e-05, + "loss": 0.054, + "step": 72330 + }, + { + "epoch": 4.732744520771998, + "grad_norm": 1.047905683517456, + "learning_rate": 6.979920854195637e-05, + "loss": 0.0623, + "step": 72340 + }, + { + "epoch": 4.733398756951259, + "grad_norm": 1.0188045501708984, + "learning_rate": 6.979077314791546e-05, + "loss": 0.0634, + "step": 72350 + }, + { + "epoch": 4.73405299313052, + "grad_norm": 0.7242242097854614, + "learning_rate": 6.978233708588407e-05, + "loss": 0.0592, + "step": 72360 + }, + { + "epoch": 4.734707229309781, + "grad_norm": 1.0020005702972412, + "learning_rate": 6.977390035614692e-05, + "loss": 0.0608, + "step": 72370 + }, + { + "epoch": 4.735361465489041, + "grad_norm": 0.9756394028663635, + "learning_rate": 6.976546295898878e-05, + "loss": 0.0702, + "step": 72380 + }, + { + "epoch": 4.736015701668302, + "grad_norm": 0.9397419095039368, + "learning_rate": 6.975702489469442e-05, + "loss": 0.0573, + "step": 72390 + }, + { + "epoch": 4.736669937847563, + "grad_norm": 0.9236851334571838, + "learning_rate": 6.974858616354867e-05, + "loss": 0.0593, + "step": 72400 + }, + { + "epoch": 4.737324174026824, + "grad_norm": 0.7359132170677185, + "learning_rate": 6.974014676583632e-05, + "loss": 0.0565, + "step": 72410 + }, + { + "epoch": 4.737978410206084, + "grad_norm": 1.030502438545227, + "learning_rate": 6.973170670184226e-05, + "loss": 0.0568, + "step": 72420 + }, + { + "epoch": 4.738632646385345, + "grad_norm": 0.7706143856048584, + "learning_rate": 6.972326597185136e-05, + "loss": 0.0571, + "step": 72430 + }, + { + "epoch": 4.739286882564606, + "grad_norm": 0.8049507737159729, + "learning_rate": 6.971482457614848e-05, + "loss": 0.0619, + "step": 72440 + }, + { + "epoch": 4.739941118743866, + "grad_norm": 0.9160942435264587, + "learning_rate": 6.970638251501859e-05, + "loss": 0.0699, + "step": 72450 + }, + { + "epoch": 4.740595354923127, + "grad_norm": 0.8005819916725159, + "learning_rate": 6.96979397887466e-05, + "loss": 0.0572, + "step": 72460 + }, + { + "epoch": 4.741249591102388, + "grad_norm": 0.9010353088378906, + "learning_rate": 6.96894963976175e-05, + "loss": 0.06, + "step": 72470 + }, + { + "epoch": 4.741903827281648, + "grad_norm": 0.8157947659492493, + "learning_rate": 6.968105234191623e-05, + "loss": 0.0609, + "step": 72480 + }, + { + "epoch": 4.742558063460909, + "grad_norm": 0.968302309513092, + "learning_rate": 6.967260762192785e-05, + "loss": 0.0542, + "step": 72490 + }, + { + "epoch": 4.74321229964017, + "grad_norm": 1.0909911394119263, + "learning_rate": 6.966416223793736e-05, + "loss": 0.0624, + "step": 72500 + }, + { + "epoch": 4.743866535819431, + "grad_norm": 0.7629793286323547, + "learning_rate": 6.965571619022981e-05, + "loss": 0.0592, + "step": 72510 + }, + { + "epoch": 4.744520771998691, + "grad_norm": 0.7732744216918945, + "learning_rate": 6.964726947909031e-05, + "loss": 0.0632, + "step": 72520 + }, + { + "epoch": 4.745175008177952, + "grad_norm": 0.8113254904747009, + "learning_rate": 6.963882210480394e-05, + "loss": 0.0632, + "step": 72530 + }, + { + "epoch": 4.745829244357213, + "grad_norm": 1.129671573638916, + "learning_rate": 6.963037406765581e-05, + "loss": 0.0577, + "step": 72540 + }, + { + "epoch": 4.746483480536473, + "grad_norm": 0.8630444407463074, + "learning_rate": 6.962192536793106e-05, + "loss": 0.0709, + "step": 72550 + }, + { + "epoch": 4.747137716715734, + "grad_norm": 0.8324365019798279, + "learning_rate": 6.961347600591489e-05, + "loss": 0.057, + "step": 72560 + }, + { + "epoch": 4.747791952894995, + "grad_norm": 0.919032633304596, + "learning_rate": 6.960502598189245e-05, + "loss": 0.0581, + "step": 72570 + }, + { + "epoch": 4.748446189074256, + "grad_norm": 1.0946227312088013, + "learning_rate": 6.959657529614898e-05, + "loss": 0.0643, + "step": 72580 + }, + { + "epoch": 4.749100425253516, + "grad_norm": 0.8766310214996338, + "learning_rate": 6.95881239489697e-05, + "loss": 0.0577, + "step": 72590 + }, + { + "epoch": 4.749754661432777, + "grad_norm": 0.9453266859054565, + "learning_rate": 6.957967194063985e-05, + "loss": 0.0631, + "step": 72600 + }, + { + "epoch": 4.750408897612038, + "grad_norm": 0.74550861120224, + "learning_rate": 6.957121927144474e-05, + "loss": 0.0627, + "step": 72610 + }, + { + "epoch": 4.751063133791298, + "grad_norm": 0.7826007008552551, + "learning_rate": 6.956276594166964e-05, + "loss": 0.0572, + "step": 72620 + }, + { + "epoch": 4.751717369970559, + "grad_norm": 0.9839434027671814, + "learning_rate": 6.955431195159989e-05, + "loss": 0.0724, + "step": 72630 + }, + { + "epoch": 4.75237160614982, + "grad_norm": 0.808029294013977, + "learning_rate": 6.954585730152083e-05, + "loss": 0.0634, + "step": 72640 + }, + { + "epoch": 4.75302584232908, + "grad_norm": 0.935702383518219, + "learning_rate": 6.953740199171782e-05, + "loss": 0.0734, + "step": 72650 + }, + { + "epoch": 4.753680078508341, + "grad_norm": 1.2413403987884521, + "learning_rate": 6.952894602247626e-05, + "loss": 0.0655, + "step": 72660 + }, + { + "epoch": 4.754334314687602, + "grad_norm": 1.0404640436172485, + "learning_rate": 6.952048939408156e-05, + "loss": 0.0731, + "step": 72670 + }, + { + "epoch": 4.754988550866863, + "grad_norm": 0.900425136089325, + "learning_rate": 6.951203210681914e-05, + "loss": 0.0559, + "step": 72680 + }, + { + "epoch": 4.755642787046123, + "grad_norm": 0.9717649817466736, + "learning_rate": 6.950357416097446e-05, + "loss": 0.0594, + "step": 72690 + }, + { + "epoch": 4.756297023225384, + "grad_norm": 1.1071817874908447, + "learning_rate": 6.949511555683301e-05, + "loss": 0.0638, + "step": 72700 + }, + { + "epoch": 4.756951259404645, + "grad_norm": 0.9194473028182983, + "learning_rate": 6.948665629468027e-05, + "loss": 0.0655, + "step": 72710 + }, + { + "epoch": 4.757605495583906, + "grad_norm": 0.7610183358192444, + "learning_rate": 6.94781963748018e-05, + "loss": 0.0632, + "step": 72720 + }, + { + "epoch": 4.758259731763166, + "grad_norm": 0.8666313886642456, + "learning_rate": 6.946973579748309e-05, + "loss": 0.0659, + "step": 72730 + }, + { + "epoch": 4.758913967942427, + "grad_norm": 0.8880037665367126, + "learning_rate": 6.946127456300974e-05, + "loss": 0.062, + "step": 72740 + }, + { + "epoch": 4.759568204121688, + "grad_norm": 0.8450115323066711, + "learning_rate": 6.945281267166736e-05, + "loss": 0.072, + "step": 72750 + }, + { + "epoch": 4.760222440300948, + "grad_norm": 1.100077509880066, + "learning_rate": 6.94443501237415e-05, + "loss": 0.0661, + "step": 72760 + }, + { + "epoch": 4.760876676480209, + "grad_norm": 0.956047534942627, + "learning_rate": 6.943588691951785e-05, + "loss": 0.0607, + "step": 72770 + }, + { + "epoch": 4.76153091265947, + "grad_norm": 0.7662896513938904, + "learning_rate": 6.942742305928205e-05, + "loss": 0.0627, + "step": 72780 + }, + { + "epoch": 4.76218514883873, + "grad_norm": 0.9403966069221497, + "learning_rate": 6.941895854331977e-05, + "loss": 0.0646, + "step": 72790 + }, + { + "epoch": 4.762839385017991, + "grad_norm": 0.918593168258667, + "learning_rate": 6.94104933719167e-05, + "loss": 0.0521, + "step": 72800 + }, + { + "epoch": 4.763493621197252, + "grad_norm": 0.8933160901069641, + "learning_rate": 6.940202754535856e-05, + "loss": 0.0629, + "step": 72810 + }, + { + "epoch": 4.764147857376513, + "grad_norm": 0.8830978274345398, + "learning_rate": 6.939356106393113e-05, + "loss": 0.059, + "step": 72820 + }, + { + "epoch": 4.764802093555773, + "grad_norm": 0.9082246422767639, + "learning_rate": 6.938509392792016e-05, + "loss": 0.057, + "step": 72830 + }, + { + "epoch": 4.765456329735034, + "grad_norm": 1.0356862545013428, + "learning_rate": 6.93766261376114e-05, + "loss": 0.0589, + "step": 72840 + }, + { + "epoch": 4.766110565914295, + "grad_norm": 0.8933234810829163, + "learning_rate": 6.936815769329071e-05, + "loss": 0.0596, + "step": 72850 + }, + { + "epoch": 4.766764802093556, + "grad_norm": 0.8427624106407166, + "learning_rate": 6.935968859524389e-05, + "loss": 0.0563, + "step": 72860 + }, + { + "epoch": 4.767419038272816, + "grad_norm": 0.8006119728088379, + "learning_rate": 6.935121884375683e-05, + "loss": 0.0591, + "step": 72870 + }, + { + "epoch": 4.768073274452077, + "grad_norm": 0.7767458558082581, + "learning_rate": 6.934274843911537e-05, + "loss": 0.0603, + "step": 72880 + }, + { + "epoch": 4.768727510631338, + "grad_norm": 0.8365658521652222, + "learning_rate": 6.933427738160542e-05, + "loss": 0.0648, + "step": 72890 + }, + { + "epoch": 4.769381746810598, + "grad_norm": 0.9282712340354919, + "learning_rate": 6.93258056715129e-05, + "loss": 0.0722, + "step": 72900 + }, + { + "epoch": 4.770035982989859, + "grad_norm": 0.7623592615127563, + "learning_rate": 6.931733330912375e-05, + "loss": 0.0618, + "step": 72910 + }, + { + "epoch": 4.77069021916912, + "grad_norm": 0.8280691504478455, + "learning_rate": 6.930886029472396e-05, + "loss": 0.0585, + "step": 72920 + }, + { + "epoch": 4.77134445534838, + "grad_norm": 0.8656930923461914, + "learning_rate": 6.930038662859947e-05, + "loss": 0.06, + "step": 72930 + }, + { + "epoch": 4.771998691527641, + "grad_norm": 0.8957453370094299, + "learning_rate": 6.929191231103634e-05, + "loss": 0.0626, + "step": 72940 + }, + { + "epoch": 4.772652927706902, + "grad_norm": 0.8083595037460327, + "learning_rate": 6.928343734232057e-05, + "loss": 0.0679, + "step": 72950 + }, + { + "epoch": 4.773307163886163, + "grad_norm": 0.9139804244041443, + "learning_rate": 6.92749617227382e-05, + "loss": 0.0612, + "step": 72960 + }, + { + "epoch": 4.773961400065423, + "grad_norm": 0.9249205589294434, + "learning_rate": 6.926648545257534e-05, + "loss": 0.0599, + "step": 72970 + }, + { + "epoch": 4.774615636244684, + "grad_norm": 0.9592366218566895, + "learning_rate": 6.925800853211807e-05, + "loss": 0.054, + "step": 72980 + }, + { + "epoch": 4.775269872423945, + "grad_norm": 0.9399287700653076, + "learning_rate": 6.924953096165248e-05, + "loss": 0.0585, + "step": 72990 + }, + { + "epoch": 4.775924108603205, + "grad_norm": 0.8287541270256042, + "learning_rate": 6.924105274146476e-05, + "loss": 0.0568, + "step": 73000 + }, + { + "epoch": 4.776578344782466, + "grad_norm": 0.6379443407058716, + "learning_rate": 6.923257387184103e-05, + "loss": 0.0574, + "step": 73010 + }, + { + "epoch": 4.777232580961727, + "grad_norm": 0.7155212163925171, + "learning_rate": 6.922409435306751e-05, + "loss": 0.0646, + "step": 73020 + }, + { + "epoch": 4.777886817140988, + "grad_norm": 0.8226625919342041, + "learning_rate": 6.921561418543037e-05, + "loss": 0.0576, + "step": 73030 + }, + { + "epoch": 4.778541053320248, + "grad_norm": 0.8837935328483582, + "learning_rate": 6.920713336921588e-05, + "loss": 0.0607, + "step": 73040 + }, + { + "epoch": 4.779195289499509, + "grad_norm": 0.8143121004104614, + "learning_rate": 6.919865190471026e-05, + "loss": 0.0559, + "step": 73050 + }, + { + "epoch": 4.77984952567877, + "grad_norm": 0.7172751426696777, + "learning_rate": 6.919016979219978e-05, + "loss": 0.052, + "step": 73060 + }, + { + "epoch": 4.78050376185803, + "grad_norm": 0.9022568464279175, + "learning_rate": 6.918168703197073e-05, + "loss": 0.0589, + "step": 73070 + }, + { + "epoch": 4.781157998037291, + "grad_norm": 0.9577611684799194, + "learning_rate": 6.917320362430945e-05, + "loss": 0.0654, + "step": 73080 + }, + { + "epoch": 4.781812234216552, + "grad_norm": 0.7758936285972595, + "learning_rate": 6.916471956950228e-05, + "loss": 0.0634, + "step": 73090 + }, + { + "epoch": 4.782466470395812, + "grad_norm": 0.9259306192398071, + "learning_rate": 6.915623486783555e-05, + "loss": 0.064, + "step": 73100 + }, + { + "epoch": 4.783120706575073, + "grad_norm": 0.9896383285522461, + "learning_rate": 6.914774951959565e-05, + "loss": 0.0581, + "step": 73110 + }, + { + "epoch": 4.783774942754334, + "grad_norm": 0.8976383209228516, + "learning_rate": 6.913926352506898e-05, + "loss": 0.0583, + "step": 73120 + }, + { + "epoch": 4.784429178933595, + "grad_norm": 0.9846516251564026, + "learning_rate": 6.913077688454198e-05, + "loss": 0.0639, + "step": 73130 + }, + { + "epoch": 4.785083415112855, + "grad_norm": 0.7982523441314697, + "learning_rate": 6.912228959830109e-05, + "loss": 0.062, + "step": 73140 + }, + { + "epoch": 4.785737651292116, + "grad_norm": 0.7241307497024536, + "learning_rate": 6.911380166663278e-05, + "loss": 0.0607, + "step": 73150 + }, + { + "epoch": 4.786391887471377, + "grad_norm": 0.7322345972061157, + "learning_rate": 6.910531308982353e-05, + "loss": 0.0645, + "step": 73160 + }, + { + "epoch": 4.787046123650638, + "grad_norm": 0.8883731961250305, + "learning_rate": 6.909682386815987e-05, + "loss": 0.0642, + "step": 73170 + }, + { + "epoch": 4.787700359829898, + "grad_norm": 0.8451051115989685, + "learning_rate": 6.908833400192829e-05, + "loss": 0.0635, + "step": 73180 + }, + { + "epoch": 4.788354596009159, + "grad_norm": 1.008467435836792, + "learning_rate": 6.90798434914154e-05, + "loss": 0.0643, + "step": 73190 + }, + { + "epoch": 4.78900883218842, + "grad_norm": 0.964712381362915, + "learning_rate": 6.907135233690774e-05, + "loss": 0.0602, + "step": 73200 + }, + { + "epoch": 4.78966306836768, + "grad_norm": 0.9574010968208313, + "learning_rate": 6.906286053869194e-05, + "loss": 0.0586, + "step": 73210 + }, + { + "epoch": 4.790317304546941, + "grad_norm": 0.9692860245704651, + "learning_rate": 6.90543680970546e-05, + "loss": 0.0646, + "step": 73220 + }, + { + "epoch": 4.790971540726202, + "grad_norm": 0.8585093021392822, + "learning_rate": 6.904587501228236e-05, + "loss": 0.0654, + "step": 73230 + }, + { + "epoch": 4.791625776905462, + "grad_norm": 0.9351073503494263, + "learning_rate": 6.903738128466188e-05, + "loss": 0.0553, + "step": 73240 + }, + { + "epoch": 4.792280013084723, + "grad_norm": 0.7714135050773621, + "learning_rate": 6.902888691447986e-05, + "loss": 0.0557, + "step": 73250 + }, + { + "epoch": 4.792934249263984, + "grad_norm": 0.977641224861145, + "learning_rate": 6.9020391902023e-05, + "loss": 0.0522, + "step": 73260 + }, + { + "epoch": 4.793588485443245, + "grad_norm": 1.0779730081558228, + "learning_rate": 6.901189624757803e-05, + "loss": 0.0606, + "step": 73270 + }, + { + "epoch": 4.794242721622505, + "grad_norm": 0.9080124497413635, + "learning_rate": 6.900339995143172e-05, + "loss": 0.0553, + "step": 73280 + }, + { + "epoch": 4.794896957801766, + "grad_norm": 0.8090987801551819, + "learning_rate": 6.899490301387079e-05, + "loss": 0.0565, + "step": 73290 + }, + { + "epoch": 4.795551193981027, + "grad_norm": 1.0443229675292969, + "learning_rate": 6.89864054351821e-05, + "loss": 0.0588, + "step": 73300 + }, + { + "epoch": 4.796205430160288, + "grad_norm": 0.7326402068138123, + "learning_rate": 6.897790721565243e-05, + "loss": 0.0569, + "step": 73310 + }, + { + "epoch": 4.796859666339548, + "grad_norm": 0.8527650237083435, + "learning_rate": 6.89694083555686e-05, + "loss": 0.0687, + "step": 73320 + }, + { + "epoch": 4.797513902518809, + "grad_norm": 1.0699238777160645, + "learning_rate": 6.896090885521749e-05, + "loss": 0.0616, + "step": 73330 + }, + { + "epoch": 4.79816813869807, + "grad_norm": 0.9271957874298096, + "learning_rate": 6.895240871488599e-05, + "loss": 0.062, + "step": 73340 + }, + { + "epoch": 4.79882237487733, + "grad_norm": 0.6596797704696655, + "learning_rate": 6.894390793486098e-05, + "loss": 0.0538, + "step": 73350 + }, + { + "epoch": 4.799476611056591, + "grad_norm": 0.8873812556266785, + "learning_rate": 6.89354065154294e-05, + "loss": 0.0678, + "step": 73360 + }, + { + "epoch": 4.800130847235852, + "grad_norm": 1.0261344909667969, + "learning_rate": 6.89269044568782e-05, + "loss": 0.0636, + "step": 73370 + }, + { + "epoch": 4.8007850834151125, + "grad_norm": 0.9206944108009338, + "learning_rate": 6.891840175949432e-05, + "loss": 0.0535, + "step": 73380 + }, + { + "epoch": 4.801439319594373, + "grad_norm": 0.9742897152900696, + "learning_rate": 6.890989842356479e-05, + "loss": 0.0599, + "step": 73390 + }, + { + "epoch": 4.802093555773634, + "grad_norm": 0.8803852796554565, + "learning_rate": 6.890139444937657e-05, + "loss": 0.066, + "step": 73400 + }, + { + "epoch": 4.802747791952895, + "grad_norm": 1.0795328617095947, + "learning_rate": 6.889288983721673e-05, + "loss": 0.0587, + "step": 73410 + }, + { + "epoch": 4.803402028132155, + "grad_norm": 0.8851546049118042, + "learning_rate": 6.888438458737232e-05, + "loss": 0.0598, + "step": 73420 + }, + { + "epoch": 4.804056264311416, + "grad_norm": 0.9637795686721802, + "learning_rate": 6.887587870013039e-05, + "loss": 0.0534, + "step": 73430 + }, + { + "epoch": 4.804710500490677, + "grad_norm": 1.036117434501648, + "learning_rate": 6.886737217577805e-05, + "loss": 0.0589, + "step": 73440 + }, + { + "epoch": 4.8053647366699375, + "grad_norm": 0.8892022967338562, + "learning_rate": 6.88588650146024e-05, + "loss": 0.058, + "step": 73450 + }, + { + "epoch": 4.806018972849198, + "grad_norm": 0.7642195224761963, + "learning_rate": 6.885035721689062e-05, + "loss": 0.0707, + "step": 73460 + }, + { + "epoch": 4.806673209028459, + "grad_norm": 0.8952467441558838, + "learning_rate": 6.884184878292985e-05, + "loss": 0.0618, + "step": 73470 + }, + { + "epoch": 4.80732744520772, + "grad_norm": 0.7745544910430908, + "learning_rate": 6.883333971300725e-05, + "loss": 0.0601, + "step": 73480 + }, + { + "epoch": 4.8079816813869805, + "grad_norm": 0.8764887452125549, + "learning_rate": 6.882483000741007e-05, + "loss": 0.0614, + "step": 73490 + }, + { + "epoch": 4.808635917566241, + "grad_norm": 0.8772285580635071, + "learning_rate": 6.881631966642549e-05, + "loss": 0.0532, + "step": 73500 + }, + { + "epoch": 4.809290153745502, + "grad_norm": 1.0648410320281982, + "learning_rate": 6.880780869034077e-05, + "loss": 0.0655, + "step": 73510 + }, + { + "epoch": 4.8099443899247625, + "grad_norm": 1.125880479812622, + "learning_rate": 6.879929707944317e-05, + "loss": 0.0671, + "step": 73520 + }, + { + "epoch": 4.8105986261040234, + "grad_norm": 0.867889404296875, + "learning_rate": 6.879078483402002e-05, + "loss": 0.0512, + "step": 73530 + }, + { + "epoch": 4.811252862283284, + "grad_norm": 0.8720964789390564, + "learning_rate": 6.878227195435859e-05, + "loss": 0.0601, + "step": 73540 + }, + { + "epoch": 4.8119070984625445, + "grad_norm": 0.8965845108032227, + "learning_rate": 6.877375844074622e-05, + "loss": 0.0582, + "step": 73550 + }, + { + "epoch": 4.8125613346418055, + "grad_norm": 0.7062650322914124, + "learning_rate": 6.876524429347027e-05, + "loss": 0.0674, + "step": 73560 + }, + { + "epoch": 4.813215570821066, + "grad_norm": 1.128409743309021, + "learning_rate": 6.87567295128181e-05, + "loss": 0.0603, + "step": 73570 + }, + { + "epoch": 4.813869807000327, + "grad_norm": 0.952961802482605, + "learning_rate": 6.874821409907713e-05, + "loss": 0.0563, + "step": 73580 + }, + { + "epoch": 4.8145240431795875, + "grad_norm": 0.8573753833770752, + "learning_rate": 6.873969805253477e-05, + "loss": 0.06, + "step": 73590 + }, + { + "epoch": 4.8151782793588485, + "grad_norm": 0.8490943312644958, + "learning_rate": 6.873118137347844e-05, + "loss": 0.0584, + "step": 73600 + }, + { + "epoch": 4.815832515538109, + "grad_norm": 0.9143913388252258, + "learning_rate": 6.872266406219562e-05, + "loss": 0.0739, + "step": 73610 + }, + { + "epoch": 4.81648675171737, + "grad_norm": 0.973491370677948, + "learning_rate": 6.871414611897379e-05, + "loss": 0.0572, + "step": 73620 + }, + { + "epoch": 4.8171409878966305, + "grad_norm": 0.913781464099884, + "learning_rate": 6.870562754410044e-05, + "loss": 0.0646, + "step": 73630 + }, + { + "epoch": 4.8177952240758914, + "grad_norm": 1.1292165517807007, + "learning_rate": 6.86971083378631e-05, + "loss": 0.0538, + "step": 73640 + }, + { + "epoch": 4.818449460255152, + "grad_norm": 0.815707802772522, + "learning_rate": 6.868858850054933e-05, + "loss": 0.0577, + "step": 73650 + }, + { + "epoch": 4.8191036964344125, + "grad_norm": 1.1895596981048584, + "learning_rate": 6.868006803244669e-05, + "loss": 0.0534, + "step": 73660 + }, + { + "epoch": 4.8197579326136735, + "grad_norm": 0.672910749912262, + "learning_rate": 6.867154693384275e-05, + "loss": 0.053, + "step": 73670 + }, + { + "epoch": 4.820412168792934, + "grad_norm": 1.2378745079040527, + "learning_rate": 6.866302520502515e-05, + "loss": 0.0745, + "step": 73680 + }, + { + "epoch": 4.8210664049721945, + "grad_norm": 1.0225582122802734, + "learning_rate": 6.865450284628148e-05, + "loss": 0.0637, + "step": 73690 + }, + { + "epoch": 4.8217206411514555, + "grad_norm": 0.8776040077209473, + "learning_rate": 6.864597985789944e-05, + "loss": 0.0664, + "step": 73700 + }, + { + "epoch": 4.8223748773307165, + "grad_norm": 0.7907946705818176, + "learning_rate": 6.863745624016666e-05, + "loss": 0.078, + "step": 73710 + }, + { + "epoch": 4.823029113509977, + "grad_norm": 0.7620204091072083, + "learning_rate": 6.862893199337087e-05, + "loss": 0.0517, + "step": 73720 + }, + { + "epoch": 4.8236833496892375, + "grad_norm": 0.7737089991569519, + "learning_rate": 6.862040711779976e-05, + "loss": 0.0583, + "step": 73730 + }, + { + "epoch": 4.8243375858684985, + "grad_norm": 0.7591310739517212, + "learning_rate": 6.861188161374106e-05, + "loss": 0.0602, + "step": 73740 + }, + { + "epoch": 4.8249918220477594, + "grad_norm": 0.8480481505393982, + "learning_rate": 6.860335548148257e-05, + "loss": 0.0603, + "step": 73750 + }, + { + "epoch": 4.82564605822702, + "grad_norm": 0.8201214671134949, + "learning_rate": 6.859482872131203e-05, + "loss": 0.0617, + "step": 73760 + }, + { + "epoch": 4.8263002944062805, + "grad_norm": 1.0084785223007202, + "learning_rate": 6.858630133351726e-05, + "loss": 0.0647, + "step": 73770 + }, + { + "epoch": 4.8269545305855415, + "grad_norm": 0.8590355515480042, + "learning_rate": 6.857777331838607e-05, + "loss": 0.0621, + "step": 73780 + }, + { + "epoch": 4.827608766764802, + "grad_norm": 1.0885320901870728, + "learning_rate": 6.856924467620631e-05, + "loss": 0.0556, + "step": 73790 + }, + { + "epoch": 4.8282630029440625, + "grad_norm": 0.7727965116500854, + "learning_rate": 6.856071540726585e-05, + "loss": 0.0552, + "step": 73800 + }, + { + "epoch": 4.8289172391233235, + "grad_norm": 0.9133251309394836, + "learning_rate": 6.855218551185255e-05, + "loss": 0.0601, + "step": 73810 + }, + { + "epoch": 4.8295714753025845, + "grad_norm": 0.782835841178894, + "learning_rate": 6.854365499025435e-05, + "loss": 0.0563, + "step": 73820 + }, + { + "epoch": 4.8302257114818445, + "grad_norm": 1.1449490785598755, + "learning_rate": 6.853512384275916e-05, + "loss": 0.0622, + "step": 73830 + }, + { + "epoch": 4.8308799476611055, + "grad_norm": 0.7554023265838623, + "learning_rate": 6.852659206965493e-05, + "loss": 0.0579, + "step": 73840 + }, + { + "epoch": 4.8315341838403665, + "grad_norm": 1.0752381086349487, + "learning_rate": 6.851805967122962e-05, + "loss": 0.057, + "step": 73850 + }, + { + "epoch": 4.8321884200196275, + "grad_norm": 0.9013246893882751, + "learning_rate": 6.850952664777124e-05, + "loss": 0.0545, + "step": 73860 + }, + { + "epoch": 4.8328426561988875, + "grad_norm": 0.8916118741035461, + "learning_rate": 6.85009929995678e-05, + "loss": 0.0527, + "step": 73870 + }, + { + "epoch": 4.8334968923781485, + "grad_norm": 0.8229614496231079, + "learning_rate": 6.849245872690731e-05, + "loss": 0.0666, + "step": 73880 + }, + { + "epoch": 4.8341511285574095, + "grad_norm": 0.7096636891365051, + "learning_rate": 6.848392383007784e-05, + "loss": 0.0589, + "step": 73890 + }, + { + "epoch": 4.8348053647366696, + "grad_norm": 0.8488548398017883, + "learning_rate": 6.847538830936746e-05, + "loss": 0.0672, + "step": 73900 + }, + { + "epoch": 4.8354596009159305, + "grad_norm": 0.8226892948150635, + "learning_rate": 6.84668521650643e-05, + "loss": 0.0587, + "step": 73910 + }, + { + "epoch": 4.8361138370951915, + "grad_norm": 0.7988223433494568, + "learning_rate": 6.845831539745643e-05, + "loss": 0.0618, + "step": 73920 + }, + { + "epoch": 4.8367680732744525, + "grad_norm": 0.9851675033569336, + "learning_rate": 6.8449778006832e-05, + "loss": 0.057, + "step": 73930 + }, + { + "epoch": 4.8374223094537125, + "grad_norm": 1.0461887121200562, + "learning_rate": 6.84412399934792e-05, + "loss": 0.0602, + "step": 73940 + }, + { + "epoch": 4.8380765456329735, + "grad_norm": 1.085433006286621, + "learning_rate": 6.843270135768616e-05, + "loss": 0.0609, + "step": 73950 + }, + { + "epoch": 4.8387307818122345, + "grad_norm": 0.9901970028877258, + "learning_rate": 6.842416209974111e-05, + "loss": 0.0498, + "step": 73960 + }, + { + "epoch": 4.839385017991495, + "grad_norm": 0.7033235430717468, + "learning_rate": 6.841562221993228e-05, + "loss": 0.0648, + "step": 73970 + }, + { + "epoch": 4.8400392541707555, + "grad_norm": 0.8223658800125122, + "learning_rate": 6.84070817185479e-05, + "loss": 0.0568, + "step": 73980 + }, + { + "epoch": 4.8406934903500165, + "grad_norm": 0.8278485536575317, + "learning_rate": 6.839854059587624e-05, + "loss": 0.062, + "step": 73990 + }, + { + "epoch": 4.841347726529277, + "grad_norm": 0.8352931141853333, + "learning_rate": 6.838999885220558e-05, + "loss": 0.0645, + "step": 74000 + }, + { + "epoch": 4.8420019627085376, + "grad_norm": 1.1042554378509521, + "learning_rate": 6.838145648782422e-05, + "loss": 0.0612, + "step": 74010 + }, + { + "epoch": 4.8426561988877985, + "grad_norm": 0.8824191689491272, + "learning_rate": 6.837291350302052e-05, + "loss": 0.0611, + "step": 74020 + }, + { + "epoch": 4.8433104350670595, + "grad_norm": 0.9088771939277649, + "learning_rate": 6.836436989808278e-05, + "loss": 0.0644, + "step": 74030 + }, + { + "epoch": 4.84396467124632, + "grad_norm": 0.8344402313232422, + "learning_rate": 6.835582567329942e-05, + "loss": 0.0568, + "step": 74040 + }, + { + "epoch": 4.8446189074255805, + "grad_norm": 0.8632908463478088, + "learning_rate": 6.834728082895878e-05, + "loss": 0.0596, + "step": 74050 + }, + { + "epoch": 4.8452731436048415, + "grad_norm": 0.7749559879302979, + "learning_rate": 6.83387353653493e-05, + "loss": 0.062, + "step": 74060 + }, + { + "epoch": 4.8459273797841025, + "grad_norm": 0.9213570952415466, + "learning_rate": 6.83301892827594e-05, + "loss": 0.0571, + "step": 74070 + }, + { + "epoch": 4.846581615963363, + "grad_norm": 0.7469812035560608, + "learning_rate": 6.832164258147756e-05, + "loss": 0.0528, + "step": 74080 + }, + { + "epoch": 4.8472358521426235, + "grad_norm": 0.6800960898399353, + "learning_rate": 6.831309526179221e-05, + "loss": 0.063, + "step": 74090 + }, + { + "epoch": 4.8478900883218845, + "grad_norm": 0.8719648122787476, + "learning_rate": 6.830454732399188e-05, + "loss": 0.0532, + "step": 74100 + }, + { + "epoch": 4.848544324501145, + "grad_norm": 0.9584552645683289, + "learning_rate": 6.829599876836507e-05, + "loss": 0.0636, + "step": 74110 + }, + { + "epoch": 4.849198560680406, + "grad_norm": 0.9922688603401184, + "learning_rate": 6.828744959520031e-05, + "loss": 0.0601, + "step": 74120 + }, + { + "epoch": 4.8498527968596665, + "grad_norm": 0.8676353693008423, + "learning_rate": 6.82788998047862e-05, + "loss": 0.0577, + "step": 74130 + }, + { + "epoch": 4.850507033038927, + "grad_norm": 0.9001226425170898, + "learning_rate": 6.827034939741125e-05, + "loss": 0.0597, + "step": 74140 + }, + { + "epoch": 4.851161269218188, + "grad_norm": 0.8655483722686768, + "learning_rate": 6.82617983733641e-05, + "loss": 0.0614, + "step": 74150 + }, + { + "epoch": 4.8518155053974485, + "grad_norm": 0.9211030602455139, + "learning_rate": 6.825324673293336e-05, + "loss": 0.0545, + "step": 74160 + }, + { + "epoch": 4.8524697415767095, + "grad_norm": 0.8237171173095703, + "learning_rate": 6.824469447640766e-05, + "loss": 0.0544, + "step": 74170 + }, + { + "epoch": 4.85312397775597, + "grad_norm": 1.0644043684005737, + "learning_rate": 6.82361416040757e-05, + "loss": 0.0702, + "step": 74180 + }, + { + "epoch": 4.853778213935231, + "grad_norm": 0.8750390410423279, + "learning_rate": 6.82275881162261e-05, + "loss": 0.0582, + "step": 74190 + }, + { + "epoch": 4.8544324501144915, + "grad_norm": 0.9315994381904602, + "learning_rate": 6.821903401314764e-05, + "loss": 0.0555, + "step": 74200 + }, + { + "epoch": 4.8550866862937525, + "grad_norm": 1.0597717761993408, + "learning_rate": 6.821047929512898e-05, + "loss": 0.0574, + "step": 74210 + }, + { + "epoch": 4.855740922473013, + "grad_norm": 0.8364295959472656, + "learning_rate": 6.820192396245886e-05, + "loss": 0.0573, + "step": 74220 + }, + { + "epoch": 4.856395158652274, + "grad_norm": 0.7751799821853638, + "learning_rate": 6.81933680154261e-05, + "loss": 0.0636, + "step": 74230 + }, + { + "epoch": 4.8570493948315345, + "grad_norm": 0.9654813408851624, + "learning_rate": 6.818481145431947e-05, + "loss": 0.0612, + "step": 74240 + }, + { + "epoch": 4.857703631010795, + "grad_norm": 0.922402024269104, + "learning_rate": 6.817625427942773e-05, + "loss": 0.058, + "step": 74250 + }, + { + "epoch": 4.858357867190056, + "grad_norm": 0.663686990737915, + "learning_rate": 6.816769649103974e-05, + "loss": 0.0621, + "step": 74260 + }, + { + "epoch": 4.8590121033693165, + "grad_norm": 1.038624882698059, + "learning_rate": 6.815913808944436e-05, + "loss": 0.0532, + "step": 74270 + }, + { + "epoch": 4.859666339548577, + "grad_norm": 0.7460023760795593, + "learning_rate": 6.815057907493045e-05, + "loss": 0.066, + "step": 74280 + }, + { + "epoch": 4.860320575727838, + "grad_norm": 0.8232451677322388, + "learning_rate": 6.814201944778689e-05, + "loss": 0.0635, + "step": 74290 + }, + { + "epoch": 4.860974811907099, + "grad_norm": 0.7617900967597961, + "learning_rate": 6.81334592083026e-05, + "loss": 0.0578, + "step": 74300 + }, + { + "epoch": 4.8616290480863595, + "grad_norm": 0.8007754683494568, + "learning_rate": 6.81248983567665e-05, + "loss": 0.0553, + "step": 74310 + }, + { + "epoch": 4.86228328426562, + "grad_norm": 0.8569623231887817, + "learning_rate": 6.811633689346752e-05, + "loss": 0.0563, + "step": 74320 + }, + { + "epoch": 4.862937520444881, + "grad_norm": 0.979371964931488, + "learning_rate": 6.810777481869471e-05, + "loss": 0.059, + "step": 74330 + }, + { + "epoch": 4.863591756624142, + "grad_norm": 0.7974584102630615, + "learning_rate": 6.809921213273697e-05, + "loss": 0.0644, + "step": 74340 + }, + { + "epoch": 4.864245992803402, + "grad_norm": 0.9594705104827881, + "learning_rate": 6.809064883588336e-05, + "loss": 0.0618, + "step": 74350 + }, + { + "epoch": 4.864900228982663, + "grad_norm": 0.8043680191040039, + "learning_rate": 6.808208492842291e-05, + "loss": 0.0581, + "step": 74360 + }, + { + "epoch": 4.865554465161924, + "grad_norm": 0.8218439221382141, + "learning_rate": 6.807352041064467e-05, + "loss": 0.0607, + "step": 74370 + }, + { + "epoch": 4.8662087013411846, + "grad_norm": 1.0335355997085571, + "learning_rate": 6.806495528283771e-05, + "loss": 0.0679, + "step": 74380 + }, + { + "epoch": 4.866862937520445, + "grad_norm": 1.074435830116272, + "learning_rate": 6.805638954529117e-05, + "loss": 0.0628, + "step": 74390 + }, + { + "epoch": 4.867517173699706, + "grad_norm": 1.0719654560089111, + "learning_rate": 6.80478231982941e-05, + "loss": 0.0632, + "step": 74400 + }, + { + "epoch": 4.868171409878967, + "grad_norm": 0.9550548791885376, + "learning_rate": 6.803925624213565e-05, + "loss": 0.0554, + "step": 74410 + }, + { + "epoch": 4.868825646058227, + "grad_norm": 0.9627480506896973, + "learning_rate": 6.803068867710503e-05, + "loss": 0.0676, + "step": 74420 + }, + { + "epoch": 4.869479882237488, + "grad_norm": 1.0203245878219604, + "learning_rate": 6.802212050349135e-05, + "loss": 0.0535, + "step": 74430 + }, + { + "epoch": 4.870134118416749, + "grad_norm": 0.996152937412262, + "learning_rate": 6.801355172158385e-05, + "loss": 0.0572, + "step": 74440 + }, + { + "epoch": 4.870788354596009, + "grad_norm": 0.8001197576522827, + "learning_rate": 6.800498233167172e-05, + "loss": 0.06, + "step": 74450 + }, + { + "epoch": 4.87144259077527, + "grad_norm": 1.0456775426864624, + "learning_rate": 6.799641233404423e-05, + "loss": 0.0618, + "step": 74460 + }, + { + "epoch": 4.872096826954531, + "grad_norm": 0.9173516631126404, + "learning_rate": 6.798784172899064e-05, + "loss": 0.0584, + "step": 74470 + }, + { + "epoch": 4.872751063133792, + "grad_norm": 1.2566989660263062, + "learning_rate": 6.79792705168002e-05, + "loss": 0.0665, + "step": 74480 + }, + { + "epoch": 4.873405299313052, + "grad_norm": 0.9484615921974182, + "learning_rate": 6.797069869776222e-05, + "loss": 0.0643, + "step": 74490 + }, + { + "epoch": 4.874059535492313, + "grad_norm": 0.8985061049461365, + "learning_rate": 6.796212627216605e-05, + "loss": 0.0664, + "step": 74500 + }, + { + "epoch": 4.874713771671574, + "grad_norm": 0.8476536273956299, + "learning_rate": 6.795355324030099e-05, + "loss": 0.0543, + "step": 74510 + }, + { + "epoch": 4.875368007850835, + "grad_norm": 1.0744060277938843, + "learning_rate": 6.794497960245644e-05, + "loss": 0.0656, + "step": 74520 + }, + { + "epoch": 4.876022244030095, + "grad_norm": 1.001219391822815, + "learning_rate": 6.793640535892176e-05, + "loss": 0.063, + "step": 74530 + }, + { + "epoch": 4.876676480209356, + "grad_norm": 1.04497492313385, + "learning_rate": 6.792783050998637e-05, + "loss": 0.0629, + "step": 74540 + }, + { + "epoch": 4.877330716388617, + "grad_norm": 1.1128970384597778, + "learning_rate": 6.791925505593965e-05, + "loss": 0.0598, + "step": 74550 + }, + { + "epoch": 4.877984952567877, + "grad_norm": 0.9689093828201294, + "learning_rate": 6.791067899707113e-05, + "loss": 0.0577, + "step": 74560 + }, + { + "epoch": 4.878639188747138, + "grad_norm": 1.0436021089553833, + "learning_rate": 6.790210233367019e-05, + "loss": 0.0557, + "step": 74570 + }, + { + "epoch": 4.879293424926399, + "grad_norm": 0.6721805334091187, + "learning_rate": 6.789352506602632e-05, + "loss": 0.0665, + "step": 74580 + }, + { + "epoch": 4.879947661105659, + "grad_norm": 0.8921915888786316, + "learning_rate": 6.78849471944291e-05, + "loss": 0.0541, + "step": 74590 + }, + { + "epoch": 4.88060189728492, + "grad_norm": 0.7612684965133667, + "learning_rate": 6.787636871916798e-05, + "loss": 0.0611, + "step": 74600 + }, + { + "epoch": 4.881256133464181, + "grad_norm": 0.7255061268806458, + "learning_rate": 6.786778964053253e-05, + "loss": 0.068, + "step": 74610 + }, + { + "epoch": 4.881910369643442, + "grad_norm": 0.7865713834762573, + "learning_rate": 6.785920995881234e-05, + "loss": 0.0665, + "step": 74620 + }, + { + "epoch": 4.882564605822702, + "grad_norm": 0.8941336274147034, + "learning_rate": 6.785062967429697e-05, + "loss": 0.0531, + "step": 74630 + }, + { + "epoch": 4.883218842001963, + "grad_norm": 0.8453198671340942, + "learning_rate": 6.784204878727601e-05, + "loss": 0.0552, + "step": 74640 + }, + { + "epoch": 4.883873078181224, + "grad_norm": 0.9211252927780151, + "learning_rate": 6.783346729803913e-05, + "loss": 0.064, + "step": 74650 + }, + { + "epoch": 4.884527314360485, + "grad_norm": 0.7787407040596008, + "learning_rate": 6.782488520687596e-05, + "loss": 0.064, + "step": 74660 + }, + { + "epoch": 4.885181550539745, + "grad_norm": 0.8281296491622925, + "learning_rate": 6.781630251407617e-05, + "loss": 0.0531, + "step": 74670 + }, + { + "epoch": 4.885835786719006, + "grad_norm": 0.9956787824630737, + "learning_rate": 6.780771921992945e-05, + "loss": 0.0556, + "step": 74680 + }, + { + "epoch": 4.886490022898267, + "grad_norm": 1.0824602842330933, + "learning_rate": 6.779913532472548e-05, + "loss": 0.0639, + "step": 74690 + }, + { + "epoch": 4.887144259077527, + "grad_norm": 0.9609588980674744, + "learning_rate": 6.779055082875403e-05, + "loss": 0.0633, + "step": 74700 + }, + { + "epoch": 4.887798495256788, + "grad_norm": 0.9233350157737732, + "learning_rate": 6.778196573230481e-05, + "loss": 0.0671, + "step": 74710 + }, + { + "epoch": 4.888452731436049, + "grad_norm": 0.7840463519096375, + "learning_rate": 6.777338003566765e-05, + "loss": 0.0555, + "step": 74720 + }, + { + "epoch": 4.889106967615309, + "grad_norm": 0.8873052000999451, + "learning_rate": 6.776479373913228e-05, + "loss": 0.0642, + "step": 74730 + }, + { + "epoch": 4.88976120379457, + "grad_norm": 1.742559790611267, + "learning_rate": 6.775620684298853e-05, + "loss": 0.0636, + "step": 74740 + }, + { + "epoch": 4.890415439973831, + "grad_norm": 1.021909475326538, + "learning_rate": 6.774761934752624e-05, + "loss": 0.0568, + "step": 74750 + }, + { + "epoch": 4.891069676153092, + "grad_norm": 0.7874606847763062, + "learning_rate": 6.773903125303524e-05, + "loss": 0.0537, + "step": 74760 + }, + { + "epoch": 4.891723912332352, + "grad_norm": 0.8984439969062805, + "learning_rate": 6.773044255980543e-05, + "loss": 0.071, + "step": 74770 + }, + { + "epoch": 4.892378148511613, + "grad_norm": 0.8923892378807068, + "learning_rate": 6.772185326812668e-05, + "loss": 0.0552, + "step": 74780 + }, + { + "epoch": 4.893032384690874, + "grad_norm": 1.0217084884643555, + "learning_rate": 6.77132633782889e-05, + "loss": 0.0557, + "step": 74790 + }, + { + "epoch": 4.893686620870134, + "grad_norm": 0.9023916721343994, + "learning_rate": 6.770467289058203e-05, + "loss": 0.065, + "step": 74800 + }, + { + "epoch": 4.894340857049395, + "grad_norm": 1.1047594547271729, + "learning_rate": 6.7696081805296e-05, + "loss": 0.0521, + "step": 74810 + }, + { + "epoch": 4.894995093228656, + "grad_norm": 1.103213906288147, + "learning_rate": 6.768749012272081e-05, + "loss": 0.0691, + "step": 74820 + }, + { + "epoch": 4.895649329407917, + "grad_norm": 0.9325263500213623, + "learning_rate": 6.767889784314645e-05, + "loss": 0.0625, + "step": 74830 + }, + { + "epoch": 4.896303565587177, + "grad_norm": 1.620137333869934, + "learning_rate": 6.76703049668629e-05, + "loss": 0.0641, + "step": 74840 + }, + { + "epoch": 4.896957801766438, + "grad_norm": 1.0535722970962524, + "learning_rate": 6.766171149416023e-05, + "loss": 0.052, + "step": 74850 + }, + { + "epoch": 4.897612037945699, + "grad_norm": 0.9076412320137024, + "learning_rate": 6.765311742532849e-05, + "loss": 0.0637, + "step": 74860 + }, + { + "epoch": 4.898266274124959, + "grad_norm": 1.7605758905410767, + "learning_rate": 6.764452276065774e-05, + "loss": 0.0574, + "step": 74870 + }, + { + "epoch": 4.89892051030422, + "grad_norm": 0.7411386966705322, + "learning_rate": 6.763592750043805e-05, + "loss": 0.0568, + "step": 74880 + }, + { + "epoch": 4.899574746483481, + "grad_norm": 0.8387102484703064, + "learning_rate": 6.762733164495956e-05, + "loss": 0.0567, + "step": 74890 + }, + { + "epoch": 4.900228982662741, + "grad_norm": 0.9322872161865234, + "learning_rate": 6.761873519451241e-05, + "loss": 0.0554, + "step": 74900 + }, + { + "epoch": 4.900883218842002, + "grad_norm": 0.9480436444282532, + "learning_rate": 6.761013814938673e-05, + "loss": 0.0575, + "step": 74910 + }, + { + "epoch": 4.901537455021263, + "grad_norm": 0.8049683570861816, + "learning_rate": 6.760154050987272e-05, + "loss": 0.0543, + "step": 74920 + }, + { + "epoch": 4.902191691200524, + "grad_norm": 1.016456961631775, + "learning_rate": 6.759294227626054e-05, + "loss": 0.0538, + "step": 74930 + }, + { + "epoch": 4.902845927379784, + "grad_norm": 0.9667950868606567, + "learning_rate": 6.758434344884042e-05, + "loss": 0.0569, + "step": 74940 + }, + { + "epoch": 4.903500163559045, + "grad_norm": 0.902862548828125, + "learning_rate": 6.75757440279026e-05, + "loss": 0.0586, + "step": 74950 + }, + { + "epoch": 4.904154399738306, + "grad_norm": 0.7040520906448364, + "learning_rate": 6.756714401373732e-05, + "loss": 0.0593, + "step": 74960 + }, + { + "epoch": 4.904808635917567, + "grad_norm": 0.824975848197937, + "learning_rate": 6.755854340663484e-05, + "loss": 0.0565, + "step": 74970 + }, + { + "epoch": 4.905462872096827, + "grad_norm": 0.8577737212181091, + "learning_rate": 6.754994220688551e-05, + "loss": 0.0581, + "step": 74980 + }, + { + "epoch": 4.906117108276088, + "grad_norm": 0.8069767951965332, + "learning_rate": 6.754134041477957e-05, + "loss": 0.0481, + "step": 74990 + }, + { + "epoch": 4.906771344455349, + "grad_norm": 0.7826264500617981, + "learning_rate": 6.75327380306074e-05, + "loss": 0.0642, + "step": 75000 + }, + { + "epoch": 4.907425580634609, + "grad_norm": 0.895269513130188, + "learning_rate": 6.752413505465935e-05, + "loss": 0.0549, + "step": 75010 + }, + { + "epoch": 4.90807981681387, + "grad_norm": 0.6955884099006653, + "learning_rate": 6.751553148722576e-05, + "loss": 0.0546, + "step": 75020 + }, + { + "epoch": 4.908734052993131, + "grad_norm": 0.8068095445632935, + "learning_rate": 6.750692732859706e-05, + "loss": 0.0569, + "step": 75030 + }, + { + "epoch": 4.909388289172391, + "grad_norm": 0.6808393001556396, + "learning_rate": 6.749832257906365e-05, + "loss": 0.0557, + "step": 75040 + }, + { + "epoch": 4.910042525351652, + "grad_norm": 0.9451243281364441, + "learning_rate": 6.748971723891597e-05, + "loss": 0.0634, + "step": 75050 + }, + { + "epoch": 4.910696761530913, + "grad_norm": 0.7461559772491455, + "learning_rate": 6.748111130844445e-05, + "loss": 0.062, + "step": 75060 + }, + { + "epoch": 4.911350997710174, + "grad_norm": 0.8373140096664429, + "learning_rate": 6.747250478793959e-05, + "loss": 0.0538, + "step": 75070 + }, + { + "epoch": 4.912005233889434, + "grad_norm": 0.9267625212669373, + "learning_rate": 6.746389767769185e-05, + "loss": 0.0657, + "step": 75080 + }, + { + "epoch": 4.912659470068695, + "grad_norm": 0.8543652892112732, + "learning_rate": 6.745528997799178e-05, + "loss": 0.0612, + "step": 75090 + }, + { + "epoch": 4.913313706247956, + "grad_norm": 0.9111425280570984, + "learning_rate": 6.744668168912989e-05, + "loss": 0.0604, + "step": 75100 + }, + { + "epoch": 4.913967942427217, + "grad_norm": 1.040820598602295, + "learning_rate": 6.743807281139675e-05, + "loss": 0.0614, + "step": 75110 + }, + { + "epoch": 4.914622178606477, + "grad_norm": 0.8948336839675903, + "learning_rate": 6.74294633450829e-05, + "loss": 0.0527, + "step": 75120 + }, + { + "epoch": 4.915276414785738, + "grad_norm": 0.923996090888977, + "learning_rate": 6.742085329047895e-05, + "loss": 0.0636, + "step": 75130 + }, + { + "epoch": 4.915930650964999, + "grad_norm": 0.8746695518493652, + "learning_rate": 6.741224264787553e-05, + "loss": 0.0587, + "step": 75140 + }, + { + "epoch": 4.916584887144259, + "grad_norm": 0.7787982821464539, + "learning_rate": 6.740363141756325e-05, + "loss": 0.0542, + "step": 75150 + }, + { + "epoch": 4.91723912332352, + "grad_norm": 0.9488137364387512, + "learning_rate": 6.739501959983277e-05, + "loss": 0.0602, + "step": 75160 + }, + { + "epoch": 4.917893359502781, + "grad_norm": 0.7842972874641418, + "learning_rate": 6.738640719497475e-05, + "loss": 0.0538, + "step": 75170 + }, + { + "epoch": 4.918547595682041, + "grad_norm": 0.9717864394187927, + "learning_rate": 6.73777942032799e-05, + "loss": 0.0614, + "step": 75180 + }, + { + "epoch": 4.919201831861302, + "grad_norm": 0.7789155840873718, + "learning_rate": 6.736918062503889e-05, + "loss": 0.068, + "step": 75190 + }, + { + "epoch": 4.919856068040563, + "grad_norm": 0.8811984062194824, + "learning_rate": 6.736056646054251e-05, + "loss": 0.0531, + "step": 75200 + }, + { + "epoch": 4.920510304219824, + "grad_norm": 0.9033105969429016, + "learning_rate": 6.73519517100815e-05, + "loss": 0.0591, + "step": 75210 + }, + { + "epoch": 4.921164540399084, + "grad_norm": 0.7949878573417664, + "learning_rate": 6.734333637394657e-05, + "loss": 0.0538, + "step": 75220 + }, + { + "epoch": 4.921818776578345, + "grad_norm": 0.9282236099243164, + "learning_rate": 6.73347204524286e-05, + "loss": 0.0672, + "step": 75230 + }, + { + "epoch": 4.922473012757606, + "grad_norm": 0.8523067831993103, + "learning_rate": 6.732610394581831e-05, + "loss": 0.0512, + "step": 75240 + }, + { + "epoch": 4.923127248936867, + "grad_norm": 0.9058271646499634, + "learning_rate": 6.731748685440658e-05, + "loss": 0.064, + "step": 75250 + }, + { + "epoch": 4.923781485116127, + "grad_norm": 0.7754740715026855, + "learning_rate": 6.730886917848426e-05, + "loss": 0.06, + "step": 75260 + }, + { + "epoch": 4.924435721295388, + "grad_norm": 0.9523494839668274, + "learning_rate": 6.730025091834223e-05, + "loss": 0.062, + "step": 75270 + }, + { + "epoch": 4.925089957474649, + "grad_norm": 0.825276255607605, + "learning_rate": 6.729163207427134e-05, + "loss": 0.0568, + "step": 75280 + }, + { + "epoch": 4.925744193653909, + "grad_norm": 1.0787001848220825, + "learning_rate": 6.728301264656251e-05, + "loss": 0.0591, + "step": 75290 + }, + { + "epoch": 4.92639842983317, + "grad_norm": 0.859107494354248, + "learning_rate": 6.727439263550669e-05, + "loss": 0.0568, + "step": 75300 + }, + { + "epoch": 4.927052666012431, + "grad_norm": 0.897099494934082, + "learning_rate": 6.726577204139482e-05, + "loss": 0.063, + "step": 75310 + }, + { + "epoch": 4.927706902191691, + "grad_norm": 0.7519966959953308, + "learning_rate": 6.725715086451784e-05, + "loss": 0.0582, + "step": 75320 + }, + { + "epoch": 4.928361138370952, + "grad_norm": 0.7944568991661072, + "learning_rate": 6.724852910516677e-05, + "loss": 0.0605, + "step": 75330 + }, + { + "epoch": 4.929015374550213, + "grad_norm": 1.1941640377044678, + "learning_rate": 6.723990676363262e-05, + "loss": 0.065, + "step": 75340 + }, + { + "epoch": 4.929669610729473, + "grad_norm": 0.7790656089782715, + "learning_rate": 6.723128384020638e-05, + "loss": 0.0591, + "step": 75350 + }, + { + "epoch": 4.930323846908734, + "grad_norm": 0.9245325326919556, + "learning_rate": 6.722266033517913e-05, + "loss": 0.0511, + "step": 75360 + }, + { + "epoch": 4.930978083087995, + "grad_norm": 0.9800746440887451, + "learning_rate": 6.721403624884194e-05, + "loss": 0.0634, + "step": 75370 + }, + { + "epoch": 4.931632319267256, + "grad_norm": 0.7326732277870178, + "learning_rate": 6.720541158148587e-05, + "loss": 0.0555, + "step": 75380 + }, + { + "epoch": 4.932286555446516, + "grad_norm": 0.8972048759460449, + "learning_rate": 6.719678633340202e-05, + "loss": 0.0515, + "step": 75390 + }, + { + "epoch": 4.932940791625777, + "grad_norm": 0.9582744240760803, + "learning_rate": 6.718816050488157e-05, + "loss": 0.0626, + "step": 75400 + }, + { + "epoch": 4.933595027805038, + "grad_norm": 0.7547270655632019, + "learning_rate": 6.717953409621559e-05, + "loss": 0.0576, + "step": 75410 + }, + { + "epoch": 4.934249263984299, + "grad_norm": 1.1005964279174805, + "learning_rate": 6.71709071076953e-05, + "loss": 0.0601, + "step": 75420 + }, + { + "epoch": 4.934903500163559, + "grad_norm": 0.8800162672996521, + "learning_rate": 6.716227953961185e-05, + "loss": 0.0592, + "step": 75430 + }, + { + "epoch": 4.93555773634282, + "grad_norm": 0.7670087814331055, + "learning_rate": 6.715365139225647e-05, + "loss": 0.06, + "step": 75440 + }, + { + "epoch": 4.936211972522081, + "grad_norm": 0.9640793204307556, + "learning_rate": 6.714502266592034e-05, + "loss": 0.0634, + "step": 75450 + }, + { + "epoch": 4.936866208701341, + "grad_norm": 0.8268434405326843, + "learning_rate": 6.713639336089476e-05, + "loss": 0.0591, + "step": 75460 + }, + { + "epoch": 4.937520444880602, + "grad_norm": 1.175800085067749, + "learning_rate": 6.712776347747096e-05, + "loss": 0.0675, + "step": 75470 + }, + { + "epoch": 4.938174681059863, + "grad_norm": 0.8210011720657349, + "learning_rate": 6.71191330159402e-05, + "loss": 0.0622, + "step": 75480 + }, + { + "epoch": 4.938828917239123, + "grad_norm": 0.8271769881248474, + "learning_rate": 6.711050197659384e-05, + "loss": 0.0598, + "step": 75490 + }, + { + "epoch": 4.939483153418384, + "grad_norm": 0.8794811964035034, + "learning_rate": 6.710187035972314e-05, + "loss": 0.0586, + "step": 75500 + }, + { + "epoch": 4.940137389597645, + "grad_norm": 0.8313082456588745, + "learning_rate": 6.709323816561946e-05, + "loss": 0.061, + "step": 75510 + }, + { + "epoch": 4.940791625776906, + "grad_norm": 0.9724137783050537, + "learning_rate": 6.708460539457418e-05, + "loss": 0.0639, + "step": 75520 + }, + { + "epoch": 4.941445861956166, + "grad_norm": 0.9716455936431885, + "learning_rate": 6.707597204687865e-05, + "loss": 0.0561, + "step": 75530 + }, + { + "epoch": 4.942100098135427, + "grad_norm": 0.6842145323753357, + "learning_rate": 6.706733812282428e-05, + "loss": 0.0518, + "step": 75540 + }, + { + "epoch": 4.942754334314688, + "grad_norm": 0.767361581325531, + "learning_rate": 6.705870362270248e-05, + "loss": 0.0609, + "step": 75550 + }, + { + "epoch": 4.943408570493949, + "grad_norm": 0.9804648756980896, + "learning_rate": 6.705006854680471e-05, + "loss": 0.066, + "step": 75560 + }, + { + "epoch": 4.944062806673209, + "grad_norm": 0.6798174977302551, + "learning_rate": 6.704143289542241e-05, + "loss": 0.0534, + "step": 75570 + }, + { + "epoch": 4.94471704285247, + "grad_norm": 0.8720900416374207, + "learning_rate": 6.703279666884705e-05, + "loss": 0.0508, + "step": 75580 + }, + { + "epoch": 4.945371279031731, + "grad_norm": 1.0530959367752075, + "learning_rate": 6.702415986737014e-05, + "loss": 0.0507, + "step": 75590 + }, + { + "epoch": 4.946025515210991, + "grad_norm": 0.9512234330177307, + "learning_rate": 6.701552249128318e-05, + "loss": 0.0578, + "step": 75600 + }, + { + "epoch": 4.946679751390252, + "grad_norm": 0.7706218361854553, + "learning_rate": 6.70068845408777e-05, + "loss": 0.0596, + "step": 75610 + }, + { + "epoch": 4.947333987569513, + "grad_norm": 0.896196722984314, + "learning_rate": 6.69982460164453e-05, + "loss": 0.0652, + "step": 75620 + }, + { + "epoch": 4.947988223748773, + "grad_norm": 0.8798102140426636, + "learning_rate": 6.69896069182775e-05, + "loss": 0.0601, + "step": 75630 + }, + { + "epoch": 4.948642459928034, + "grad_norm": 0.9904597997665405, + "learning_rate": 6.69809672466659e-05, + "loss": 0.0522, + "step": 75640 + }, + { + "epoch": 4.949296696107295, + "grad_norm": 0.8591594696044922, + "learning_rate": 6.697232700190213e-05, + "loss": 0.0663, + "step": 75650 + }, + { + "epoch": 4.949950932286556, + "grad_norm": 0.8061467409133911, + "learning_rate": 6.696368618427779e-05, + "loss": 0.0626, + "step": 75660 + }, + { + "epoch": 4.950605168465816, + "grad_norm": 0.7854329943656921, + "learning_rate": 6.695504479408458e-05, + "loss": 0.0545, + "step": 75670 + }, + { + "epoch": 4.951259404645077, + "grad_norm": 0.7829649448394775, + "learning_rate": 6.694640283161413e-05, + "loss": 0.0505, + "step": 75680 + }, + { + "epoch": 4.951913640824338, + "grad_norm": 0.6959307193756104, + "learning_rate": 6.693776029715814e-05, + "loss": 0.0623, + "step": 75690 + }, + { + "epoch": 4.952567877003599, + "grad_norm": 0.7448030710220337, + "learning_rate": 6.692911719100833e-05, + "loss": 0.0537, + "step": 75700 + }, + { + "epoch": 4.953222113182859, + "grad_norm": 0.686690628528595, + "learning_rate": 6.692047351345641e-05, + "loss": 0.0572, + "step": 75710 + }, + { + "epoch": 4.95387634936212, + "grad_norm": 1.1191184520721436, + "learning_rate": 6.691182926479413e-05, + "loss": 0.0699, + "step": 75720 + }, + { + "epoch": 4.954530585541381, + "grad_norm": 0.7493934631347656, + "learning_rate": 6.690318444531328e-05, + "loss": 0.0585, + "step": 75730 + }, + { + "epoch": 4.955184821720641, + "grad_norm": 1.0006502866744995, + "learning_rate": 6.689453905530559e-05, + "loss": 0.0562, + "step": 75740 + }, + { + "epoch": 4.955839057899902, + "grad_norm": 0.9580645561218262, + "learning_rate": 6.688589309506292e-05, + "loss": 0.0582, + "step": 75750 + }, + { + "epoch": 4.956493294079163, + "grad_norm": 1.0518616437911987, + "learning_rate": 6.687724656487707e-05, + "loss": 0.0699, + "step": 75760 + }, + { + "epoch": 4.957147530258423, + "grad_norm": 1.082767128944397, + "learning_rate": 6.686859946503989e-05, + "loss": 0.055, + "step": 75770 + }, + { + "epoch": 4.957801766437684, + "grad_norm": 0.933538556098938, + "learning_rate": 6.685995179584324e-05, + "loss": 0.0637, + "step": 75780 + }, + { + "epoch": 4.958456002616945, + "grad_norm": 0.955034077167511, + "learning_rate": 6.685130355757899e-05, + "loss": 0.0596, + "step": 75790 + }, + { + "epoch": 4.959110238796205, + "grad_norm": 0.9905108213424683, + "learning_rate": 6.684265475053905e-05, + "loss": 0.068, + "step": 75800 + }, + { + "epoch": 4.959764474975466, + "grad_norm": 0.7588913440704346, + "learning_rate": 6.683400537501534e-05, + "loss": 0.054, + "step": 75810 + }, + { + "epoch": 4.960418711154727, + "grad_norm": 0.8682058453559875, + "learning_rate": 6.68253554312998e-05, + "loss": 0.0569, + "step": 75820 + }, + { + "epoch": 4.961072947333988, + "grad_norm": 0.8857309818267822, + "learning_rate": 6.68167049196844e-05, + "loss": 0.0706, + "step": 75830 + }, + { + "epoch": 4.961727183513248, + "grad_norm": 0.8646987676620483, + "learning_rate": 6.680805384046109e-05, + "loss": 0.0552, + "step": 75840 + }, + { + "epoch": 4.962381419692509, + "grad_norm": 0.9026349782943726, + "learning_rate": 6.67994021939219e-05, + "loss": 0.0506, + "step": 75850 + }, + { + "epoch": 4.96303565587177, + "grad_norm": 0.9483004808425903, + "learning_rate": 6.679074998035881e-05, + "loss": 0.0557, + "step": 75860 + }, + { + "epoch": 4.963689892051031, + "grad_norm": 0.9569962024688721, + "learning_rate": 6.67820972000639e-05, + "loss": 0.0614, + "step": 75870 + }, + { + "epoch": 4.964344128230291, + "grad_norm": 0.9677038192749023, + "learning_rate": 6.677344385332918e-05, + "loss": 0.063, + "step": 75880 + }, + { + "epoch": 4.964998364409552, + "grad_norm": 0.8951756358146667, + "learning_rate": 6.676478994044673e-05, + "loss": 0.0585, + "step": 75890 + }, + { + "epoch": 4.965652600588813, + "grad_norm": 0.9659004211425781, + "learning_rate": 6.675613546170866e-05, + "loss": 0.0542, + "step": 75900 + }, + { + "epoch": 4.966306836768073, + "grad_norm": 0.7111480236053467, + "learning_rate": 6.674748041740707e-05, + "loss": 0.0635, + "step": 75910 + }, + { + "epoch": 4.966961072947334, + "grad_norm": 0.9708228707313538, + "learning_rate": 6.673882480783412e-05, + "loss": 0.0607, + "step": 75920 + }, + { + "epoch": 4.967615309126595, + "grad_norm": 0.7749221324920654, + "learning_rate": 6.673016863328189e-05, + "loss": 0.0577, + "step": 75930 + }, + { + "epoch": 4.968269545305855, + "grad_norm": 0.9135736227035522, + "learning_rate": 6.672151189404262e-05, + "loss": 0.0608, + "step": 75940 + }, + { + "epoch": 4.968923781485116, + "grad_norm": 0.9059275388717651, + "learning_rate": 6.671285459040847e-05, + "loss": 0.0552, + "step": 75950 + }, + { + "epoch": 4.969578017664377, + "grad_norm": 0.874383270740509, + "learning_rate": 6.670419672267163e-05, + "loss": 0.066, + "step": 75960 + }, + { + "epoch": 4.970232253843638, + "grad_norm": 0.8657017946243286, + "learning_rate": 6.669553829112435e-05, + "loss": 0.0529, + "step": 75970 + }, + { + "epoch": 4.970886490022898, + "grad_norm": 0.919597864151001, + "learning_rate": 6.668687929605889e-05, + "loss": 0.0594, + "step": 75980 + }, + { + "epoch": 4.971540726202159, + "grad_norm": 0.8647698163986206, + "learning_rate": 6.667821973776747e-05, + "loss": 0.0531, + "step": 75990 + }, + { + "epoch": 4.97219496238142, + "grad_norm": 0.6683645248413086, + "learning_rate": 6.666955961654238e-05, + "loss": 0.058, + "step": 76000 + }, + { + "epoch": 4.972849198560681, + "grad_norm": 0.7765533924102783, + "learning_rate": 6.666089893267595e-05, + "loss": 0.0542, + "step": 76010 + }, + { + "epoch": 4.973503434739941, + "grad_norm": 0.8284790515899658, + "learning_rate": 6.665223768646049e-05, + "loss": 0.0565, + "step": 76020 + }, + { + "epoch": 4.974157670919202, + "grad_norm": 0.869004487991333, + "learning_rate": 6.664357587818832e-05, + "loss": 0.0616, + "step": 76030 + }, + { + "epoch": 4.974811907098463, + "grad_norm": 0.99537593126297, + "learning_rate": 6.663491350815184e-05, + "loss": 0.0506, + "step": 76040 + }, + { + "epoch": 4.975466143277723, + "grad_norm": 0.8469506502151489, + "learning_rate": 6.66262505766434e-05, + "loss": 0.0676, + "step": 76050 + }, + { + "epoch": 4.976120379456984, + "grad_norm": 0.9716391563415527, + "learning_rate": 6.661758708395537e-05, + "loss": 0.0547, + "step": 76060 + }, + { + "epoch": 4.976774615636245, + "grad_norm": 1.0621095895767212, + "learning_rate": 6.660892303038022e-05, + "loss": 0.0573, + "step": 76070 + }, + { + "epoch": 4.977428851815505, + "grad_norm": 0.9522149562835693, + "learning_rate": 6.660025841621035e-05, + "loss": 0.056, + "step": 76080 + }, + { + "epoch": 4.978083087994766, + "grad_norm": 1.1043899059295654, + "learning_rate": 6.659159324173823e-05, + "loss": 0.0607, + "step": 76090 + }, + { + "epoch": 4.978737324174027, + "grad_norm": 0.8276578783988953, + "learning_rate": 6.658292750725632e-05, + "loss": 0.0537, + "step": 76100 + }, + { + "epoch": 4.979391560353288, + "grad_norm": 0.8061304688453674, + "learning_rate": 6.657426121305711e-05, + "loss": 0.0655, + "step": 76110 + }, + { + "epoch": 4.980045796532548, + "grad_norm": 0.778200626373291, + "learning_rate": 6.656559435943313e-05, + "loss": 0.0538, + "step": 76120 + }, + { + "epoch": 4.980700032711809, + "grad_norm": 0.8858152031898499, + "learning_rate": 6.655692694667688e-05, + "loss": 0.0532, + "step": 76130 + }, + { + "epoch": 4.98135426889107, + "grad_norm": 0.631655216217041, + "learning_rate": 6.654825897508095e-05, + "loss": 0.0556, + "step": 76140 + }, + { + "epoch": 4.982008505070331, + "grad_norm": 0.7844308018684387, + "learning_rate": 6.653959044493785e-05, + "loss": 0.048, + "step": 76150 + }, + { + "epoch": 4.982662741249591, + "grad_norm": 1.1140578985214233, + "learning_rate": 6.65309213565402e-05, + "loss": 0.0558, + "step": 76160 + }, + { + "epoch": 4.983316977428852, + "grad_norm": 1.0043301582336426, + "learning_rate": 6.652225171018061e-05, + "loss": 0.0553, + "step": 76170 + }, + { + "epoch": 4.983971213608113, + "grad_norm": 0.8583059310913086, + "learning_rate": 6.65135815061517e-05, + "loss": 0.0594, + "step": 76180 + }, + { + "epoch": 4.984625449787373, + "grad_norm": 0.8297145366668701, + "learning_rate": 6.650491074474608e-05, + "loss": 0.0639, + "step": 76190 + }, + { + "epoch": 4.985279685966634, + "grad_norm": 0.7165802121162415, + "learning_rate": 6.649623942625647e-05, + "loss": 0.0607, + "step": 76200 + }, + { + "epoch": 4.985933922145895, + "grad_norm": 0.9171334505081177, + "learning_rate": 6.64875675509755e-05, + "loss": 0.0548, + "step": 76210 + }, + { + "epoch": 4.986588158325155, + "grad_norm": 1.0761557817459106, + "learning_rate": 6.647889511919588e-05, + "loss": 0.061, + "step": 76220 + }, + { + "epoch": 4.987242394504416, + "grad_norm": 0.9394139051437378, + "learning_rate": 6.647022213121035e-05, + "loss": 0.0599, + "step": 76230 + }, + { + "epoch": 4.987896630683677, + "grad_norm": 0.7724557518959045, + "learning_rate": 6.646154858731162e-05, + "loss": 0.0599, + "step": 76240 + }, + { + "epoch": 4.988550866862937, + "grad_norm": 0.7974488139152527, + "learning_rate": 6.645287448779243e-05, + "loss": 0.0499, + "step": 76250 + }, + { + "epoch": 4.989205103042198, + "grad_norm": 0.9333834648132324, + "learning_rate": 6.64441998329456e-05, + "loss": 0.0697, + "step": 76260 + }, + { + "epoch": 4.989859339221459, + "grad_norm": 0.9652996063232422, + "learning_rate": 6.64355246230639e-05, + "loss": 0.0572, + "step": 76270 + }, + { + "epoch": 4.99051357540072, + "grad_norm": 1.0794624090194702, + "learning_rate": 6.642684885844013e-05, + "loss": 0.0615, + "step": 76280 + }, + { + "epoch": 4.99116781157998, + "grad_norm": 0.8401366472244263, + "learning_rate": 6.641817253936713e-05, + "loss": 0.0558, + "step": 76290 + }, + { + "epoch": 4.991822047759241, + "grad_norm": 0.6911501288414001, + "learning_rate": 6.640949566613777e-05, + "loss": 0.0558, + "step": 76300 + }, + { + "epoch": 4.992476283938502, + "grad_norm": 0.7828950881958008, + "learning_rate": 6.640081823904487e-05, + "loss": 0.0589, + "step": 76310 + }, + { + "epoch": 4.993130520117763, + "grad_norm": 0.6949774622917175, + "learning_rate": 6.639214025838135e-05, + "loss": 0.0613, + "step": 76320 + }, + { + "epoch": 4.993784756297023, + "grad_norm": 0.9022195339202881, + "learning_rate": 6.638346172444011e-05, + "loss": 0.0609, + "step": 76330 + }, + { + "epoch": 4.994438992476284, + "grad_norm": 0.824991762638092, + "learning_rate": 6.637478263751407e-05, + "loss": 0.0599, + "step": 76340 + }, + { + "epoch": 4.995093228655545, + "grad_norm": 1.041867733001709, + "learning_rate": 6.636610299789616e-05, + "loss": 0.0673, + "step": 76350 + }, + { + "epoch": 4.995747464834805, + "grad_norm": 1.073553204536438, + "learning_rate": 6.635742280587935e-05, + "loss": 0.0762, + "step": 76360 + }, + { + "epoch": 4.996401701014066, + "grad_norm": 0.8955064415931702, + "learning_rate": 6.634874206175666e-05, + "loss": 0.0659, + "step": 76370 + }, + { + "epoch": 4.997055937193327, + "grad_norm": 1.0121138095855713, + "learning_rate": 6.6340060765821e-05, + "loss": 0.0524, + "step": 76380 + }, + { + "epoch": 4.997710173372587, + "grad_norm": 1.0602036714553833, + "learning_rate": 6.633137891836546e-05, + "loss": 0.0647, + "step": 76390 + }, + { + "epoch": 4.998364409551848, + "grad_norm": 0.947830319404602, + "learning_rate": 6.632269651968306e-05, + "loss": 0.0552, + "step": 76400 + }, + { + "epoch": 4.999018645731109, + "grad_norm": 0.9607699513435364, + "learning_rate": 6.631401357006683e-05, + "loss": 0.073, + "step": 76410 + }, + { + "epoch": 4.99967288191037, + "grad_norm": 0.7737681269645691, + "learning_rate": 6.630533006980986e-05, + "loss": 0.0596, + "step": 76420 + }, + { + "epoch": 5.00032711808963, + "grad_norm": 0.8846662044525146, + "learning_rate": 6.629664601920524e-05, + "loss": 0.057, + "step": 76430 + }, + { + "epoch": 5.000981354268891, + "grad_norm": 1.1258676052093506, + "learning_rate": 6.628796141854608e-05, + "loss": 0.0602, + "step": 76440 + }, + { + "epoch": 5.001635590448152, + "grad_norm": 0.844662070274353, + "learning_rate": 6.627927626812548e-05, + "loss": 0.0529, + "step": 76450 + }, + { + "epoch": 5.002289826627412, + "grad_norm": 0.9069094657897949, + "learning_rate": 6.627059056823665e-05, + "loss": 0.056, + "step": 76460 + }, + { + "epoch": 5.002944062806673, + "grad_norm": 0.8290544152259827, + "learning_rate": 6.62619043191727e-05, + "loss": 0.0576, + "step": 76470 + }, + { + "epoch": 5.003598298985934, + "grad_norm": 1.026877522468567, + "learning_rate": 6.625321752122682e-05, + "loss": 0.0594, + "step": 76480 + }, + { + "epoch": 5.004252535165195, + "grad_norm": 1.0628007650375366, + "learning_rate": 6.624453017469223e-05, + "loss": 0.0548, + "step": 76490 + }, + { + "epoch": 5.004906771344455, + "grad_norm": 0.9122229218482971, + "learning_rate": 6.623584227986215e-05, + "loss": 0.0582, + "step": 76500 + }, + { + "epoch": 5.005561007523716, + "grad_norm": 0.9323796629905701, + "learning_rate": 6.622715383702981e-05, + "loss": 0.0652, + "step": 76510 + }, + { + "epoch": 5.006215243702977, + "grad_norm": 0.9565145969390869, + "learning_rate": 6.621846484648849e-05, + "loss": 0.0614, + "step": 76520 + }, + { + "epoch": 5.006869479882237, + "grad_norm": 0.9995282292366028, + "learning_rate": 6.620977530853141e-05, + "loss": 0.0626, + "step": 76530 + }, + { + "epoch": 5.007523716061498, + "grad_norm": 1.09110689163208, + "learning_rate": 6.620108522345192e-05, + "loss": 0.0655, + "step": 76540 + }, + { + "epoch": 5.008177952240759, + "grad_norm": 0.8959344625473022, + "learning_rate": 6.619239459154331e-05, + "loss": 0.0594, + "step": 76550 + }, + { + "epoch": 5.00883218842002, + "grad_norm": 0.7433099150657654, + "learning_rate": 6.618370341309891e-05, + "loss": 0.0563, + "step": 76560 + }, + { + "epoch": 5.00948642459928, + "grad_norm": 0.8508242964744568, + "learning_rate": 6.61750116884121e-05, + "loss": 0.0621, + "step": 76570 + }, + { + "epoch": 5.010140660778541, + "grad_norm": 1.0113868713378906, + "learning_rate": 6.616631941777621e-05, + "loss": 0.0628, + "step": 76580 + }, + { + "epoch": 5.010794896957802, + "grad_norm": 0.8707073330879211, + "learning_rate": 6.615762660148464e-05, + "loss": 0.062, + "step": 76590 + }, + { + "epoch": 5.011449133137062, + "grad_norm": 0.8594477772712708, + "learning_rate": 6.61489332398308e-05, + "loss": 0.0603, + "step": 76600 + }, + { + "epoch": 5.012103369316323, + "grad_norm": 0.7275976538658142, + "learning_rate": 6.614023933310813e-05, + "loss": 0.0524, + "step": 76610 + }, + { + "epoch": 5.012757605495584, + "grad_norm": 0.8984965682029724, + "learning_rate": 6.613154488161003e-05, + "loss": 0.0653, + "step": 76620 + }, + { + "epoch": 5.013411841674845, + "grad_norm": 0.8797445893287659, + "learning_rate": 6.612284988562997e-05, + "loss": 0.0669, + "step": 76630 + }, + { + "epoch": 5.014066077854105, + "grad_norm": 0.9222575426101685, + "learning_rate": 6.611415434546147e-05, + "loss": 0.0512, + "step": 76640 + }, + { + "epoch": 5.014720314033366, + "grad_norm": 0.7390499114990234, + "learning_rate": 6.6105458261398e-05, + "loss": 0.0504, + "step": 76650 + }, + { + "epoch": 5.015374550212627, + "grad_norm": 0.9179421663284302, + "learning_rate": 6.609676163373306e-05, + "loss": 0.0605, + "step": 76660 + }, + { + "epoch": 5.016028786391887, + "grad_norm": 0.7605708837509155, + "learning_rate": 6.608806446276021e-05, + "loss": 0.0527, + "step": 76670 + }, + { + "epoch": 5.016683022571148, + "grad_norm": 0.8641387820243835, + "learning_rate": 6.6079366748773e-05, + "loss": 0.0592, + "step": 76680 + }, + { + "epoch": 5.017337258750409, + "grad_norm": 0.8649716377258301, + "learning_rate": 6.607066849206498e-05, + "loss": 0.0731, + "step": 76690 + }, + { + "epoch": 5.01799149492967, + "grad_norm": 0.8245428800582886, + "learning_rate": 6.606196969292974e-05, + "loss": 0.0621, + "step": 76700 + }, + { + "epoch": 5.01864573110893, + "grad_norm": 1.014141321182251, + "learning_rate": 6.605327035166091e-05, + "loss": 0.0639, + "step": 76710 + }, + { + "epoch": 5.019299967288191, + "grad_norm": 0.9020232558250427, + "learning_rate": 6.604457046855212e-05, + "loss": 0.0535, + "step": 76720 + }, + { + "epoch": 5.019954203467452, + "grad_norm": 0.9599230885505676, + "learning_rate": 6.603587004389697e-05, + "loss": 0.0541, + "step": 76730 + }, + { + "epoch": 5.020608439646712, + "grad_norm": 0.9951395988464355, + "learning_rate": 6.602716907798917e-05, + "loss": 0.0624, + "step": 76740 + }, + { + "epoch": 5.021262675825973, + "grad_norm": 0.8179054260253906, + "learning_rate": 6.601846757112238e-05, + "loss": 0.053, + "step": 76750 + }, + { + "epoch": 5.021916912005234, + "grad_norm": 0.8312829732894897, + "learning_rate": 6.600976552359029e-05, + "loss": 0.0579, + "step": 76760 + }, + { + "epoch": 5.022571148184495, + "grad_norm": 0.8234156370162964, + "learning_rate": 6.600106293568663e-05, + "loss": 0.0565, + "step": 76770 + }, + { + "epoch": 5.023225384363755, + "grad_norm": 0.7905632853507996, + "learning_rate": 6.599235980770514e-05, + "loss": 0.0578, + "step": 76780 + }, + { + "epoch": 5.023879620543016, + "grad_norm": 0.83710116147995, + "learning_rate": 6.598365613993956e-05, + "loss": 0.058, + "step": 76790 + }, + { + "epoch": 5.024533856722277, + "grad_norm": 1.039841890335083, + "learning_rate": 6.597495193268366e-05, + "loss": 0.0648, + "step": 76800 + }, + { + "epoch": 5.025188092901537, + "grad_norm": 1.0208362340927124, + "learning_rate": 6.596624718623124e-05, + "loss": 0.063, + "step": 76810 + }, + { + "epoch": 5.025842329080798, + "grad_norm": 0.7998032569885254, + "learning_rate": 6.59575419008761e-05, + "loss": 0.0638, + "step": 76820 + }, + { + "epoch": 5.026496565260059, + "grad_norm": 0.8299915790557861, + "learning_rate": 6.594883607691209e-05, + "loss": 0.0595, + "step": 76830 + }, + { + "epoch": 5.02715080143932, + "grad_norm": 0.8311706185340881, + "learning_rate": 6.594012971463302e-05, + "loss": 0.0569, + "step": 76840 + }, + { + "epoch": 5.02780503761858, + "grad_norm": 1.0256335735321045, + "learning_rate": 6.593142281433277e-05, + "loss": 0.0599, + "step": 76850 + }, + { + "epoch": 5.028459273797841, + "grad_norm": 0.8700340390205383, + "learning_rate": 6.592271537630521e-05, + "loss": 0.0632, + "step": 76860 + }, + { + "epoch": 5.029113509977102, + "grad_norm": 0.7327423095703125, + "learning_rate": 6.591400740084425e-05, + "loss": 0.0572, + "step": 76870 + }, + { + "epoch": 5.029767746156362, + "grad_norm": 0.907089352607727, + "learning_rate": 6.590529888824381e-05, + "loss": 0.0511, + "step": 76880 + }, + { + "epoch": 5.030421982335623, + "grad_norm": 1.0830025672912598, + "learning_rate": 6.589658983879782e-05, + "loss": 0.0629, + "step": 76890 + }, + { + "epoch": 5.031076218514884, + "grad_norm": 0.7621312737464905, + "learning_rate": 6.588788025280022e-05, + "loss": 0.0578, + "step": 76900 + }, + { + "epoch": 5.031730454694144, + "grad_norm": 0.8991169929504395, + "learning_rate": 6.587917013054503e-05, + "loss": 0.0497, + "step": 76910 + }, + { + "epoch": 5.032384690873405, + "grad_norm": 1.5194605588912964, + "learning_rate": 6.587045947232616e-05, + "loss": 0.0594, + "step": 76920 + }, + { + "epoch": 5.033038927052666, + "grad_norm": 1.2414143085479736, + "learning_rate": 6.586174827843768e-05, + "loss": 0.0645, + "step": 76930 + }, + { + "epoch": 5.033693163231927, + "grad_norm": 1.0320972204208374, + "learning_rate": 6.58530365491736e-05, + "loss": 0.0553, + "step": 76940 + }, + { + "epoch": 5.034347399411187, + "grad_norm": 0.9476144313812256, + "learning_rate": 6.584432428482797e-05, + "loss": 0.0631, + "step": 76950 + }, + { + "epoch": 5.035001635590448, + "grad_norm": 1.1414525508880615, + "learning_rate": 6.583561148569481e-05, + "loss": 0.0699, + "step": 76960 + }, + { + "epoch": 5.035655871769709, + "grad_norm": 0.8667239546775818, + "learning_rate": 6.582689815206825e-05, + "loss": 0.0637, + "step": 76970 + }, + { + "epoch": 5.036310107948969, + "grad_norm": 0.6989625692367554, + "learning_rate": 6.581818428424238e-05, + "loss": 0.0546, + "step": 76980 + }, + { + "epoch": 5.03696434412823, + "grad_norm": 0.7704555988311768, + "learning_rate": 6.580946988251128e-05, + "loss": 0.0492, + "step": 76990 + }, + { + "epoch": 5.037618580307491, + "grad_norm": 1.1614646911621094, + "learning_rate": 6.580075494716912e-05, + "loss": 0.0721, + "step": 77000 + }, + { + "epoch": 5.038272816486752, + "grad_norm": 1.0777459144592285, + "learning_rate": 6.579203947851006e-05, + "loss": 0.0586, + "step": 77010 + }, + { + "epoch": 5.038927052666012, + "grad_norm": 0.9171910285949707, + "learning_rate": 6.578332347682824e-05, + "loss": 0.0592, + "step": 77020 + }, + { + "epoch": 5.039581288845273, + "grad_norm": 0.9307588934898376, + "learning_rate": 6.577460694241784e-05, + "loss": 0.0551, + "step": 77030 + }, + { + "epoch": 5.040235525024534, + "grad_norm": 0.7374948263168335, + "learning_rate": 6.576588987557312e-05, + "loss": 0.0557, + "step": 77040 + }, + { + "epoch": 5.040889761203794, + "grad_norm": 0.7271601557731628, + "learning_rate": 6.575717227658825e-05, + "loss": 0.057, + "step": 77050 + }, + { + "epoch": 5.041543997383055, + "grad_norm": 0.8527136445045471, + "learning_rate": 6.57484541457575e-05, + "loss": 0.0569, + "step": 77060 + }, + { + "epoch": 5.042198233562316, + "grad_norm": 0.8172820806503296, + "learning_rate": 6.57397354833751e-05, + "loss": 0.059, + "step": 77070 + }, + { + "epoch": 5.042852469741577, + "grad_norm": 0.8320928812026978, + "learning_rate": 6.573101628973537e-05, + "loss": 0.0557, + "step": 77080 + }, + { + "epoch": 5.043506705920837, + "grad_norm": 0.8559585213661194, + "learning_rate": 6.572229656513258e-05, + "loss": 0.0589, + "step": 77090 + }, + { + "epoch": 5.044160942100098, + "grad_norm": 0.797950267791748, + "learning_rate": 6.571357630986104e-05, + "loss": 0.0525, + "step": 77100 + }, + { + "epoch": 5.044815178279359, + "grad_norm": 0.9195685982704163, + "learning_rate": 6.570485552421509e-05, + "loss": 0.0576, + "step": 77110 + }, + { + "epoch": 5.045469414458619, + "grad_norm": 0.6936109066009521, + "learning_rate": 6.569613420848908e-05, + "loss": 0.0578, + "step": 77120 + }, + { + "epoch": 5.04612365063788, + "grad_norm": 0.9056085348129272, + "learning_rate": 6.568741236297738e-05, + "loss": 0.0642, + "step": 77130 + }, + { + "epoch": 5.046777886817141, + "grad_norm": 0.9640796184539795, + "learning_rate": 6.567868998797438e-05, + "loss": 0.0651, + "step": 77140 + }, + { + "epoch": 5.047432122996402, + "grad_norm": 0.8185075521469116, + "learning_rate": 6.566996708377444e-05, + "loss": 0.0585, + "step": 77150 + }, + { + "epoch": 5.048086359175662, + "grad_norm": 0.7742406129837036, + "learning_rate": 6.566124365067203e-05, + "loss": 0.0685, + "step": 77160 + }, + { + "epoch": 5.048740595354923, + "grad_norm": 0.9033831357955933, + "learning_rate": 6.56525196889616e-05, + "loss": 0.0545, + "step": 77170 + }, + { + "epoch": 5.049394831534184, + "grad_norm": 1.0280879735946655, + "learning_rate": 6.564379519893756e-05, + "loss": 0.0575, + "step": 77180 + }, + { + "epoch": 5.050049067713444, + "grad_norm": 0.8477870225906372, + "learning_rate": 6.56350701808944e-05, + "loss": 0.0596, + "step": 77190 + }, + { + "epoch": 5.050703303892705, + "grad_norm": 0.7754467129707336, + "learning_rate": 6.562634463512663e-05, + "loss": 0.057, + "step": 77200 + }, + { + "epoch": 5.051357540071966, + "grad_norm": 0.9043933749198914, + "learning_rate": 6.561761856192873e-05, + "loss": 0.0679, + "step": 77210 + }, + { + "epoch": 5.052011776251227, + "grad_norm": 0.8377389311790466, + "learning_rate": 6.560889196159525e-05, + "loss": 0.0571, + "step": 77220 + }, + { + "epoch": 5.052666012430487, + "grad_norm": 0.8205317854881287, + "learning_rate": 6.560016483442075e-05, + "loss": 0.0512, + "step": 77230 + }, + { + "epoch": 5.053320248609748, + "grad_norm": 0.6298996806144714, + "learning_rate": 6.559143718069977e-05, + "loss": 0.0653, + "step": 77240 + }, + { + "epoch": 5.053974484789009, + "grad_norm": 1.1496926546096802, + "learning_rate": 6.558270900072687e-05, + "loss": 0.0769, + "step": 77250 + }, + { + "epoch": 5.054628720968269, + "grad_norm": 1.5552153587341309, + "learning_rate": 6.557398029479669e-05, + "loss": 0.0614, + "step": 77260 + }, + { + "epoch": 5.05528295714753, + "grad_norm": 0.9075239300727844, + "learning_rate": 6.556525106320382e-05, + "loss": 0.0637, + "step": 77270 + }, + { + "epoch": 5.055937193326791, + "grad_norm": 0.866381049156189, + "learning_rate": 6.555652130624292e-05, + "loss": 0.0659, + "step": 77280 + }, + { + "epoch": 5.056591429506052, + "grad_norm": 0.8625958561897278, + "learning_rate": 6.554779102420863e-05, + "loss": 0.0547, + "step": 77290 + }, + { + "epoch": 5.057245665685312, + "grad_norm": 0.7577462196350098, + "learning_rate": 6.55390602173956e-05, + "loss": 0.0527, + "step": 77300 + }, + { + "epoch": 5.057899901864573, + "grad_norm": 0.868981659412384, + "learning_rate": 6.553032888609856e-05, + "loss": 0.0541, + "step": 77310 + }, + { + "epoch": 5.058554138043834, + "grad_norm": 1.144863486289978, + "learning_rate": 6.552159703061216e-05, + "loss": 0.0578, + "step": 77320 + }, + { + "epoch": 5.059208374223094, + "grad_norm": 0.9633135795593262, + "learning_rate": 6.551286465123118e-05, + "loss": 0.0614, + "step": 77330 + }, + { + "epoch": 5.059862610402355, + "grad_norm": 0.9470215439796448, + "learning_rate": 6.55041317482503e-05, + "loss": 0.0568, + "step": 77340 + }, + { + "epoch": 5.060516846581616, + "grad_norm": 1.0022468566894531, + "learning_rate": 6.549539832196436e-05, + "loss": 0.0624, + "step": 77350 + }, + { + "epoch": 5.061171082760876, + "grad_norm": 0.7574188113212585, + "learning_rate": 6.548666437266806e-05, + "loss": 0.0468, + "step": 77360 + }, + { + "epoch": 5.061825318940137, + "grad_norm": 0.8999849557876587, + "learning_rate": 6.547792990065622e-05, + "loss": 0.0618, + "step": 77370 + }, + { + "epoch": 5.062479555119398, + "grad_norm": 0.6741798520088196, + "learning_rate": 6.546919490622365e-05, + "loss": 0.0581, + "step": 77380 + }, + { + "epoch": 5.063133791298659, + "grad_norm": 0.9915578365325928, + "learning_rate": 6.546045938966518e-05, + "loss": 0.0584, + "step": 77390 + }, + { + "epoch": 5.063788027477919, + "grad_norm": 0.8000921607017517, + "learning_rate": 6.545172335127568e-05, + "loss": 0.0561, + "step": 77400 + }, + { + "epoch": 5.06444226365718, + "grad_norm": 1.028486728668213, + "learning_rate": 6.544298679134998e-05, + "loss": 0.0495, + "step": 77410 + }, + { + "epoch": 5.065096499836441, + "grad_norm": 0.7525604367256165, + "learning_rate": 6.543424971018298e-05, + "loss": 0.0567, + "step": 77420 + }, + { + "epoch": 5.065750736015701, + "grad_norm": 1.2675774097442627, + "learning_rate": 6.542551210806959e-05, + "loss": 0.0497, + "step": 77430 + }, + { + "epoch": 5.066404972194962, + "grad_norm": 0.8450837731361389, + "learning_rate": 6.541677398530468e-05, + "loss": 0.0589, + "step": 77440 + }, + { + "epoch": 5.067059208374223, + "grad_norm": 0.6929242610931396, + "learning_rate": 6.540803534218322e-05, + "loss": 0.0615, + "step": 77450 + }, + { + "epoch": 5.067713444553484, + "grad_norm": 0.9417129158973694, + "learning_rate": 6.539929617900019e-05, + "loss": 0.0595, + "step": 77460 + }, + { + "epoch": 5.068367680732744, + "grad_norm": 0.944251537322998, + "learning_rate": 6.53905564960505e-05, + "loss": 0.0538, + "step": 77470 + }, + { + "epoch": 5.069021916912005, + "grad_norm": 0.9781545400619507, + "learning_rate": 6.538181629362916e-05, + "loss": 0.0592, + "step": 77480 + }, + { + "epoch": 5.069676153091266, + "grad_norm": 0.9492000937461853, + "learning_rate": 6.537307557203119e-05, + "loss": 0.0644, + "step": 77490 + }, + { + "epoch": 5.070330389270526, + "grad_norm": 0.7973672151565552, + "learning_rate": 6.536433433155161e-05, + "loss": 0.0495, + "step": 77500 + }, + { + "epoch": 5.070984625449787, + "grad_norm": 0.8644813895225525, + "learning_rate": 6.535559257248545e-05, + "loss": 0.0611, + "step": 77510 + }, + { + "epoch": 5.071638861629048, + "grad_norm": 0.9546951651573181, + "learning_rate": 6.534685029512777e-05, + "loss": 0.061, + "step": 77520 + }, + { + "epoch": 5.072293097808309, + "grad_norm": 0.9337143898010254, + "learning_rate": 6.533810749977363e-05, + "loss": 0.0528, + "step": 77530 + }, + { + "epoch": 5.072947333987569, + "grad_norm": 0.8706703186035156, + "learning_rate": 6.532936418671815e-05, + "loss": 0.0517, + "step": 77540 + }, + { + "epoch": 5.07360157016683, + "grad_norm": 0.788536548614502, + "learning_rate": 6.532062035625641e-05, + "loss": 0.0638, + "step": 77550 + }, + { + "epoch": 5.074255806346091, + "grad_norm": 0.8205485343933105, + "learning_rate": 6.531187600868357e-05, + "loss": 0.0554, + "step": 77560 + }, + { + "epoch": 5.074910042525351, + "grad_norm": 0.8340345621109009, + "learning_rate": 6.530313114429475e-05, + "loss": 0.0518, + "step": 77570 + }, + { + "epoch": 5.075564278704612, + "grad_norm": 0.793782651424408, + "learning_rate": 6.529438576338512e-05, + "loss": 0.052, + "step": 77580 + }, + { + "epoch": 5.076218514883873, + "grad_norm": 0.9006875157356262, + "learning_rate": 6.528563986624987e-05, + "loss": 0.0585, + "step": 77590 + }, + { + "epoch": 5.076872751063134, + "grad_norm": 0.9196233749389648, + "learning_rate": 6.527689345318416e-05, + "loss": 0.0536, + "step": 77600 + }, + { + "epoch": 5.077526987242394, + "grad_norm": 0.987467885017395, + "learning_rate": 6.526814652448325e-05, + "loss": 0.0572, + "step": 77610 + }, + { + "epoch": 5.078181223421655, + "grad_norm": 0.994082510471344, + "learning_rate": 6.525939908044236e-05, + "loss": 0.0542, + "step": 77620 + }, + { + "epoch": 5.078835459600916, + "grad_norm": 1.2005406618118286, + "learning_rate": 6.525065112135672e-05, + "loss": 0.0593, + "step": 77630 + }, + { + "epoch": 5.079489695780176, + "grad_norm": 1.0194610357284546, + "learning_rate": 6.52419026475216e-05, + "loss": 0.0574, + "step": 77640 + }, + { + "epoch": 5.080143931959437, + "grad_norm": 0.9802126884460449, + "learning_rate": 6.52331536592323e-05, + "loss": 0.0615, + "step": 77650 + }, + { + "epoch": 5.080798168138698, + "grad_norm": 0.977064311504364, + "learning_rate": 6.522440415678413e-05, + "loss": 0.0562, + "step": 77660 + }, + { + "epoch": 5.081452404317959, + "grad_norm": 0.8457438945770264, + "learning_rate": 6.521565414047237e-05, + "loss": 0.0578, + "step": 77670 + }, + { + "epoch": 5.082106640497219, + "grad_norm": 0.8083365559577942, + "learning_rate": 6.52069036105924e-05, + "loss": 0.0578, + "step": 77680 + }, + { + "epoch": 5.08276087667648, + "grad_norm": 1.0686533451080322, + "learning_rate": 6.519815256743954e-05, + "loss": 0.0556, + "step": 77690 + }, + { + "epoch": 5.083415112855741, + "grad_norm": 0.9897651672363281, + "learning_rate": 6.518940101130916e-05, + "loss": 0.0556, + "step": 77700 + }, + { + "epoch": 5.084069349035001, + "grad_norm": 0.7918063998222351, + "learning_rate": 6.518064894249667e-05, + "loss": 0.0558, + "step": 77710 + }, + { + "epoch": 5.084723585214262, + "grad_norm": 0.7735630869865417, + "learning_rate": 6.517189636129749e-05, + "loss": 0.0557, + "step": 77720 + }, + { + "epoch": 5.085377821393523, + "grad_norm": 0.947536289691925, + "learning_rate": 6.516314326800698e-05, + "loss": 0.0578, + "step": 77730 + }, + { + "epoch": 5.086032057572784, + "grad_norm": 0.74627685546875, + "learning_rate": 6.515438966292062e-05, + "loss": 0.0529, + "step": 77740 + }, + { + "epoch": 5.086686293752044, + "grad_norm": 1.1361474990844727, + "learning_rate": 6.514563554633388e-05, + "loss": 0.0678, + "step": 77750 + }, + { + "epoch": 5.087340529931305, + "grad_norm": 1.06615149974823, + "learning_rate": 6.513688091854224e-05, + "loss": 0.0656, + "step": 77760 + }, + { + "epoch": 5.087994766110566, + "grad_norm": 0.8696459531784058, + "learning_rate": 6.512812577984114e-05, + "loss": 0.0606, + "step": 77770 + }, + { + "epoch": 5.088649002289826, + "grad_norm": 0.7366641163825989, + "learning_rate": 6.511937013052612e-05, + "loss": 0.0488, + "step": 77780 + }, + { + "epoch": 5.089303238469087, + "grad_norm": 0.7504761815071106, + "learning_rate": 6.511061397089271e-05, + "loss": 0.061, + "step": 77790 + }, + { + "epoch": 5.089957474648348, + "grad_norm": 0.8598735332489014, + "learning_rate": 6.510185730123646e-05, + "loss": 0.0606, + "step": 77800 + }, + { + "epoch": 5.090611710827609, + "grad_norm": 0.8309386968612671, + "learning_rate": 6.50931001218529e-05, + "loss": 0.069, + "step": 77810 + }, + { + "epoch": 5.091265947006869, + "grad_norm": 0.9889697432518005, + "learning_rate": 6.508434243303764e-05, + "loss": 0.0667, + "step": 77820 + }, + { + "epoch": 5.09192018318613, + "grad_norm": 0.9573984742164612, + "learning_rate": 6.507558423508629e-05, + "loss": 0.0594, + "step": 77830 + }, + { + "epoch": 5.092574419365391, + "grad_norm": 1.049492359161377, + "learning_rate": 6.50668255282944e-05, + "loss": 0.0538, + "step": 77840 + }, + { + "epoch": 5.093228655544651, + "grad_norm": 0.8158976435661316, + "learning_rate": 6.505806631295765e-05, + "loss": 0.0592, + "step": 77850 + }, + { + "epoch": 5.093882891723912, + "grad_norm": 0.7729289531707764, + "learning_rate": 6.504930658937165e-05, + "loss": 0.0601, + "step": 77860 + }, + { + "epoch": 5.094537127903173, + "grad_norm": 0.7943597435951233, + "learning_rate": 6.50405463578321e-05, + "loss": 0.0674, + "step": 77870 + }, + { + "epoch": 5.095191364082433, + "grad_norm": 1.0894060134887695, + "learning_rate": 6.503178561863466e-05, + "loss": 0.0594, + "step": 77880 + }, + { + "epoch": 5.095845600261694, + "grad_norm": 0.8474149703979492, + "learning_rate": 6.502302437207504e-05, + "loss": 0.059, + "step": 77890 + }, + { + "epoch": 5.096499836440955, + "grad_norm": 0.9915545582771301, + "learning_rate": 6.501426261844894e-05, + "loss": 0.0529, + "step": 77900 + }, + { + "epoch": 5.097154072620216, + "grad_norm": 1.004508137702942, + "learning_rate": 6.500550035805212e-05, + "loss": 0.0504, + "step": 77910 + }, + { + "epoch": 5.097808308799476, + "grad_norm": 0.861196756362915, + "learning_rate": 6.499673759118028e-05, + "loss": 0.0581, + "step": 77920 + }, + { + "epoch": 5.098462544978737, + "grad_norm": 0.7143073081970215, + "learning_rate": 6.498797431812923e-05, + "loss": 0.057, + "step": 77930 + }, + { + "epoch": 5.099116781157998, + "grad_norm": 0.7340794205665588, + "learning_rate": 6.497921053919475e-05, + "loss": 0.0698, + "step": 77940 + }, + { + "epoch": 5.099771017337258, + "grad_norm": 0.691623866558075, + "learning_rate": 6.497044625467263e-05, + "loss": 0.0555, + "step": 77950 + }, + { + "epoch": 5.100425253516519, + "grad_norm": 1.062201738357544, + "learning_rate": 6.496168146485865e-05, + "loss": 0.0621, + "step": 77960 + }, + { + "epoch": 5.10107948969578, + "grad_norm": 0.9496244192123413, + "learning_rate": 6.495291617004873e-05, + "loss": 0.0566, + "step": 77970 + }, + { + "epoch": 5.101733725875041, + "grad_norm": 0.9271469116210938, + "learning_rate": 6.494415037053865e-05, + "loss": 0.0504, + "step": 77980 + }, + { + "epoch": 5.102387962054301, + "grad_norm": 0.8734414577484131, + "learning_rate": 6.493538406662429e-05, + "loss": 0.0591, + "step": 77990 + }, + { + "epoch": 5.103042198233562, + "grad_norm": 0.9724219441413879, + "learning_rate": 6.492661725860157e-05, + "loss": 0.0653, + "step": 78000 + }, + { + "epoch": 5.103696434412823, + "grad_norm": 0.709010124206543, + "learning_rate": 6.491784994676637e-05, + "loss": 0.0516, + "step": 78010 + }, + { + "epoch": 5.104350670592083, + "grad_norm": 0.7753589749336243, + "learning_rate": 6.490908213141461e-05, + "loss": 0.0519, + "step": 78020 + }, + { + "epoch": 5.105004906771344, + "grad_norm": 0.8125455379486084, + "learning_rate": 6.490031381284221e-05, + "loss": 0.0581, + "step": 78030 + }, + { + "epoch": 5.105659142950605, + "grad_norm": 1.1419060230255127, + "learning_rate": 6.489154499134517e-05, + "loss": 0.0563, + "step": 78040 + }, + { + "epoch": 5.106313379129866, + "grad_norm": 0.945682942867279, + "learning_rate": 6.488277566721941e-05, + "loss": 0.0652, + "step": 78050 + }, + { + "epoch": 5.106967615309126, + "grad_norm": 0.9057697653770447, + "learning_rate": 6.487400584076094e-05, + "loss": 0.0589, + "step": 78060 + }, + { + "epoch": 5.107621851488387, + "grad_norm": 0.9359369874000549, + "learning_rate": 6.486523551226577e-05, + "loss": 0.0612, + "step": 78070 + }, + { + "epoch": 5.108276087667648, + "grad_norm": 0.8022433519363403, + "learning_rate": 6.485646468202993e-05, + "loss": 0.0674, + "step": 78080 + }, + { + "epoch": 5.108930323846908, + "grad_norm": 0.8566710948944092, + "learning_rate": 6.484769335034942e-05, + "loss": 0.0553, + "step": 78090 + }, + { + "epoch": 5.109584560026169, + "grad_norm": 0.9492458701133728, + "learning_rate": 6.483892151752034e-05, + "loss": 0.0629, + "step": 78100 + }, + { + "epoch": 5.11023879620543, + "grad_norm": 0.8819661736488342, + "learning_rate": 6.483014918383873e-05, + "loss": 0.0587, + "step": 78110 + }, + { + "epoch": 5.110893032384691, + "grad_norm": 0.9766054749488831, + "learning_rate": 6.482137634960068e-05, + "loss": 0.0615, + "step": 78120 + }, + { + "epoch": 5.111547268563951, + "grad_norm": 0.8624576926231384, + "learning_rate": 6.481260301510233e-05, + "loss": 0.0602, + "step": 78130 + }, + { + "epoch": 5.112201504743212, + "grad_norm": 0.9257481098175049, + "learning_rate": 6.480382918063978e-05, + "loss": 0.0551, + "step": 78140 + }, + { + "epoch": 5.112855740922473, + "grad_norm": 0.9216415882110596, + "learning_rate": 6.479505484650916e-05, + "loss": 0.0538, + "step": 78150 + }, + { + "epoch": 5.113509977101733, + "grad_norm": 0.7884585857391357, + "learning_rate": 6.478628001300664e-05, + "loss": 0.0556, + "step": 78160 + }, + { + "epoch": 5.114164213280994, + "grad_norm": 0.9612773060798645, + "learning_rate": 6.477750468042841e-05, + "loss": 0.0561, + "step": 78170 + }, + { + "epoch": 5.114818449460255, + "grad_norm": 0.7759575247764587, + "learning_rate": 6.476872884907062e-05, + "loss": 0.0633, + "step": 78180 + }, + { + "epoch": 5.115472685639516, + "grad_norm": 0.9810332655906677, + "learning_rate": 6.475995251922949e-05, + "loss": 0.0571, + "step": 78190 + }, + { + "epoch": 5.116126921818776, + "grad_norm": 0.7260820865631104, + "learning_rate": 6.475117569120127e-05, + "loss": 0.0624, + "step": 78200 + }, + { + "epoch": 5.116781157998037, + "grad_norm": 0.998258113861084, + "learning_rate": 6.474239836528219e-05, + "loss": 0.055, + "step": 78210 + }, + { + "epoch": 5.117435394177298, + "grad_norm": 0.7317970395088196, + "learning_rate": 6.473362054176847e-05, + "loss": 0.0594, + "step": 78220 + }, + { + "epoch": 5.118089630356558, + "grad_norm": 1.0319292545318604, + "learning_rate": 6.472484222095645e-05, + "loss": 0.0558, + "step": 78230 + }, + { + "epoch": 5.118743866535819, + "grad_norm": 0.6902409195899963, + "learning_rate": 6.471606340314238e-05, + "loss": 0.0631, + "step": 78240 + }, + { + "epoch": 5.11939810271508, + "grad_norm": 1.0602803230285645, + "learning_rate": 6.470728408862257e-05, + "loss": 0.053, + "step": 78250 + }, + { + "epoch": 5.120052338894341, + "grad_norm": 0.9925687313079834, + "learning_rate": 6.469850427769336e-05, + "loss": 0.0685, + "step": 78260 + }, + { + "epoch": 5.120706575073601, + "grad_norm": 0.8387590050697327, + "learning_rate": 6.468972397065108e-05, + "loss": 0.0516, + "step": 78270 + }, + { + "epoch": 5.121360811252862, + "grad_norm": 0.93202805519104, + "learning_rate": 6.468094316779207e-05, + "loss": 0.0657, + "step": 78280 + }, + { + "epoch": 5.122015047432123, + "grad_norm": 0.8118649125099182, + "learning_rate": 6.467216186941274e-05, + "loss": 0.0504, + "step": 78290 + }, + { + "epoch": 5.122669283611383, + "grad_norm": 0.8819132447242737, + "learning_rate": 6.466338007580948e-05, + "loss": 0.0667, + "step": 78300 + }, + { + "epoch": 5.123323519790644, + "grad_norm": 1.2045010328292847, + "learning_rate": 6.465459778727867e-05, + "loss": 0.068, + "step": 78310 + }, + { + "epoch": 5.123977755969905, + "grad_norm": 0.8089714646339417, + "learning_rate": 6.464581500411675e-05, + "loss": 0.0599, + "step": 78320 + }, + { + "epoch": 5.124631992149165, + "grad_norm": 0.8597386479377747, + "learning_rate": 6.463703172662019e-05, + "loss": 0.0561, + "step": 78330 + }, + { + "epoch": 5.125286228328426, + "grad_norm": 0.7467419505119324, + "learning_rate": 6.46282479550854e-05, + "loss": 0.0557, + "step": 78340 + }, + { + "epoch": 5.125940464507687, + "grad_norm": 0.8646554946899414, + "learning_rate": 6.461946368980888e-05, + "loss": 0.0532, + "step": 78350 + }, + { + "epoch": 5.126594700686948, + "grad_norm": 0.681583821773529, + "learning_rate": 6.461067893108712e-05, + "loss": 0.0543, + "step": 78360 + }, + { + "epoch": 5.127248936866208, + "grad_norm": 0.9374173879623413, + "learning_rate": 6.460189367921663e-05, + "loss": 0.0547, + "step": 78370 + }, + { + "epoch": 5.127903173045469, + "grad_norm": 0.9243113398551941, + "learning_rate": 6.459310793449391e-05, + "loss": 0.0538, + "step": 78380 + }, + { + "epoch": 5.12855740922473, + "grad_norm": 0.8543265461921692, + "learning_rate": 6.458432169721556e-05, + "loss": 0.0603, + "step": 78390 + }, + { + "epoch": 5.12921164540399, + "grad_norm": 0.811305046081543, + "learning_rate": 6.457553496767809e-05, + "loss": 0.0606, + "step": 78400 + }, + { + "epoch": 5.129865881583251, + "grad_norm": 0.9650918245315552, + "learning_rate": 6.456674774617809e-05, + "loss": 0.0528, + "step": 78410 + }, + { + "epoch": 5.130520117762512, + "grad_norm": 0.989138126373291, + "learning_rate": 6.455796003301215e-05, + "loss": 0.0625, + "step": 78420 + }, + { + "epoch": 5.131174353941773, + "grad_norm": 0.9064369201660156, + "learning_rate": 6.45491718284769e-05, + "loss": 0.0566, + "step": 78430 + }, + { + "epoch": 5.131828590121033, + "grad_norm": 0.7815073728561401, + "learning_rate": 6.454038313286891e-05, + "loss": 0.0654, + "step": 78440 + }, + { + "epoch": 5.132482826300294, + "grad_norm": 0.9094436168670654, + "learning_rate": 6.453159394648487e-05, + "loss": 0.0573, + "step": 78450 + }, + { + "epoch": 5.133137062479555, + "grad_norm": 0.8054584860801697, + "learning_rate": 6.452280426962143e-05, + "loss": 0.054, + "step": 78460 + }, + { + "epoch": 5.1337912986588154, + "grad_norm": 0.8743946552276611, + "learning_rate": 6.451401410257525e-05, + "loss": 0.0594, + "step": 78470 + }, + { + "epoch": 5.134445534838076, + "grad_norm": 0.9614243507385254, + "learning_rate": 6.450522344564303e-05, + "loss": 0.0619, + "step": 78480 + }, + { + "epoch": 5.135099771017337, + "grad_norm": 0.9176281094551086, + "learning_rate": 6.449643229912148e-05, + "loss": 0.0582, + "step": 78490 + }, + { + "epoch": 5.135754007196598, + "grad_norm": 0.9308127164840698, + "learning_rate": 6.448764066330733e-05, + "loss": 0.0574, + "step": 78500 + }, + { + "epoch": 5.136408243375858, + "grad_norm": 0.8228217363357544, + "learning_rate": 6.44788485384973e-05, + "loss": 0.0693, + "step": 78510 + }, + { + "epoch": 5.137062479555119, + "grad_norm": 0.9158545136451721, + "learning_rate": 6.447005592498816e-05, + "loss": 0.0499, + "step": 78520 + }, + { + "epoch": 5.13771671573438, + "grad_norm": 0.8017662763595581, + "learning_rate": 6.446126282307669e-05, + "loss": 0.0561, + "step": 78530 + }, + { + "epoch": 5.1383709519136405, + "grad_norm": 0.8243115544319153, + "learning_rate": 6.445246923305966e-05, + "loss": 0.056, + "step": 78540 + }, + { + "epoch": 5.139025188092901, + "grad_norm": 0.7880877256393433, + "learning_rate": 6.44436751552339e-05, + "loss": 0.0569, + "step": 78550 + }, + { + "epoch": 5.139679424272162, + "grad_norm": 0.9965237379074097, + "learning_rate": 6.443488058989624e-05, + "loss": 0.0646, + "step": 78560 + }, + { + "epoch": 5.140333660451423, + "grad_norm": 0.9175546765327454, + "learning_rate": 6.442608553734348e-05, + "loss": 0.0679, + "step": 78570 + }, + { + "epoch": 5.1409878966306835, + "grad_norm": 0.8487790822982788, + "learning_rate": 6.441728999787251e-05, + "loss": 0.0511, + "step": 78580 + }, + { + "epoch": 5.141642132809944, + "grad_norm": 0.9188631772994995, + "learning_rate": 6.44084939717802e-05, + "loss": 0.0572, + "step": 78590 + }, + { + "epoch": 5.142296368989205, + "grad_norm": 0.8880249857902527, + "learning_rate": 6.439969745936341e-05, + "loss": 0.0729, + "step": 78600 + }, + { + "epoch": 5.1429506051684655, + "grad_norm": 0.7750711441040039, + "learning_rate": 6.439090046091907e-05, + "loss": 0.062, + "step": 78610 + }, + { + "epoch": 5.143604841347726, + "grad_norm": 0.8902852535247803, + "learning_rate": 6.438210297674411e-05, + "loss": 0.0582, + "step": 78620 + }, + { + "epoch": 5.144259077526987, + "grad_norm": 0.9351129531860352, + "learning_rate": 6.437330500713545e-05, + "loss": 0.0535, + "step": 78630 + }, + { + "epoch": 5.144913313706248, + "grad_norm": 0.8082217574119568, + "learning_rate": 6.436450655239004e-05, + "loss": 0.0601, + "step": 78640 + }, + { + "epoch": 5.1455675498855085, + "grad_norm": 0.9158061146736145, + "learning_rate": 6.435570761280487e-05, + "loss": 0.0562, + "step": 78650 + }, + { + "epoch": 5.146221786064769, + "grad_norm": 0.8845149278640747, + "learning_rate": 6.434690818867693e-05, + "loss": 0.0515, + "step": 78660 + }, + { + "epoch": 5.14687602224403, + "grad_norm": 0.9246596693992615, + "learning_rate": 6.43381082803032e-05, + "loss": 0.0547, + "step": 78670 + }, + { + "epoch": 5.1475302584232905, + "grad_norm": 0.8602831363677979, + "learning_rate": 6.432930788798072e-05, + "loss": 0.0475, + "step": 78680 + }, + { + "epoch": 5.1481844946025515, + "grad_norm": 0.7768710255622864, + "learning_rate": 6.432050701200651e-05, + "loss": 0.0519, + "step": 78690 + }, + { + "epoch": 5.148838730781812, + "grad_norm": 0.8790880441665649, + "learning_rate": 6.431170565267764e-05, + "loss": 0.0609, + "step": 78700 + }, + { + "epoch": 5.149492966961073, + "grad_norm": 1.016045331954956, + "learning_rate": 6.430290381029116e-05, + "loss": 0.0583, + "step": 78710 + }, + { + "epoch": 5.1501472031403335, + "grad_norm": 1.148781418800354, + "learning_rate": 6.429410148514419e-05, + "loss": 0.0667, + "step": 78720 + }, + { + "epoch": 5.150801439319594, + "grad_norm": 0.873008668422699, + "learning_rate": 6.428529867753377e-05, + "loss": 0.0617, + "step": 78730 + }, + { + "epoch": 5.151455675498855, + "grad_norm": 0.7208111882209778, + "learning_rate": 6.427649538775708e-05, + "loss": 0.0633, + "step": 78740 + }, + { + "epoch": 5.1521099116781155, + "grad_norm": 1.1463489532470703, + "learning_rate": 6.426769161611121e-05, + "loss": 0.056, + "step": 78750 + }, + { + "epoch": 5.1527641478573765, + "grad_norm": 0.7651934623718262, + "learning_rate": 6.425888736289336e-05, + "loss": 0.0522, + "step": 78760 + }, + { + "epoch": 5.153418384036637, + "grad_norm": 0.9302027821540833, + "learning_rate": 6.425008262840064e-05, + "loss": 0.063, + "step": 78770 + }, + { + "epoch": 5.1540726202158975, + "grad_norm": 0.8309484720230103, + "learning_rate": 6.424127741293027e-05, + "loss": 0.0663, + "step": 78780 + }, + { + "epoch": 5.1547268563951585, + "grad_norm": 0.8712273240089417, + "learning_rate": 6.423247171677943e-05, + "loss": 0.0648, + "step": 78790 + }, + { + "epoch": 5.1553810925744195, + "grad_norm": 0.9975765943527222, + "learning_rate": 6.422366554024536e-05, + "loss": 0.0636, + "step": 78800 + }, + { + "epoch": 5.15603532875368, + "grad_norm": 1.0808351039886475, + "learning_rate": 6.421485888362526e-05, + "loss": 0.0615, + "step": 78810 + }, + { + "epoch": 5.1566895649329405, + "grad_norm": 0.7145622968673706, + "learning_rate": 6.42060517472164e-05, + "loss": 0.0601, + "step": 78820 + }, + { + "epoch": 5.1573438011122015, + "grad_norm": 0.8157432675361633, + "learning_rate": 6.419724413131604e-05, + "loss": 0.0544, + "step": 78830 + }, + { + "epoch": 5.1579980372914624, + "grad_norm": 0.8889109492301941, + "learning_rate": 6.418843603622144e-05, + "loss": 0.0558, + "step": 78840 + }, + { + "epoch": 5.1586522734707225, + "grad_norm": 0.8644919991493225, + "learning_rate": 6.417962746222993e-05, + "loss": 0.057, + "step": 78850 + }, + { + "epoch": 5.1593065096499835, + "grad_norm": 0.9862198233604431, + "learning_rate": 6.41708184096388e-05, + "loss": 0.0577, + "step": 78860 + }, + { + "epoch": 5.1599607458292445, + "grad_norm": 0.806100606918335, + "learning_rate": 6.41620088787454e-05, + "loss": 0.0588, + "step": 78870 + }, + { + "epoch": 5.160614982008505, + "grad_norm": 0.9563096165657043, + "learning_rate": 6.415319886984703e-05, + "loss": 0.0603, + "step": 78880 + }, + { + "epoch": 5.1612692181877655, + "grad_norm": 0.9669555425643921, + "learning_rate": 6.41443883832411e-05, + "loss": 0.0509, + "step": 78890 + }, + { + "epoch": 5.1619234543670265, + "grad_norm": 0.8312886357307434, + "learning_rate": 6.413557741922495e-05, + "loss": 0.056, + "step": 78900 + }, + { + "epoch": 5.1625776905462875, + "grad_norm": 0.8751150369644165, + "learning_rate": 6.412676597809602e-05, + "loss": 0.0561, + "step": 78910 + }, + { + "epoch": 5.1632319267255475, + "grad_norm": 0.9386425018310547, + "learning_rate": 6.411795406015166e-05, + "loss": 0.0561, + "step": 78920 + }, + { + "epoch": 5.1638861629048085, + "grad_norm": 1.336055874824524, + "learning_rate": 6.410914166568933e-05, + "loss": 0.0501, + "step": 78930 + }, + { + "epoch": 5.1645403990840695, + "grad_norm": 0.7663227915763855, + "learning_rate": 6.410032879500647e-05, + "loss": 0.0583, + "step": 78940 + }, + { + "epoch": 5.1651946352633304, + "grad_norm": 0.8238208889961243, + "learning_rate": 6.409151544840055e-05, + "loss": 0.0637, + "step": 78950 + }, + { + "epoch": 5.1658488714425905, + "grad_norm": 0.8930804133415222, + "learning_rate": 6.4082701626169e-05, + "loss": 0.0519, + "step": 78960 + }, + { + "epoch": 5.1665031076218515, + "grad_norm": 0.7918708324432373, + "learning_rate": 6.407388732860935e-05, + "loss": 0.0553, + "step": 78970 + }, + { + "epoch": 5.1671573438011125, + "grad_norm": 0.7444891333580017, + "learning_rate": 6.40650725560191e-05, + "loss": 0.0473, + "step": 78980 + }, + { + "epoch": 5.1678115799803725, + "grad_norm": 0.9502220749855042, + "learning_rate": 6.405625730869575e-05, + "loss": 0.057, + "step": 78990 + }, + { + "epoch": 5.1684658161596335, + "grad_norm": 0.8537470698356628, + "learning_rate": 6.404744158693685e-05, + "loss": 0.0563, + "step": 79000 + }, + { + "epoch": 5.1691200523388945, + "grad_norm": 0.902565062046051, + "learning_rate": 6.403862539103998e-05, + "loss": 0.0536, + "step": 79010 + }, + { + "epoch": 5.1697742885181555, + "grad_norm": 0.9885287284851074, + "learning_rate": 6.402980872130266e-05, + "loss": 0.0589, + "step": 79020 + }, + { + "epoch": 5.1704285246974155, + "grad_norm": 0.8051989674568176, + "learning_rate": 6.402099157802252e-05, + "loss": 0.0661, + "step": 79030 + }, + { + "epoch": 5.1710827608766765, + "grad_norm": 1.0724586248397827, + "learning_rate": 6.401217396149713e-05, + "loss": 0.0528, + "step": 79040 + }, + { + "epoch": 5.1717369970559375, + "grad_norm": 0.9905614256858826, + "learning_rate": 6.400335587202413e-05, + "loss": 0.0511, + "step": 79050 + }, + { + "epoch": 5.172391233235198, + "grad_norm": 0.8915742635726929, + "learning_rate": 6.399453730990113e-05, + "loss": 0.0595, + "step": 79060 + }, + { + "epoch": 5.1730454694144585, + "grad_norm": 0.8688280582427979, + "learning_rate": 6.398571827542581e-05, + "loss": 0.0592, + "step": 79070 + }, + { + "epoch": 5.1736997055937195, + "grad_norm": 0.9881342053413391, + "learning_rate": 6.39768987688958e-05, + "loss": 0.0612, + "step": 79080 + }, + { + "epoch": 5.1743539417729805, + "grad_norm": 0.8909708261489868, + "learning_rate": 6.396807879060882e-05, + "loss": 0.0552, + "step": 79090 + }, + { + "epoch": 5.1750081779522406, + "grad_norm": 1.0886934995651245, + "learning_rate": 6.395925834086254e-05, + "loss": 0.0531, + "step": 79100 + }, + { + "epoch": 5.1756624141315015, + "grad_norm": 0.8756330609321594, + "learning_rate": 6.395043741995468e-05, + "loss": 0.0536, + "step": 79110 + }, + { + "epoch": 5.1763166503107625, + "grad_norm": 0.8764585852622986, + "learning_rate": 6.394161602818296e-05, + "loss": 0.0525, + "step": 79120 + }, + { + "epoch": 5.176970886490023, + "grad_norm": 0.8468575477600098, + "learning_rate": 6.393279416584516e-05, + "loss": 0.0644, + "step": 79130 + }, + { + "epoch": 5.1776251226692835, + "grad_norm": 0.982122540473938, + "learning_rate": 6.392397183323901e-05, + "loss": 0.0528, + "step": 79140 + }, + { + "epoch": 5.1782793588485445, + "grad_norm": 0.9616347551345825, + "learning_rate": 6.391514903066228e-05, + "loss": 0.0526, + "step": 79150 + }, + { + "epoch": 5.1789335950278055, + "grad_norm": 0.8660449981689453, + "learning_rate": 6.390632575841278e-05, + "loss": 0.0643, + "step": 79160 + }, + { + "epoch": 5.179587831207066, + "grad_norm": 0.7441914081573486, + "learning_rate": 6.389750201678833e-05, + "loss": 0.0664, + "step": 79170 + }, + { + "epoch": 5.1802420673863265, + "grad_norm": 0.796704888343811, + "learning_rate": 6.388867780608672e-05, + "loss": 0.0534, + "step": 79180 + }, + { + "epoch": 5.1808963035655875, + "grad_norm": 0.6812049746513367, + "learning_rate": 6.387985312660582e-05, + "loss": 0.0643, + "step": 79190 + }, + { + "epoch": 5.181550539744848, + "grad_norm": 0.8211969137191772, + "learning_rate": 6.387102797864349e-05, + "loss": 0.0599, + "step": 79200 + }, + { + "epoch": 5.1822047759241086, + "grad_norm": 1.1284834146499634, + "learning_rate": 6.386220236249757e-05, + "loss": 0.0679, + "step": 79210 + }, + { + "epoch": 5.1828590121033695, + "grad_norm": 0.8889693021774292, + "learning_rate": 6.385337627846596e-05, + "loss": 0.0503, + "step": 79220 + }, + { + "epoch": 5.18351324828263, + "grad_norm": 0.8759040236473083, + "learning_rate": 6.384454972684658e-05, + "loss": 0.0576, + "step": 79230 + }, + { + "epoch": 5.184167484461891, + "grad_norm": 0.808881938457489, + "learning_rate": 6.383572270793733e-05, + "loss": 0.0624, + "step": 79240 + }, + { + "epoch": 5.1848217206411515, + "grad_norm": 0.8137565851211548, + "learning_rate": 6.382689522203616e-05, + "loss": 0.0621, + "step": 79250 + }, + { + "epoch": 5.1854759568204125, + "grad_norm": 1.0435824394226074, + "learning_rate": 6.381806726944101e-05, + "loss": 0.0673, + "step": 79260 + }, + { + "epoch": 5.186130192999673, + "grad_norm": 0.6479164361953735, + "learning_rate": 6.380923885044985e-05, + "loss": 0.0537, + "step": 79270 + }, + { + "epoch": 5.186784429178934, + "grad_norm": 0.784147322177887, + "learning_rate": 6.380040996536067e-05, + "loss": 0.0504, + "step": 79280 + }, + { + "epoch": 5.1874386653581945, + "grad_norm": 0.8886092901229858, + "learning_rate": 6.379158061447145e-05, + "loss": 0.0666, + "step": 79290 + }, + { + "epoch": 5.188092901537455, + "grad_norm": 0.8956160545349121, + "learning_rate": 6.378275079808022e-05, + "loss": 0.0645, + "step": 79300 + }, + { + "epoch": 5.188747137716716, + "grad_norm": 0.7196115851402283, + "learning_rate": 6.3773920516485e-05, + "loss": 0.0659, + "step": 79310 + }, + { + "epoch": 5.1894013738959766, + "grad_norm": 0.9113817811012268, + "learning_rate": 6.376508976998386e-05, + "loss": 0.0635, + "step": 79320 + }, + { + "epoch": 5.1900556100752375, + "grad_norm": 1.1962367296218872, + "learning_rate": 6.375625855887481e-05, + "loss": 0.0569, + "step": 79330 + }, + { + "epoch": 5.190709846254498, + "grad_norm": 0.8185920715332031, + "learning_rate": 6.374742688345598e-05, + "loss": 0.0622, + "step": 79340 + }, + { + "epoch": 5.191364082433759, + "grad_norm": 0.9433519244194031, + "learning_rate": 6.373859474402542e-05, + "loss": 0.0622, + "step": 79350 + }, + { + "epoch": 5.1920183186130195, + "grad_norm": 0.8798799514770508, + "learning_rate": 6.372976214088127e-05, + "loss": 0.0532, + "step": 79360 + }, + { + "epoch": 5.19267255479228, + "grad_norm": 0.9233197569847107, + "learning_rate": 6.372092907432163e-05, + "loss": 0.0551, + "step": 79370 + }, + { + "epoch": 5.193326790971541, + "grad_norm": 1.0525169372558594, + "learning_rate": 6.371209554464465e-05, + "loss": 0.0567, + "step": 79380 + }, + { + "epoch": 5.193981027150802, + "grad_norm": 0.9267981648445129, + "learning_rate": 6.37032615521485e-05, + "loss": 0.0595, + "step": 79390 + }, + { + "epoch": 5.1946352633300625, + "grad_norm": 0.8152212500572205, + "learning_rate": 6.369442709713132e-05, + "loss": 0.0625, + "step": 79400 + }, + { + "epoch": 5.195289499509323, + "grad_norm": 0.7746557593345642, + "learning_rate": 6.36855921798913e-05, + "loss": 0.0582, + "step": 79410 + }, + { + "epoch": 5.195943735688584, + "grad_norm": 0.8332052826881409, + "learning_rate": 6.367675680072668e-05, + "loss": 0.061, + "step": 79420 + }, + { + "epoch": 5.196597971867845, + "grad_norm": 0.7936050295829773, + "learning_rate": 6.366792095993563e-05, + "loss": 0.0552, + "step": 79430 + }, + { + "epoch": 5.197252208047105, + "grad_norm": 0.8946577906608582, + "learning_rate": 6.365908465781641e-05, + "loss": 0.063, + "step": 79440 + }, + { + "epoch": 5.197906444226366, + "grad_norm": 1.0016376972198486, + "learning_rate": 6.365024789466725e-05, + "loss": 0.0655, + "step": 79450 + }, + { + "epoch": 5.198560680405627, + "grad_norm": 0.8920219540596008, + "learning_rate": 6.364141067078645e-05, + "loss": 0.0636, + "step": 79460 + }, + { + "epoch": 5.1992149165848875, + "grad_norm": 0.9238928556442261, + "learning_rate": 6.363257298647224e-05, + "loss": 0.0572, + "step": 79470 + }, + { + "epoch": 5.199869152764148, + "grad_norm": 0.8631906509399414, + "learning_rate": 6.362373484202294e-05, + "loss": 0.0568, + "step": 79480 + }, + { + "epoch": 5.200523388943409, + "grad_norm": 0.8432829976081848, + "learning_rate": 6.361489623773686e-05, + "loss": 0.0616, + "step": 79490 + }, + { + "epoch": 5.20117762512267, + "grad_norm": 0.8368816375732422, + "learning_rate": 6.360605717391235e-05, + "loss": 0.0518, + "step": 79500 + }, + { + "epoch": 5.20183186130193, + "grad_norm": 0.8626534342765808, + "learning_rate": 6.35972176508477e-05, + "loss": 0.0623, + "step": 79510 + }, + { + "epoch": 5.202486097481191, + "grad_norm": 0.7735602259635925, + "learning_rate": 6.35883776688413e-05, + "loss": 0.0494, + "step": 79520 + }, + { + "epoch": 5.203140333660452, + "grad_norm": 0.8893035650253296, + "learning_rate": 6.357953722819151e-05, + "loss": 0.0573, + "step": 79530 + }, + { + "epoch": 5.203794569839713, + "grad_norm": 0.8206644058227539, + "learning_rate": 6.357069632919674e-05, + "loss": 0.0615, + "step": 79540 + }, + { + "epoch": 5.204448806018973, + "grad_norm": 1.0286612510681152, + "learning_rate": 6.356185497215537e-05, + "loss": 0.0694, + "step": 79550 + }, + { + "epoch": 5.205103042198234, + "grad_norm": 1.0252892971038818, + "learning_rate": 6.355301315736583e-05, + "loss": 0.0615, + "step": 79560 + }, + { + "epoch": 5.205757278377495, + "grad_norm": 0.8636437058448792, + "learning_rate": 6.354417088512655e-05, + "loss": 0.0534, + "step": 79570 + }, + { + "epoch": 5.206411514556755, + "grad_norm": 0.9313245415687561, + "learning_rate": 6.353532815573599e-05, + "loss": 0.054, + "step": 79580 + }, + { + "epoch": 5.207065750736016, + "grad_norm": 0.9524334073066711, + "learning_rate": 6.35264849694926e-05, + "loss": 0.0564, + "step": 79590 + }, + { + "epoch": 5.207719986915277, + "grad_norm": 0.7033629417419434, + "learning_rate": 6.351764132669486e-05, + "loss": 0.0526, + "step": 79600 + }, + { + "epoch": 5.208374223094538, + "grad_norm": 0.8188180923461914, + "learning_rate": 6.350879722764129e-05, + "loss": 0.0498, + "step": 79610 + }, + { + "epoch": 5.209028459273798, + "grad_norm": 0.8515734672546387, + "learning_rate": 6.349995267263038e-05, + "loss": 0.0608, + "step": 79620 + }, + { + "epoch": 5.209682695453059, + "grad_norm": 0.7757560014724731, + "learning_rate": 6.349110766196065e-05, + "loss": 0.0531, + "step": 79630 + }, + { + "epoch": 5.21033693163232, + "grad_norm": 0.9493059515953064, + "learning_rate": 6.348226219593066e-05, + "loss": 0.0522, + "step": 79640 + }, + { + "epoch": 5.21099116781158, + "grad_norm": 0.7367783188819885, + "learning_rate": 6.347341627483897e-05, + "loss": 0.0652, + "step": 79650 + }, + { + "epoch": 5.211645403990841, + "grad_norm": 0.8916527628898621, + "learning_rate": 6.346456989898415e-05, + "loss": 0.0641, + "step": 79660 + }, + { + "epoch": 5.212299640170102, + "grad_norm": 0.9219633936882019, + "learning_rate": 6.345572306866477e-05, + "loss": 0.0557, + "step": 79670 + }, + { + "epoch": 5.212953876349362, + "grad_norm": 0.934571385383606, + "learning_rate": 6.344687578417945e-05, + "loss": 0.0571, + "step": 79680 + }, + { + "epoch": 5.213608112528623, + "grad_norm": 0.8010873198509216, + "learning_rate": 6.343802804582681e-05, + "loss": 0.0538, + "step": 79690 + }, + { + "epoch": 5.214262348707884, + "grad_norm": 1.0014433860778809, + "learning_rate": 6.342917985390548e-05, + "loss": 0.0588, + "step": 79700 + }, + { + "epoch": 5.214916584887145, + "grad_norm": 0.7398951649665833, + "learning_rate": 6.342033120871411e-05, + "loss": 0.0599, + "step": 79710 + }, + { + "epoch": 5.215570821066405, + "grad_norm": 0.9867905974388123, + "learning_rate": 6.341148211055138e-05, + "loss": 0.0536, + "step": 79720 + }, + { + "epoch": 5.216225057245666, + "grad_norm": 0.8380576372146606, + "learning_rate": 6.340263255971594e-05, + "loss": 0.0582, + "step": 79730 + }, + { + "epoch": 5.216879293424927, + "grad_norm": 0.7339340448379517, + "learning_rate": 6.33937825565065e-05, + "loss": 0.0551, + "step": 79740 + }, + { + "epoch": 5.217533529604187, + "grad_norm": 0.9456587433815002, + "learning_rate": 6.338493210122177e-05, + "loss": 0.0566, + "step": 79750 + }, + { + "epoch": 5.218187765783448, + "grad_norm": 0.8973842859268188, + "learning_rate": 6.33760811941605e-05, + "loss": 0.0538, + "step": 79760 + }, + { + "epoch": 5.218842001962709, + "grad_norm": 1.1991941928863525, + "learning_rate": 6.336722983562138e-05, + "loss": 0.0595, + "step": 79770 + }, + { + "epoch": 5.21949623814197, + "grad_norm": 0.8048141598701477, + "learning_rate": 6.335837802590322e-05, + "loss": 0.0571, + "step": 79780 + }, + { + "epoch": 5.22015047432123, + "grad_norm": 0.7296687960624695, + "learning_rate": 6.334952576530475e-05, + "loss": 0.0605, + "step": 79790 + }, + { + "epoch": 5.220804710500491, + "grad_norm": 1.0618641376495361, + "learning_rate": 6.334067305412479e-05, + "loss": 0.0621, + "step": 79800 + }, + { + "epoch": 5.221458946679752, + "grad_norm": 1.0293445587158203, + "learning_rate": 6.333181989266213e-05, + "loss": 0.0624, + "step": 79810 + }, + { + "epoch": 5.222113182859012, + "grad_norm": 0.7226553559303284, + "learning_rate": 6.332296628121557e-05, + "loss": 0.0597, + "step": 79820 + }, + { + "epoch": 5.222767419038273, + "grad_norm": 0.8370351195335388, + "learning_rate": 6.331411222008397e-05, + "loss": 0.0536, + "step": 79830 + }, + { + "epoch": 5.223421655217534, + "grad_norm": 0.97150719165802, + "learning_rate": 6.330525770956615e-05, + "loss": 0.0544, + "step": 79840 + }, + { + "epoch": 5.224075891396795, + "grad_norm": 0.9659034013748169, + "learning_rate": 6.3296402749961e-05, + "loss": 0.0608, + "step": 79850 + }, + { + "epoch": 5.224730127576055, + "grad_norm": 0.9073391556739807, + "learning_rate": 6.328754734156737e-05, + "loss": 0.0587, + "step": 79860 + }, + { + "epoch": 5.225384363755316, + "grad_norm": 0.9242889881134033, + "learning_rate": 6.327869148468418e-05, + "loss": 0.0547, + "step": 79870 + }, + { + "epoch": 5.226038599934577, + "grad_norm": 0.8243046998977661, + "learning_rate": 6.326983517961033e-05, + "loss": 0.06, + "step": 79880 + }, + { + "epoch": 5.226692836113837, + "grad_norm": 0.8463775515556335, + "learning_rate": 6.326097842664473e-05, + "loss": 0.0539, + "step": 79890 + }, + { + "epoch": 5.227347072293098, + "grad_norm": 0.8504155278205872, + "learning_rate": 6.325212122608635e-05, + "loss": 0.0629, + "step": 79900 + }, + { + "epoch": 5.228001308472359, + "grad_norm": 0.6780961751937866, + "learning_rate": 6.324326357823413e-05, + "loss": 0.0494, + "step": 79910 + }, + { + "epoch": 5.22865554465162, + "grad_norm": 0.8905619978904724, + "learning_rate": 6.3234405483387e-05, + "loss": 0.0537, + "step": 79920 + }, + { + "epoch": 5.22930978083088, + "grad_norm": 0.8399018049240112, + "learning_rate": 6.3225546941844e-05, + "loss": 0.0542, + "step": 79930 + }, + { + "epoch": 5.229964017010141, + "grad_norm": 1.064653992652893, + "learning_rate": 6.32166879539041e-05, + "loss": 0.0607, + "step": 79940 + }, + { + "epoch": 5.230618253189402, + "grad_norm": 0.7720884084701538, + "learning_rate": 6.320782851986631e-05, + "loss": 0.047, + "step": 79950 + }, + { + "epoch": 5.231272489368662, + "grad_norm": 0.7760630249977112, + "learning_rate": 6.319896864002968e-05, + "loss": 0.0611, + "step": 79960 + }, + { + "epoch": 5.231926725547923, + "grad_norm": 0.9401276111602783, + "learning_rate": 6.319010831469324e-05, + "loss": 0.0509, + "step": 79970 + }, + { + "epoch": 5.232580961727184, + "grad_norm": 0.865756630897522, + "learning_rate": 6.318124754415605e-05, + "loss": 0.0533, + "step": 79980 + }, + { + "epoch": 5.233235197906445, + "grad_norm": 0.7279103994369507, + "learning_rate": 6.317238632871718e-05, + "loss": 0.0492, + "step": 79990 + }, + { + "epoch": 5.233889434085705, + "grad_norm": 0.7956836223602295, + "learning_rate": 6.316352466867574e-05, + "loss": 0.0549, + "step": 80000 + } + ], + "logging_steps": 10, + "max_steps": 180000, + "num_input_tokens_seen": 0, + "num_train_epochs": 12, + "save_steps": 10000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +}