{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 3750, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.008, "grad_norm": 4.96875, "learning_rate": 7.964601769911505e-06, "loss": 1.4112, "step": 10 }, { "epoch": 0.016, "grad_norm": 5.3125, "learning_rate": 1.6814159292035402e-05, "loss": 1.4365, "step": 20 }, { "epoch": 0.024, "grad_norm": 2.265625, "learning_rate": 2.5663716814159294e-05, "loss": 1.2118, "step": 30 }, { "epoch": 0.032, "grad_norm": 2.78125, "learning_rate": 3.451327433628319e-05, "loss": 1.1222, "step": 40 }, { "epoch": 0.04, "grad_norm": 1.953125, "learning_rate": 4.3362831858407084e-05, "loss": 1.0566, "step": 50 }, { "epoch": 0.048, "grad_norm": 2.75, "learning_rate": 5.221238938053098e-05, "loss": 1.0392, "step": 60 }, { "epoch": 0.056, "grad_norm": 2.578125, "learning_rate": 6.106194690265487e-05, "loss": 1.1753, "step": 70 }, { "epoch": 0.064, "grad_norm": 1.421875, "learning_rate": 6.991150442477876e-05, "loss": 1.0422, "step": 80 }, { "epoch": 0.072, "grad_norm": 3.765625, "learning_rate": 7.876106194690266e-05, "loss": 1.026, "step": 90 }, { "epoch": 0.08, "grad_norm": 2.609375, "learning_rate": 8.761061946902655e-05, "loss": 1.0633, "step": 100 }, { "epoch": 0.088, "grad_norm": 3.171875, "learning_rate": 9.646017699115044e-05, "loss": 1.1138, "step": 110 }, { "epoch": 0.096, "grad_norm": 3.609375, "learning_rate": 9.999932848660433e-05, "loss": 1.1079, "step": 120 }, { "epoch": 0.104, "grad_norm": 2.765625, "learning_rate": 9.99952248589506e-05, "loss": 1.0538, "step": 130 }, { "epoch": 0.112, "grad_norm": 3.5625, "learning_rate": 9.998739097245067e-05, "loss": 0.9637, "step": 140 }, { "epoch": 0.12, "grad_norm": 5.6875, "learning_rate": 9.997582741160886e-05, "loss": 0.8554, "step": 150 }, { "epoch": 0.128, "grad_norm": 7.40625, "learning_rate": 9.99605350392091e-05, "loss": 0.6704, "step": 160 }, { "epoch": 0.136, "grad_norm": 7.03125, "learning_rate": 9.994151499625049e-05, "loss": 0.8075, "step": 170 }, { "epoch": 0.144, "grad_norm": 3.828125, "learning_rate": 9.991876870186222e-05, "loss": 0.7331, "step": 180 }, { "epoch": 0.152, "grad_norm": 3.890625, "learning_rate": 9.98922978531977e-05, "loss": 0.7264, "step": 190 }, { "epoch": 0.16, "grad_norm": 4.15625, "learning_rate": 9.986210442530788e-05, "loss": 0.5792, "step": 200 }, { "epoch": 0.168, "grad_norm": 5.53125, "learning_rate": 9.982819067099396e-05, "loss": 0.6228, "step": 210 }, { "epoch": 0.176, "grad_norm": 5.875, "learning_rate": 9.979055912063925e-05, "loss": 0.7417, "step": 220 }, { "epoch": 0.184, "grad_norm": 3.71875, "learning_rate": 9.974921258202036e-05, "loss": 0.472, "step": 230 }, { "epoch": 0.192, "grad_norm": 4.15625, "learning_rate": 9.970415414009773e-05, "loss": 0.6284, "step": 240 }, { "epoch": 0.2, "grad_norm": 5.1875, "learning_rate": 9.965538715678548e-05, "loss": 0.5349, "step": 250 }, { "epoch": 0.208, "grad_norm": 4.59375, "learning_rate": 9.960291527070051e-05, "loss": 0.5165, "step": 260 }, { "epoch": 0.216, "grad_norm": 4.0625, "learning_rate": 9.954674239689109e-05, "loss": 0.5656, "step": 270 }, { "epoch": 0.224, "grad_norm": 3.875, "learning_rate": 9.948687272654464e-05, "loss": 0.6713, "step": 280 }, { "epoch": 0.232, "grad_norm": 5.375, "learning_rate": 9.942331072667517e-05, "loss": 0.4347, "step": 290 }, { "epoch": 0.24, "grad_norm": 5.875, "learning_rate": 9.935606113978981e-05, "loss": 0.4404, "step": 300 }, { "epoch": 0.248, "grad_norm": 4.28125, "learning_rate": 9.92851289835351e-05, "loss": 0.4865, "step": 310 }, { "epoch": 0.256, "grad_norm": 6.46875, "learning_rate": 9.921051955032253e-05, "loss": 0.5393, "step": 320 }, { "epoch": 0.264, "grad_norm": 9.0625, "learning_rate": 9.913223840693375e-05, "loss": 0.5358, "step": 330 }, { "epoch": 0.272, "grad_norm": 5.0625, "learning_rate": 9.905029139410508e-05, "loss": 0.5756, "step": 340 }, { "epoch": 0.28, "grad_norm": 10.625, "learning_rate": 9.896468462609186e-05, "loss": 0.4554, "step": 350 }, { "epoch": 0.288, "grad_norm": 3.6875, "learning_rate": 9.887542449021214e-05, "loss": 0.3889, "step": 360 }, { "epoch": 0.296, "grad_norm": 6.25, "learning_rate": 9.878251764637023e-05, "loss": 0.4468, "step": 370 }, { "epoch": 0.304, "grad_norm": 5.5, "learning_rate": 9.868597102655968e-05, "loss": 0.4612, "step": 380 }, { "epoch": 0.312, "grad_norm": 5.4375, "learning_rate": 9.858579183434605e-05, "loss": 0.6069, "step": 390 }, { "epoch": 0.32, "grad_norm": 5.84375, "learning_rate": 9.848198754432959e-05, "loss": 0.4464, "step": 400 }, { "epoch": 0.328, "grad_norm": 5.125, "learning_rate": 9.837456590158738e-05, "loss": 0.5598, "step": 410 }, { "epoch": 0.336, "grad_norm": 3.671875, "learning_rate": 9.826353492109555e-05, "loss": 0.4534, "step": 420 }, { "epoch": 0.344, "grad_norm": 7.34375, "learning_rate": 9.814890288713121e-05, "loss": 0.5155, "step": 430 }, { "epoch": 0.352, "grad_norm": 6.875, "learning_rate": 9.803067835265436e-05, "loss": 0.484, "step": 440 }, { "epoch": 0.36, "grad_norm": 3.875, "learning_rate": 9.790887013866973e-05, "loss": 0.4077, "step": 450 }, { "epoch": 0.368, "grad_norm": 5.4375, "learning_rate": 9.778348733356868e-05, "loss": 0.3779, "step": 460 }, { "epoch": 0.376, "grad_norm": 5.34375, "learning_rate": 9.765453929245096e-05, "loss": 0.521, "step": 470 }, { "epoch": 0.384, "grad_norm": 4.8125, "learning_rate": 9.752203563642688e-05, "loss": 0.4114, "step": 480 }, { "epoch": 0.392, "grad_norm": 8.25, "learning_rate": 9.73859862518993e-05, "loss": 0.4361, "step": 490 }, { "epoch": 0.4, "grad_norm": 7.40625, "learning_rate": 9.724640128982605e-05, "loss": 0.518, "step": 500 }, { "epoch": 0.408, "grad_norm": 5.5625, "learning_rate": 9.710329116496259e-05, "loss": 0.4413, "step": 510 }, { "epoch": 0.416, "grad_norm": 5.46875, "learning_rate": 9.695666655508483e-05, "loss": 0.4276, "step": 520 }, { "epoch": 0.424, "grad_norm": 5.375, "learning_rate": 9.680653840019259e-05, "loss": 0.4476, "step": 530 }, { "epoch": 0.432, "grad_norm": 5.875, "learning_rate": 9.665291790169311e-05, "loss": 0.3562, "step": 540 }, { "epoch": 0.44, "grad_norm": 6.4375, "learning_rate": 9.649581652156559e-05, "loss": 0.4511, "step": 550 }, { "epoch": 0.448, "grad_norm": 5.1875, "learning_rate": 9.633524598150568e-05, "loss": 0.3985, "step": 560 }, { "epoch": 0.456, "grad_norm": 4.1875, "learning_rate": 9.617121826205116e-05, "loss": 0.5117, "step": 570 }, { "epoch": 0.464, "grad_norm": 5.5, "learning_rate": 9.600374560168783e-05, "loss": 0.4569, "step": 580 }, { "epoch": 0.472, "grad_norm": 4.5, "learning_rate": 9.583284049593652e-05, "loss": 0.449, "step": 590 }, { "epoch": 0.48, "grad_norm": 4.25, "learning_rate": 9.56585156964207e-05, "loss": 0.4769, "step": 600 }, { "epoch": 0.488, "grad_norm": 5.9375, "learning_rate": 9.548078420991506e-05, "loss": 0.5081, "step": 610 }, { "epoch": 0.496, "grad_norm": 7.4375, "learning_rate": 9.529965929737506e-05, "loss": 0.4803, "step": 620 }, { "epoch": 0.504, "grad_norm": 4.59375, "learning_rate": 9.511515447294748e-05, "loss": 0.5015, "step": 630 }, { "epoch": 0.512, "grad_norm": 6.71875, "learning_rate": 9.49272835029621e-05, "loss": 0.5174, "step": 640 }, { "epoch": 0.52, "grad_norm": 4.875, "learning_rate": 9.47360604049046e-05, "loss": 0.4957, "step": 650 }, { "epoch": 0.528, "grad_norm": 4.71875, "learning_rate": 9.454149944637064e-05, "loss": 0.379, "step": 660 }, { "epoch": 0.536, "grad_norm": 4.96875, "learning_rate": 9.434361514400132e-05, "loss": 0.4857, "step": 670 }, { "epoch": 0.544, "grad_norm": 5.71875, "learning_rate": 9.414242226240012e-05, "loss": 0.4595, "step": 680 }, { "epoch": 0.552, "grad_norm": 6.8125, "learning_rate": 9.393793581303116e-05, "loss": 0.4157, "step": 690 }, { "epoch": 0.56, "grad_norm": 5.03125, "learning_rate": 9.37301710530993e-05, "loss": 0.4432, "step": 700 }, { "epoch": 0.568, "grad_norm": 6.75, "learning_rate": 9.351914348441169e-05, "loss": 0.4695, "step": 710 }, { "epoch": 0.576, "grad_norm": 2.890625, "learning_rate": 9.330486885222114e-05, "loss": 0.3493, "step": 720 }, { "epoch": 0.584, "grad_norm": 3.4375, "learning_rate": 9.308736314405134e-05, "loss": 0.4304, "step": 730 }, { "epoch": 0.592, "grad_norm": 3.1875, "learning_rate": 9.286664258850402e-05, "loss": 0.5057, "step": 740 }, { "epoch": 0.6, "grad_norm": 4.375, "learning_rate": 9.264272365404805e-05, "loss": 0.4159, "step": 750 }, { "epoch": 0.608, "grad_norm": 4.3125, "learning_rate": 9.241562304779072e-05, "loss": 0.3647, "step": 760 }, { "epoch": 0.616, "grad_norm": 10.3125, "learning_rate": 9.21853577142312e-05, "loss": 0.4851, "step": 770 }, { "epoch": 0.624, "grad_norm": 6.3125, "learning_rate": 9.195194483399625e-05, "loss": 0.5071, "step": 780 }, { "epoch": 0.632, "grad_norm": 4.34375, "learning_rate": 9.17154018225583e-05, "loss": 0.3939, "step": 790 }, { "epoch": 0.64, "grad_norm": 4.03125, "learning_rate": 9.147574632893611e-05, "loss": 0.3762, "step": 800 }, { "epoch": 0.648, "grad_norm": 3.1875, "learning_rate": 9.12329962343779e-05, "loss": 0.4427, "step": 810 }, { "epoch": 0.656, "grad_norm": 3.890625, "learning_rate": 9.098716965102716e-05, "loss": 0.357, "step": 820 }, { "epoch": 0.664, "grad_norm": 8.25, "learning_rate": 9.073828492057133e-05, "loss": 0.4071, "step": 830 }, { "epoch": 0.672, "grad_norm": 5.78125, "learning_rate": 9.048636061287325e-05, "loss": 0.4037, "step": 840 }, { "epoch": 0.68, "grad_norm": 3.84375, "learning_rate": 9.023141552458559e-05, "loss": 0.3884, "step": 850 }, { "epoch": 0.688, "grad_norm": 3.453125, "learning_rate": 8.997346867774839e-05, "loss": 0.3641, "step": 860 }, { "epoch": 0.696, "grad_norm": 7.875, "learning_rate": 8.97125393183699e-05, "loss": 0.5387, "step": 870 }, { "epoch": 0.704, "grad_norm": 4.6875, "learning_rate": 8.94486469149904e-05, "loss": 0.4085, "step": 880 }, { "epoch": 0.712, "grad_norm": 6.03125, "learning_rate": 8.918181115722976e-05, "loss": 0.4055, "step": 890 }, { "epoch": 0.72, "grad_norm": 7.375, "learning_rate": 8.891205195431831e-05, "loss": 0.42, "step": 900 }, { "epoch": 0.728, "grad_norm": 3.21875, "learning_rate": 8.863938943361128e-05, "loss": 0.3372, "step": 910 }, { "epoch": 0.736, "grad_norm": 3.65625, "learning_rate": 8.836384393908721e-05, "loss": 0.4544, "step": 920 }, { "epoch": 0.744, "grad_norm": 3.703125, "learning_rate": 8.808543602982993e-05, "loss": 0.4979, "step": 930 }, { "epoch": 0.752, "grad_norm": 2.90625, "learning_rate": 8.780418647849458e-05, "loss": 0.3366, "step": 940 }, { "epoch": 0.76, "grad_norm": 4.53125, "learning_rate": 8.752011626975781e-05, "loss": 0.3778, "step": 950 }, { "epoch": 0.768, "grad_norm": 3.4375, "learning_rate": 8.723324659875201e-05, "loss": 0.4498, "step": 960 }, { "epoch": 0.776, "grad_norm": 4.90625, "learning_rate": 8.694359886948384e-05, "loss": 0.4232, "step": 970 }, { "epoch": 0.784, "grad_norm": 4.15625, "learning_rate": 8.665119469323737e-05, "loss": 0.3602, "step": 980 }, { "epoch": 0.792, "grad_norm": 5.375, "learning_rate": 8.635605588696148e-05, "loss": 0.4095, "step": 990 }, { "epoch": 0.8, "grad_norm": 3.5625, "learning_rate": 8.605820447164206e-05, "loss": 0.405, "step": 1000 }, { "epoch": 0.808, "grad_norm": 4.40625, "learning_rate": 8.575766267065905e-05, "loss": 0.3137, "step": 1010 }, { "epoch": 0.816, "grad_norm": 5.65625, "learning_rate": 8.54544529081283e-05, "loss": 0.3701, "step": 1020 }, { "epoch": 0.824, "grad_norm": 5.0, "learning_rate": 8.514859780722833e-05, "loss": 0.3759, "step": 1030 }, { "epoch": 0.832, "grad_norm": 4.9375, "learning_rate": 8.484012018851246e-05, "loss": 0.3032, "step": 1040 }, { "epoch": 0.84, "grad_norm": 4.375, "learning_rate": 8.452904306820618e-05, "loss": 0.4171, "step": 1050 }, { "epoch": 0.848, "grad_norm": 4.53125, "learning_rate": 8.421538965648966e-05, "loss": 0.374, "step": 1060 }, { "epoch": 0.856, "grad_norm": 4.3125, "learning_rate": 8.389918335576623e-05, "loss": 0.3358, "step": 1070 }, { "epoch": 0.864, "grad_norm": 4.21875, "learning_rate": 8.358044775891605e-05, "loss": 0.3586, "step": 1080 }, { "epoch": 0.872, "grad_norm": 4.15625, "learning_rate": 8.325920664753595e-05, "loss": 0.4036, "step": 1090 }, { "epoch": 0.88, "grad_norm": 4.625, "learning_rate": 8.293548399016491e-05, "loss": 0.3673, "step": 1100 }, { "epoch": 0.888, "grad_norm": 2.625, "learning_rate": 8.260930394049583e-05, "loss": 0.3388, "step": 1110 }, { "epoch": 0.896, "grad_norm": 4.78125, "learning_rate": 8.228069083557328e-05, "loss": 0.427, "step": 1120 }, { "epoch": 0.904, "grad_norm": 4.21875, "learning_rate": 8.194966919397767e-05, "loss": 0.3926, "step": 1130 }, { "epoch": 0.912, "grad_norm": 5.84375, "learning_rate": 8.161626371399591e-05, "loss": 0.3654, "step": 1140 }, { "epoch": 0.92, "grad_norm": 3.71875, "learning_rate": 8.128049927177854e-05, "loss": 0.4047, "step": 1150 }, { "epoch": 0.928, "grad_norm": 5.0, "learning_rate": 8.094240091948375e-05, "loss": 0.4114, "step": 1160 }, { "epoch": 0.936, "grad_norm": 6.625, "learning_rate": 8.06019938834081e-05, "loss": 0.4485, "step": 1170 }, { "epoch": 0.944, "grad_norm": 4.4375, "learning_rate": 8.025930356210439e-05, "loss": 0.3833, "step": 1180 }, { "epoch": 0.952, "grad_norm": 3.171875, "learning_rate": 7.991435552448657e-05, "loss": 0.3742, "step": 1190 }, { "epoch": 0.96, "grad_norm": 4.65625, "learning_rate": 7.956717550792199e-05, "loss": 0.3284, "step": 1200 }, { "epoch": 0.968, "grad_norm": 4.4375, "learning_rate": 7.921778941631113e-05, "loss": 0.3862, "step": 1210 }, { "epoch": 0.976, "grad_norm": 8.9375, "learning_rate": 7.886622331815477e-05, "loss": 0.4144, "step": 1220 }, { "epoch": 0.984, "grad_norm": 3.875, "learning_rate": 7.851250344460902e-05, "loss": 0.3654, "step": 1230 }, { "epoch": 0.992, "grad_norm": 2.59375, "learning_rate": 7.815665618752812e-05, "loss": 0.3808, "step": 1240 }, { "epoch": 1.0, "grad_norm": 5.0625, "learning_rate": 7.77987080974953e-05, "loss": 0.3482, "step": 1250 }, { "epoch": 1.008, "grad_norm": 3.28125, "learning_rate": 7.743868588184176e-05, "loss": 0.3312, "step": 1260 }, { "epoch": 1.016, "grad_norm": 4.375, "learning_rate": 7.707661640265401e-05, "loss": 0.37, "step": 1270 }, { "epoch": 1.024, "grad_norm": 4.40625, "learning_rate": 7.67125266747696e-05, "loss": 0.3253, "step": 1280 }, { "epoch": 1.032, "grad_norm": 4.0625, "learning_rate": 7.634644386376149e-05, "loss": 0.4361, "step": 1290 }, { "epoch": 1.04, "grad_norm": 3.953125, "learning_rate": 7.597839528391114e-05, "loss": 0.3981, "step": 1300 }, { "epoch": 1.048, "grad_norm": 7.6875, "learning_rate": 7.560840839617056e-05, "loss": 0.3634, "step": 1310 }, { "epoch": 1.056, "grad_norm": 4.03125, "learning_rate": 7.523651080611341e-05, "loss": 0.3653, "step": 1320 }, { "epoch": 1.064, "grad_norm": 3.09375, "learning_rate": 7.48627302618752e-05, "loss": 0.3433, "step": 1330 }, { "epoch": 1.072, "grad_norm": 4.21875, "learning_rate": 7.448709465208299e-05, "loss": 0.3587, "step": 1340 }, { "epoch": 1.08, "grad_norm": 5.375, "learning_rate": 7.410963200377458e-05, "loss": 0.346, "step": 1350 }, { "epoch": 1.088, "grad_norm": 4.0625, "learning_rate": 7.373037048030731e-05, "loss": 0.4562, "step": 1360 }, { "epoch": 1.096, "grad_norm": 3.75, "learning_rate": 7.334933837925675e-05, "loss": 0.4333, "step": 1370 }, { "epoch": 1.104, "grad_norm": 3.46875, "learning_rate": 7.296656413030531e-05, "loss": 0.306, "step": 1380 }, { "epoch": 1.112, "grad_norm": 5.25, "learning_rate": 7.25820762931211e-05, "loss": 0.4095, "step": 1390 }, { "epoch": 1.12, "grad_norm": 3.578125, "learning_rate": 7.219590355522697e-05, "loss": 0.369, "step": 1400 }, { "epoch": 1.1280000000000001, "grad_norm": 4.96875, "learning_rate": 7.180807472986009e-05, "loss": 0.3763, "step": 1410 }, { "epoch": 1.1360000000000001, "grad_norm": 5.25, "learning_rate": 7.141861875382215e-05, "loss": 0.4269, "step": 1420 }, { "epoch": 1.144, "grad_norm": 3.53125, "learning_rate": 7.102756468532027e-05, "loss": 0.4017, "step": 1430 }, { "epoch": 1.152, "grad_norm": 4.6875, "learning_rate": 7.063494170179898e-05, "loss": 0.3601, "step": 1440 }, { "epoch": 1.16, "grad_norm": 3.8125, "learning_rate": 7.024077909776309e-05, "loss": 0.3678, "step": 1450 }, { "epoch": 1.168, "grad_norm": 5.25, "learning_rate": 6.984510628259212e-05, "loss": 0.3732, "step": 1460 }, { "epoch": 1.176, "grad_norm": 2.140625, "learning_rate": 6.94479527783459e-05, "loss": 0.3332, "step": 1470 }, { "epoch": 1.184, "grad_norm": 4.5, "learning_rate": 6.904934821756184e-05, "loss": 0.3887, "step": 1480 }, { "epoch": 1.192, "grad_norm": 2.296875, "learning_rate": 6.864932234104409e-05, "loss": 0.3196, "step": 1490 }, { "epoch": 1.2, "grad_norm": 4.15625, "learning_rate": 6.824790499564435e-05, "loss": 0.3256, "step": 1500 }, { "epoch": 1.208, "grad_norm": 3.6875, "learning_rate": 6.784512613203511e-05, "loss": 0.3074, "step": 1510 }, { "epoch": 1.216, "grad_norm": 3.0, "learning_rate": 6.744101580247481e-05, "loss": 0.35, "step": 1520 }, { "epoch": 1.224, "grad_norm": 3.828125, "learning_rate": 6.703560415856565e-05, "loss": 0.3731, "step": 1530 }, { "epoch": 1.232, "grad_norm": 5.625, "learning_rate": 6.662892144900388e-05, "loss": 0.3769, "step": 1540 }, { "epoch": 1.24, "grad_norm": 8.1875, "learning_rate": 6.62209980173229e-05, "loss": 0.4385, "step": 1550 }, { "epoch": 1.248, "grad_norm": 5.9375, "learning_rate": 6.581186429962922e-05, "loss": 0.3563, "step": 1560 }, { "epoch": 1.256, "grad_norm": 3.53125, "learning_rate": 6.54015508223316e-05, "loss": 0.3225, "step": 1570 }, { "epoch": 1.264, "grad_norm": 4.125, "learning_rate": 6.499008819986339e-05, "loss": 0.3246, "step": 1580 }, { "epoch": 1.272, "grad_norm": 4.34375, "learning_rate": 6.457750713239828e-05, "loss": 0.304, "step": 1590 }, { "epoch": 1.28, "grad_norm": 4.75, "learning_rate": 6.41638384035597e-05, "loss": 0.383, "step": 1600 }, { "epoch": 1.288, "grad_norm": 4.46875, "learning_rate": 6.374911287812406e-05, "loss": 0.331, "step": 1610 }, { "epoch": 1.296, "grad_norm": 3.984375, "learning_rate": 6.333336149971776e-05, "loss": 0.3022, "step": 1620 }, { "epoch": 1.304, "grad_norm": 5.53125, "learning_rate": 6.291661528850844e-05, "loss": 0.3257, "step": 1630 }, { "epoch": 1.312, "grad_norm": 4.03125, "learning_rate": 6.249890533889054e-05, "loss": 0.3071, "step": 1640 }, { "epoch": 1.32, "grad_norm": 4.25, "learning_rate": 6.208026281716521e-05, "loss": 0.3833, "step": 1650 }, { "epoch": 1.328, "grad_norm": 3.109375, "learning_rate": 6.166071895921496e-05, "loss": 0.3378, "step": 1660 }, { "epoch": 1.336, "grad_norm": 3.5, "learning_rate": 6.124030506817309e-05, "loss": 0.31, "step": 1670 }, { "epoch": 1.3439999999999999, "grad_norm": 2.65625, "learning_rate": 6.0819052512088057e-05, "loss": 0.3139, "step": 1680 }, { "epoch": 1.3519999999999999, "grad_norm": 4.90625, "learning_rate": 6.039699272158305e-05, "loss": 0.4388, "step": 1690 }, { "epoch": 1.3599999999999999, "grad_norm": 5.59375, "learning_rate": 5.997415718751086e-05, "loss": 0.3989, "step": 1700 }, { "epoch": 1.3679999999999999, "grad_norm": 5.5, "learning_rate": 5.955057745860435e-05, "loss": 0.3977, "step": 1710 }, { "epoch": 1.376, "grad_norm": 6.25, "learning_rate": 5.9126285139122406e-05, "loss": 0.3527, "step": 1720 }, { "epoch": 1.384, "grad_norm": 3.28125, "learning_rate": 5.8701311886491947e-05, "loss": 0.4044, "step": 1730 }, { "epoch": 1.392, "grad_norm": 5.5625, "learning_rate": 5.827568940894593e-05, "loss": 0.374, "step": 1740 }, { "epoch": 1.4, "grad_norm": 3.625, "learning_rate": 5.7849449463157435e-05, "loss": 0.3479, "step": 1750 }, { "epoch": 1.408, "grad_norm": 4.84375, "learning_rate": 5.742262385187028e-05, "loss": 0.3666, "step": 1760 }, { "epoch": 1.416, "grad_norm": 5.4375, "learning_rate": 5.699524442152613e-05, "loss": 0.3707, "step": 1770 }, { "epoch": 1.424, "grad_norm": 4.4375, "learning_rate": 5.656734305988839e-05, "loss": 0.3847, "step": 1780 }, { "epoch": 1.432, "grad_norm": 3.96875, "learning_rate": 5.613895169366292e-05, "loss": 0.3515, "step": 1790 }, { "epoch": 1.44, "grad_norm": 3.734375, "learning_rate": 5.571010228611597e-05, "loss": 0.3763, "step": 1800 }, { "epoch": 1.448, "grad_norm": 5.71875, "learning_rate": 5.528082683468934e-05, "loss": 0.3548, "step": 1810 }, { "epoch": 1.456, "grad_norm": 5.03125, "learning_rate": 5.485115736861288e-05, "loss": 0.3903, "step": 1820 }, { "epoch": 1.464, "grad_norm": 4.375, "learning_rate": 5.442112594651484e-05, "loss": 0.257, "step": 1830 }, { "epoch": 1.472, "grad_norm": 5.46875, "learning_rate": 5.399076465402979e-05, "loss": 0.3424, "step": 1840 }, { "epoch": 1.48, "grad_norm": 3.40625, "learning_rate": 5.356010560140475e-05, "loss": 0.3317, "step": 1850 }, { "epoch": 1.488, "grad_norm": 5.0, "learning_rate": 5.312918092110325e-05, "loss": 0.2753, "step": 1860 }, { "epoch": 1.496, "grad_norm": 3.84375, "learning_rate": 5.269802276540795e-05, "loss": 0.3318, "step": 1870 }, { "epoch": 1.504, "grad_norm": 5.71875, "learning_rate": 5.226666330402164e-05, "loss": 0.3836, "step": 1880 }, { "epoch": 1.512, "grad_norm": 3.671875, "learning_rate": 5.1835134721666956e-05, "loss": 0.3498, "step": 1890 }, { "epoch": 1.52, "grad_norm": 2.90625, "learning_rate": 5.1403469215685094e-05, "loss": 0.4228, "step": 1900 }, { "epoch": 1.528, "grad_norm": 4.4375, "learning_rate": 5.097169899363342e-05, "loss": 0.3703, "step": 1910 }, { "epoch": 1.536, "grad_norm": 3.90625, "learning_rate": 5.053985627088238e-05, "loss": 0.3816, "step": 1920 }, { "epoch": 1.544, "grad_norm": 2.71875, "learning_rate": 5.010797326821189e-05, "loss": 0.3842, "step": 1930 }, { "epoch": 1.552, "grad_norm": 3.875, "learning_rate": 4.9676082209407254e-05, "loss": 0.3848, "step": 1940 }, { "epoch": 1.56, "grad_norm": 3.484375, "learning_rate": 4.924421531885481e-05, "loss": 0.3416, "step": 1950 }, { "epoch": 1.568, "grad_norm": 4.1875, "learning_rate": 4.881240481913773e-05, "loss": 0.3407, "step": 1960 }, { "epoch": 1.576, "grad_norm": 3.921875, "learning_rate": 4.838068292863164e-05, "loss": 0.3319, "step": 1970 }, { "epoch": 1.584, "grad_norm": 4.4375, "learning_rate": 4.7949081859100896e-05, "loss": 0.3979, "step": 1980 }, { "epoch": 1.592, "grad_norm": 4.84375, "learning_rate": 4.7517633813295114e-05, "loss": 0.4184, "step": 1990 }, { "epoch": 1.6, "grad_norm": 3.890625, "learning_rate": 4.708637098254644e-05, "loss": 0.3959, "step": 2000 }, { "epoch": 1.608, "grad_norm": 4.8125, "learning_rate": 4.6655325544367715e-05, "loss": 0.313, "step": 2010 }, { "epoch": 1.616, "grad_norm": 4.21875, "learning_rate": 4.6224529660051593e-05, "loss": 0.3012, "step": 2020 }, { "epoch": 1.624, "grad_norm": 3.734375, "learning_rate": 4.579401547227096e-05, "loss": 0.2532, "step": 2030 }, { "epoch": 1.6320000000000001, "grad_norm": 2.953125, "learning_rate": 4.53638151026807e-05, "loss": 0.2714, "step": 2040 }, { "epoch": 1.6400000000000001, "grad_norm": 4.1875, "learning_rate": 4.493396064952093e-05, "loss": 0.3468, "step": 2050 }, { "epoch": 1.6480000000000001, "grad_norm": 5.125, "learning_rate": 4.450448418522221e-05, "loss": 0.4547, "step": 2060 }, { "epoch": 1.6560000000000001, "grad_norm": 3.5, "learning_rate": 4.4075417754012475e-05, "loss": 0.2839, "step": 2070 }, { "epoch": 1.6640000000000001, "grad_norm": 5.84375, "learning_rate": 4.364679336952609e-05, "loss": 0.3426, "step": 2080 }, { "epoch": 1.6720000000000002, "grad_norm": 4.84375, "learning_rate": 4.321864301241535e-05, "loss": 0.3325, "step": 2090 }, { "epoch": 1.6800000000000002, "grad_norm": 4.46875, "learning_rate": 4.279099862796427e-05, "loss": 0.314, "step": 2100 }, { "epoch": 1.688, "grad_norm": 2.8125, "learning_rate": 4.23638921237051e-05, "loss": 0.4189, "step": 2110 }, { "epoch": 1.696, "grad_norm": 3.59375, "learning_rate": 4.1937355367037516e-05, "loss": 0.3436, "step": 2120 }, { "epoch": 1.704, "grad_norm": 6.21875, "learning_rate": 4.151142018285112e-05, "loss": 0.3681, "step": 2130 }, { "epoch": 1.712, "grad_norm": 4.0625, "learning_rate": 4.1086118351150785e-05, "loss": 0.3716, "step": 2140 }, { "epoch": 1.72, "grad_norm": 2.703125, "learning_rate": 4.066148160468543e-05, "loss": 0.2761, "step": 2150 }, { "epoch": 1.728, "grad_norm": 3.765625, "learning_rate": 4.023754162658051e-05, "loss": 0.2904, "step": 2160 }, { "epoch": 1.736, "grad_norm": 5.4375, "learning_rate": 3.981433004797395e-05, "loss": 0.3563, "step": 2170 }, { "epoch": 1.744, "grad_norm": 4.125, "learning_rate": 3.939187844565616e-05, "loss": 0.3248, "step": 2180 }, { "epoch": 1.752, "grad_norm": 5.71875, "learning_rate": 3.897021833971386e-05, "loss": 0.3246, "step": 2190 }, { "epoch": 1.76, "grad_norm": 4.21875, "learning_rate": 3.8549381191178516e-05, "loss": 0.4073, "step": 2200 }, { "epoch": 1.768, "grad_norm": 4.4375, "learning_rate": 3.8129398399678814e-05, "loss": 0.4147, "step": 2210 }, { "epoch": 1.776, "grad_norm": 4.4375, "learning_rate": 3.771030130109785e-05, "loss": 0.2378, "step": 2220 }, { "epoch": 1.784, "grad_norm": 3.65625, "learning_rate": 3.729212116523518e-05, "loss": 0.3305, "step": 2230 }, { "epoch": 1.792, "grad_norm": 3.09375, "learning_rate": 3.6874889193473646e-05, "loss": 0.3865, "step": 2240 }, { "epoch": 1.8, "grad_norm": 4.5, "learning_rate": 3.64586365164514e-05, "loss": 0.3443, "step": 2250 }, { "epoch": 1.808, "grad_norm": 2.71875, "learning_rate": 3.604339419173912e-05, "loss": 0.2762, "step": 2260 }, { "epoch": 1.8159999999999998, "grad_norm": 5.15625, "learning_rate": 3.5629193201522794e-05, "loss": 0.3787, "step": 2270 }, { "epoch": 1.8239999999999998, "grad_norm": 5.53125, "learning_rate": 3.521606445029208e-05, "loss": 0.4157, "step": 2280 }, { "epoch": 1.8319999999999999, "grad_norm": 5.34375, "learning_rate": 3.480403876253432e-05, "loss": 0.3345, "step": 2290 }, { "epoch": 1.8399999999999999, "grad_norm": 3.3125, "learning_rate": 3.4393146880434845e-05, "loss": 0.3111, "step": 2300 }, { "epoch": 1.8479999999999999, "grad_norm": 4.03125, "learning_rate": 3.398341946158311e-05, "loss": 0.3763, "step": 2310 }, { "epoch": 1.8559999999999999, "grad_norm": 3.703125, "learning_rate": 3.357488707668529e-05, "loss": 0.3246, "step": 2320 }, { "epoch": 1.8639999999999999, "grad_norm": 4.65625, "learning_rate": 3.316758020728327e-05, "loss": 0.3852, "step": 2330 }, { "epoch": 1.8719999999999999, "grad_norm": 3.859375, "learning_rate": 3.276152924348046e-05, "loss": 0.3295, "step": 2340 }, { "epoch": 1.88, "grad_norm": 5.15625, "learning_rate": 3.2356764481674254e-05, "loss": 0.3567, "step": 2350 }, { "epoch": 1.888, "grad_norm": 3.421875, "learning_rate": 3.1953316122295554e-05, "loss": 0.3091, "step": 2360 }, { "epoch": 1.896, "grad_norm": 3.265625, "learning_rate": 3.1551214267555416e-05, "loss": 0.3847, "step": 2370 }, { "epoch": 1.904, "grad_norm": 6.09375, "learning_rate": 3.1150488919199124e-05, "loss": 0.3958, "step": 2380 }, { "epoch": 1.912, "grad_norm": 4.03125, "learning_rate": 3.075116997626764e-05, "loss": 0.384, "step": 2390 }, { "epoch": 1.92, "grad_norm": 4.0625, "learning_rate": 3.0353287232866736e-05, "loss": 0.3349, "step": 2400 }, { "epoch": 1.928, "grad_norm": 4.375, "learning_rate": 2.995687037594408e-05, "loss": 0.3801, "step": 2410 }, { "epoch": 1.936, "grad_norm": 5.09375, "learning_rate": 2.9561948983074174e-05, "loss": 0.3281, "step": 2420 }, { "epoch": 1.944, "grad_norm": 6.8125, "learning_rate": 2.916855252025149e-05, "loss": 0.3549, "step": 2430 }, { "epoch": 1.952, "grad_norm": 4.125, "learning_rate": 2.877671033969193e-05, "loss": 0.4092, "step": 2440 }, { "epoch": 1.96, "grad_norm": 4.21875, "learning_rate": 2.8386451677642878e-05, "loss": 0.3866, "step": 2450 }, { "epoch": 1.968, "grad_norm": 3.078125, "learning_rate": 2.7997805652201714e-05, "loss": 0.3484, "step": 2460 }, { "epoch": 1.976, "grad_norm": 4.09375, "learning_rate": 2.7610801261143283e-05, "loss": 0.3496, "step": 2470 }, { "epoch": 1.984, "grad_norm": 4.6875, "learning_rate": 2.7225467379756314e-05, "loss": 0.3691, "step": 2480 }, { "epoch": 1.992, "grad_norm": 5.5, "learning_rate": 2.6841832758689002e-05, "loss": 0.3698, "step": 2490 }, { "epoch": 2.0, "grad_norm": 3.875, "learning_rate": 2.645992602180377e-05, "loss": 0.3577, "step": 2500 }, { "epoch": 2.008, "grad_norm": 4.40625, "learning_rate": 2.607977566404164e-05, "loss": 0.2871, "step": 2510 }, { "epoch": 2.016, "grad_norm": 4.0625, "learning_rate": 2.570141004929612e-05, "loss": 0.3426, "step": 2520 }, { "epoch": 2.024, "grad_norm": 3.5, "learning_rate": 2.5324857408296994e-05, "loss": 0.2656, "step": 2530 }, { "epoch": 2.032, "grad_norm": 3.578125, "learning_rate": 2.4950145836503836e-05, "loss": 0.3473, "step": 2540 }, { "epoch": 2.04, "grad_norm": 3.953125, "learning_rate": 2.4577303292009822e-05, "loss": 0.3588, "step": 2550 }, { "epoch": 2.048, "grad_norm": 4.75, "learning_rate": 2.4206357593455743e-05, "loss": 0.3953, "step": 2560 }, { "epoch": 2.056, "grad_norm": 3.96875, "learning_rate": 2.383733641795428e-05, "loss": 0.3209, "step": 2570 }, { "epoch": 2.064, "grad_norm": 2.515625, "learning_rate": 2.3470267299025068e-05, "loss": 0.3299, "step": 2580 }, { "epoch": 2.072, "grad_norm": 3.375, "learning_rate": 2.3105177624540252e-05, "loss": 0.2311, "step": 2590 }, { "epoch": 2.08, "grad_norm": 4.46875, "learning_rate": 2.274209463468117e-05, "loss": 0.3035, "step": 2600 }, { "epoch": 2.088, "grad_norm": 2.578125, "learning_rate": 2.2381045419905655e-05, "loss": 0.3344, "step": 2610 }, { "epoch": 2.096, "grad_norm": 5.28125, "learning_rate": 2.2022056918927037e-05, "loss": 0.2794, "step": 2620 }, { "epoch": 2.104, "grad_norm": 5.25, "learning_rate": 2.166515591670394e-05, "loss": 0.3416, "step": 2630 }, { "epoch": 2.112, "grad_norm": 3.96875, "learning_rate": 2.1310369042441985e-05, "loss": 0.3152, "step": 2640 }, { "epoch": 2.12, "grad_norm": 3.203125, "learning_rate": 2.0957722767606774e-05, "loss": 0.3015, "step": 2650 }, { "epoch": 2.128, "grad_norm": 3.78125, "learning_rate": 2.0607243403948863e-05, "loss": 0.3843, "step": 2660 }, { "epoch": 2.136, "grad_norm": 6.34375, "learning_rate": 2.0258957101540625e-05, "loss": 0.3299, "step": 2670 }, { "epoch": 2.144, "grad_norm": 4.1875, "learning_rate": 1.9912889846825038e-05, "loss": 0.3636, "step": 2680 }, { "epoch": 2.152, "grad_norm": 3.6875, "learning_rate": 1.956906746067683e-05, "loss": 0.3596, "step": 2690 }, { "epoch": 2.16, "grad_norm": 2.984375, "learning_rate": 1.922751559647591e-05, "loss": 0.3796, "step": 2700 }, { "epoch": 2.168, "grad_norm": 3.21875, "learning_rate": 1.888825973819336e-05, "loss": 0.3175, "step": 2710 }, { "epoch": 2.176, "grad_norm": 4.03125, "learning_rate": 1.8551325198489887e-05, "loss": 0.2928, "step": 2720 }, { "epoch": 2.184, "grad_norm": 4.1875, "learning_rate": 1.8216737116827378e-05, "loss": 0.2791, "step": 2730 }, { "epoch": 2.192, "grad_norm": 5.71875, "learning_rate": 1.7884520457592984e-05, "loss": 0.3925, "step": 2740 }, { "epoch": 2.2, "grad_norm": 3.28125, "learning_rate": 1.755470000823667e-05, "loss": 0.2967, "step": 2750 }, { "epoch": 2.208, "grad_norm": 2.203125, "learning_rate": 1.7227300377421574e-05, "loss": 0.2475, "step": 2760 }, { "epoch": 2.216, "grad_norm": 3.515625, "learning_rate": 1.6902345993188017e-05, "loss": 0.34, "step": 2770 }, { "epoch": 2.224, "grad_norm": 3.453125, "learning_rate": 1.6579861101130896e-05, "loss": 0.3418, "step": 2780 }, { "epoch": 2.232, "grad_norm": 6.5, "learning_rate": 1.6259869762590503e-05, "loss": 0.4639, "step": 2790 }, { "epoch": 2.24, "grad_norm": 5.0625, "learning_rate": 1.5942395852857466e-05, "loss": 0.4252, "step": 2800 }, { "epoch": 2.248, "grad_norm": 4.71875, "learning_rate": 1.5627463059391173e-05, "loss": 0.3562, "step": 2810 }, { "epoch": 2.2560000000000002, "grad_norm": 3.78125, "learning_rate": 1.531509488005257e-05, "loss": 0.2792, "step": 2820 }, { "epoch": 2.2640000000000002, "grad_norm": 5.25, "learning_rate": 1.5005314621350709e-05, "loss": 0.2659, "step": 2830 }, { "epoch": 2.2720000000000002, "grad_norm": 4.03125, "learning_rate": 1.4698145396704044e-05, "loss": 0.2647, "step": 2840 }, { "epoch": 2.2800000000000002, "grad_norm": 5.3125, "learning_rate": 1.4393610124715696e-05, "loss": 0.2826, "step": 2850 }, { "epoch": 2.288, "grad_norm": 4.59375, "learning_rate": 1.4091731527463526e-05, "loss": 0.2643, "step": 2860 }, { "epoch": 2.296, "grad_norm": 5.75, "learning_rate": 1.3792532128804803e-05, "loss": 0.3758, "step": 2870 }, { "epoch": 2.304, "grad_norm": 6.25, "learning_rate": 1.3496034252695599e-05, "loss": 0.3, "step": 2880 }, { "epoch": 2.312, "grad_norm": 5.1875, "learning_rate": 1.3202260021525158e-05, "loss": 0.3376, "step": 2890 }, { "epoch": 2.32, "grad_norm": 4.90625, "learning_rate": 1.2911231354465303e-05, "loss": 0.3686, "step": 2900 }, { "epoch": 2.328, "grad_norm": 3.859375, "learning_rate": 1.262296996583504e-05, "loss": 0.3372, "step": 2910 }, { "epoch": 2.336, "grad_norm": 3.421875, "learning_rate": 1.2337497363480317e-05, "loss": 0.3071, "step": 2920 }, { "epoch": 2.344, "grad_norm": 4.15625, "learning_rate": 1.2054834847169316e-05, "loss": 0.3724, "step": 2930 }, { "epoch": 2.352, "grad_norm": 4.21875, "learning_rate": 1.1775003507003236e-05, "loss": 0.2919, "step": 2940 }, { "epoch": 2.36, "grad_norm": 4.0625, "learning_rate": 1.1498024221842735e-05, "loss": 0.2496, "step": 2950 }, { "epoch": 2.368, "grad_norm": 5.28125, "learning_rate": 1.1223917657750033e-05, "loss": 0.265, "step": 2960 }, { "epoch": 2.376, "grad_norm": 6.03125, "learning_rate": 1.095270426644705e-05, "loss": 0.3083, "step": 2970 }, { "epoch": 2.384, "grad_norm": 4.25, "learning_rate": 1.0684404283789385e-05, "loss": 0.3392, "step": 2980 }, { "epoch": 2.392, "grad_norm": 3.765625, "learning_rate": 1.0419037728256564e-05, "loss": 0.3743, "step": 2990 }, { "epoch": 2.4, "grad_norm": 4.5625, "learning_rate": 1.015662439945832e-05, "loss": 0.3846, "step": 3000 }, { "epoch": 2.408, "grad_norm": 4.28125, "learning_rate": 9.89718387665734e-06, "loss": 0.2818, "step": 3010 }, { "epoch": 2.416, "grad_norm": 4.1875, "learning_rate": 9.640735517308435e-06, "loss": 0.3442, "step": 3020 }, { "epoch": 2.424, "grad_norm": 5.28125, "learning_rate": 9.387298455614191e-06, "loss": 0.2982, "step": 3030 }, { "epoch": 2.432, "grad_norm": 7.5, "learning_rate": 9.136891601097347e-06, "loss": 0.3924, "step": 3040 }, { "epoch": 2.44, "grad_norm": 3.15625, "learning_rate": 8.889533637189895e-06, "loss": 0.2838, "step": 3050 }, { "epoch": 2.448, "grad_norm": 6.125, "learning_rate": 8.645243019839112e-06, "loss": 0.3035, "step": 3060 }, { "epoch": 2.456, "grad_norm": 3.609375, "learning_rate": 8.404037976130458e-06, "loss": 0.3713, "step": 3070 }, { "epoch": 2.464, "grad_norm": 4.84375, "learning_rate": 8.16593650292764e-06, "loss": 0.3242, "step": 3080 }, { "epoch": 2.472, "grad_norm": 3.21875, "learning_rate": 7.930956365529818e-06, "loss": 0.3214, "step": 3090 }, { "epoch": 2.48, "grad_norm": 2.953125, "learning_rate": 7.699115096346139e-06, "loss": 0.3072, "step": 3100 }, { "epoch": 2.488, "grad_norm": 4.5, "learning_rate": 7.4704299935875185e-06, "loss": 0.2528, "step": 3110 }, { "epoch": 2.496, "grad_norm": 4.28125, "learning_rate": 7.244918119976035e-06, "loss": 0.3366, "step": 3120 }, { "epoch": 2.504, "grad_norm": 5.53125, "learning_rate": 7.022596301471868e-06, "loss": 0.3603, "step": 3130 }, { "epoch": 2.512, "grad_norm": 4.28125, "learning_rate": 6.803481126017808e-06, "loss": 0.2996, "step": 3140 }, { "epoch": 2.52, "grad_norm": 3.90625, "learning_rate": 6.587588942301626e-06, "loss": 0.3519, "step": 3150 }, { "epoch": 2.528, "grad_norm": 5.3125, "learning_rate": 6.374935858536257e-06, "loss": 0.2668, "step": 3160 }, { "epoch": 2.536, "grad_norm": 5.25, "learning_rate": 6.165537741257971e-06, "loss": 0.3093, "step": 3170 }, { "epoch": 2.544, "grad_norm": 5.0, "learning_rate": 5.959410214142419e-06, "loss": 0.3223, "step": 3180 }, { "epoch": 2.552, "grad_norm": 5.15625, "learning_rate": 5.756568656839056e-06, "loss": 0.4137, "step": 3190 }, { "epoch": 2.56, "grad_norm": 5.78125, "learning_rate": 5.557028203823522e-06, "loss": 0.3785, "step": 3200 }, { "epoch": 2.568, "grad_norm": 3.59375, "learning_rate": 5.360803743268494e-06, "loss": 0.3343, "step": 3210 }, { "epoch": 2.576, "grad_norm": 4.96875, "learning_rate": 5.167909915932801e-06, "loss": 0.3217, "step": 3220 }, { "epoch": 2.584, "grad_norm": 3.6875, "learning_rate": 4.9783611140690415e-06, "loss": 0.3157, "step": 3230 }, { "epoch": 2.592, "grad_norm": 3.84375, "learning_rate": 4.7921714803498165e-06, "loss": 0.2983, "step": 3240 }, { "epoch": 2.6, "grad_norm": 5.3125, "learning_rate": 4.609354906812374e-06, "loss": 0.3362, "step": 3250 }, { "epoch": 2.608, "grad_norm": 3.953125, "learning_rate": 4.429925033822252e-06, "loss": 0.3844, "step": 3260 }, { "epoch": 2.616, "grad_norm": 4.28125, "learning_rate": 4.253895249055412e-06, "loss": 0.2974, "step": 3270 }, { "epoch": 2.624, "grad_norm": 3.734375, "learning_rate": 4.0812786864994566e-06, "loss": 0.3442, "step": 3280 }, { "epoch": 2.632, "grad_norm": 3.171875, "learning_rate": 3.912088225473537e-06, "loss": 0.3572, "step": 3290 }, { "epoch": 2.64, "grad_norm": 4.59375, "learning_rate": 3.7463364896675735e-06, "loss": 0.3092, "step": 3300 }, { "epoch": 2.648, "grad_norm": 4.03125, "learning_rate": 3.584035846200201e-06, "loss": 0.3093, "step": 3310 }, { "epoch": 2.656, "grad_norm": 3.8125, "learning_rate": 3.425198404696178e-06, "loss": 0.3035, "step": 3320 }, { "epoch": 2.664, "grad_norm": 4.90625, "learning_rate": 3.2698360163827325e-06, "loss": 0.3166, "step": 3330 }, { "epoch": 2.672, "grad_norm": 4.03125, "learning_rate": 3.1179602732053947e-06, "loss": 0.2739, "step": 3340 }, { "epoch": 2.68, "grad_norm": 6.4375, "learning_rate": 2.969582506963098e-06, "loss": 0.3551, "step": 3350 }, { "epoch": 2.6879999999999997, "grad_norm": 4.625, "learning_rate": 2.824713788462602e-06, "loss": 0.3293, "step": 3360 }, { "epoch": 2.6959999999999997, "grad_norm": 3.71875, "learning_rate": 2.6833649266925943e-06, "loss": 0.3278, "step": 3370 }, { "epoch": 2.7039999999999997, "grad_norm": 3.984375, "learning_rate": 2.5455464680171126e-06, "loss": 0.2763, "step": 3380 }, { "epoch": 2.7119999999999997, "grad_norm": 6.125, "learning_rate": 2.411268695388719e-06, "loss": 0.3378, "step": 3390 }, { "epoch": 2.7199999999999998, "grad_norm": 5.09375, "learning_rate": 2.28054162758119e-06, "loss": 0.2644, "step": 3400 }, { "epoch": 2.7279999999999998, "grad_norm": 4.09375, "learning_rate": 2.1533750184420832e-06, "loss": 0.3154, "step": 3410 }, { "epoch": 2.7359999999999998, "grad_norm": 5.28125, "learning_rate": 2.0297783561649244e-06, "loss": 0.2217, "step": 3420 }, { "epoch": 2.7439999999999998, "grad_norm": 5.59375, "learning_rate": 1.9097608625812726e-06, "loss": 0.3446, "step": 3430 }, { "epoch": 2.752, "grad_norm": 5.25, "learning_rate": 1.7933314924726886e-06, "loss": 0.387, "step": 3440 }, { "epoch": 2.76, "grad_norm": 5.1875, "learning_rate": 1.6804989329025521e-06, "loss": 0.3531, "step": 3450 }, { "epoch": 2.768, "grad_norm": 5.9375, "learning_rate": 1.5712716025679587e-06, "loss": 0.2906, "step": 3460 }, { "epoch": 2.776, "grad_norm": 5.28125, "learning_rate": 1.4656576511715204e-06, "loss": 0.2759, "step": 3470 }, { "epoch": 2.784, "grad_norm": 4.0625, "learning_rate": 1.3636649588133432e-06, "loss": 0.3646, "step": 3480 }, { "epoch": 2.792, "grad_norm": 4.4375, "learning_rate": 1.265301135403052e-06, "loss": 0.3467, "step": 3490 }, { "epoch": 2.8, "grad_norm": 2.71875, "learning_rate": 1.1705735200920053e-06, "loss": 0.2817, "step": 3500 }, { "epoch": 2.808, "grad_norm": 5.4375, "learning_rate": 1.0794891807256956e-06, "loss": 0.3304, "step": 3510 }, { "epoch": 2.816, "grad_norm": 3.53125, "learning_rate": 9.920549133164314e-07, "loss": 0.3544, "step": 3520 }, { "epoch": 2.824, "grad_norm": 6.21875, "learning_rate": 9.08277241536215e-07, "loss": 0.3082, "step": 3530 }, { "epoch": 2.832, "grad_norm": 3.984375, "learning_rate": 8.281624162300494e-07, "loss": 0.2201, "step": 3540 }, { "epoch": 2.84, "grad_norm": 6.9375, "learning_rate": 7.517164149495326e-07, "loss": 0.2885, "step": 3550 }, { "epoch": 2.848, "grad_norm": 5.625, "learning_rate": 6.789449415068316e-07, "loss": 0.2716, "step": 3560 }, { "epoch": 2.856, "grad_norm": 5.46875, "learning_rate": 6.098534255491561e-07, "loss": 0.2723, "step": 3570 }, { "epoch": 2.864, "grad_norm": 2.765625, "learning_rate": 5.44447022153588e-07, "loss": 0.3245, "step": 3580 }, { "epoch": 2.872, "grad_norm": 5.34375, "learning_rate": 4.827306114425056e-07, "loss": 0.2905, "step": 3590 }, { "epoch": 2.88, "grad_norm": 5.59375, "learning_rate": 4.2470879821941423e-07, "loss": 0.3986, "step": 3600 }, { "epoch": 2.888, "grad_norm": 3.203125, "learning_rate": 3.703859116254038e-07, "loss": 0.3328, "step": 3610 }, { "epoch": 2.896, "grad_norm": 5.875, "learning_rate": 3.197660048161133e-07, "loss": 0.2893, "step": 3620 }, { "epoch": 2.904, "grad_norm": 4.875, "learning_rate": 2.728528546593667e-07, "loss": 0.3573, "step": 3630 }, { "epoch": 2.912, "grad_norm": 3.921875, "learning_rate": 2.2964996145330986e-07, "loss": 0.2721, "step": 3640 }, { "epoch": 2.92, "grad_norm": 3.296875, "learning_rate": 1.9016054866528576e-07, "loss": 0.2943, "step": 3650 }, { "epoch": 2.928, "grad_norm": 3.84375, "learning_rate": 1.5438756269130495e-07, "loss": 0.3179, "step": 3660 }, { "epoch": 2.936, "grad_norm": 3.453125, "learning_rate": 1.223336726362323e-07, "loss": 0.3203, "step": 3670 }, { "epoch": 2.944, "grad_norm": 3.34375, "learning_rate": 9.400127011461312e-08, "loss": 0.3184, "step": 3680 }, { "epoch": 2.952, "grad_norm": 4.625, "learning_rate": 6.939246907222696e-08, "loss": 0.3581, "step": 3690 }, { "epoch": 2.96, "grad_norm": 4.5, "learning_rate": 4.850910562839151e-08, "loss": 0.3222, "step": 3700 }, { "epoch": 2.968, "grad_norm": 6.59375, "learning_rate": 3.135273793893889e-08, "loss": 0.2907, "step": 3710 }, { "epoch": 2.976, "grad_norm": 3.5625, "learning_rate": 1.7924646079964248e-08, "loss": 0.3589, "step": 3720 }, { "epoch": 2.984, "grad_norm": 4.625, "learning_rate": 8.225831952324292e-09, "loss": 0.3172, "step": 3730 }, { "epoch": 2.992, "grad_norm": 5.75, "learning_rate": 2.257019206874933e-09, "loss": 0.29, "step": 3740 }, { "epoch": 3.0, "grad_norm": 3.109375, "learning_rate": 1.8653190470008242e-11, "loss": 0.2897, "step": 3750 }, { "epoch": 3.0, "step": 3750, "total_flos": 6.386198055566157e+17, "train_loss": 0.40305233942667645, "train_runtime": 8900.4049, "train_samples_per_second": 6.741, "train_steps_per_second": 0.421 } ], "logging_steps": 10, "max_steps": 3750, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 6.386198055566157e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }