{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 3.0,
  "eval_steps": 500,
  "global_step": 3750,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.008,
      "grad_norm": 4.96875,
      "learning_rate": 7.964601769911505e-06,
      "loss": 1.4112,
      "step": 10
    },
    {
      "epoch": 0.016,
      "grad_norm": 5.3125,
      "learning_rate": 1.6814159292035402e-05,
      "loss": 1.4365,
      "step": 20
    },
    {
      "epoch": 0.024,
      "grad_norm": 2.265625,
      "learning_rate": 2.5663716814159294e-05,
      "loss": 1.2118,
      "step": 30
    },
    {
      "epoch": 0.032,
      "grad_norm": 2.78125,
      "learning_rate": 3.451327433628319e-05,
      "loss": 1.1222,
      "step": 40
    },
    {
      "epoch": 0.04,
      "grad_norm": 1.953125,
      "learning_rate": 4.3362831858407084e-05,
      "loss": 1.0566,
      "step": 50
    },
    {
      "epoch": 0.048,
      "grad_norm": 2.75,
      "learning_rate": 5.221238938053098e-05,
      "loss": 1.0392,
      "step": 60
    },
    {
      "epoch": 0.056,
      "grad_norm": 2.578125,
      "learning_rate": 6.106194690265487e-05,
      "loss": 1.1753,
      "step": 70
    },
    {
      "epoch": 0.064,
      "grad_norm": 1.421875,
      "learning_rate": 6.991150442477876e-05,
      "loss": 1.0422,
      "step": 80
    },
    {
      "epoch": 0.072,
      "grad_norm": 3.765625,
      "learning_rate": 7.876106194690266e-05,
      "loss": 1.026,
      "step": 90
    },
    {
      "epoch": 0.08,
      "grad_norm": 2.609375,
      "learning_rate": 8.761061946902655e-05,
      "loss": 1.0633,
      "step": 100
    },
    {
      "epoch": 0.088,
      "grad_norm": 3.171875,
      "learning_rate": 9.646017699115044e-05,
      "loss": 1.1138,
      "step": 110
    },
    {
      "epoch": 0.096,
      "grad_norm": 3.609375,
      "learning_rate": 9.999932848660433e-05,
      "loss": 1.1079,
      "step": 120
    },
    {
      "epoch": 0.104,
      "grad_norm": 2.765625,
      "learning_rate": 9.99952248589506e-05,
      "loss": 1.0538,
      "step": 130
    },
    {
      "epoch": 0.112,
      "grad_norm": 3.5625,
      "learning_rate": 9.998739097245067e-05,
      "loss": 0.9637,
      "step": 140
    },
    {
      "epoch": 0.12,
      "grad_norm": 5.6875,
      "learning_rate": 9.997582741160886e-05,
      "loss": 0.8554,
      "step": 150
    },
    {
      "epoch": 0.128,
      "grad_norm": 7.40625,
      "learning_rate": 9.99605350392091e-05,
      "loss": 0.6704,
      "step": 160
    },
    {
      "epoch": 0.136,
      "grad_norm": 7.03125,
      "learning_rate": 9.994151499625049e-05,
      "loss": 0.8075,
      "step": 170
    },
    {
      "epoch": 0.144,
      "grad_norm": 3.828125,
      "learning_rate": 9.991876870186222e-05,
      "loss": 0.7331,
      "step": 180
    },
    {
      "epoch": 0.152,
      "grad_norm": 3.890625,
      "learning_rate": 9.98922978531977e-05,
      "loss": 0.7264,
      "step": 190
    },
    {
      "epoch": 0.16,
      "grad_norm": 4.15625,
      "learning_rate": 9.986210442530788e-05,
      "loss": 0.5792,
      "step": 200
    },
    {
      "epoch": 0.168,
      "grad_norm": 5.53125,
      "learning_rate": 9.982819067099396e-05,
      "loss": 0.6228,
      "step": 210
    },
    {
      "epoch": 0.176,
      "grad_norm": 5.875,
      "learning_rate": 9.979055912063925e-05,
      "loss": 0.7417,
      "step": 220
    },
    {
      "epoch": 0.184,
      "grad_norm": 3.71875,
      "learning_rate": 9.974921258202036e-05,
      "loss": 0.472,
      "step": 230
    },
    {
      "epoch": 0.192,
      "grad_norm": 4.15625,
      "learning_rate": 9.970415414009773e-05,
      "loss": 0.6284,
      "step": 240
    },
    {
      "epoch": 0.2,
      "grad_norm": 5.1875,
      "learning_rate": 9.965538715678548e-05,
      "loss": 0.5349,
      "step": 250
    },
    {
      "epoch": 0.208,
      "grad_norm": 4.59375,
      "learning_rate": 9.960291527070051e-05,
      "loss": 0.5165,
      "step": 260
    },
    {
      "epoch": 0.216,
      "grad_norm": 4.0625,
      "learning_rate": 9.954674239689109e-05,
      "loss": 0.5656,
      "step": 270
    },
    {
      "epoch": 0.224,
      "grad_norm": 3.875,
      "learning_rate": 9.948687272654464e-05,
      "loss": 0.6713,
      "step": 280
    },
    {
      "epoch": 0.232,
      "grad_norm": 5.375,
      "learning_rate": 9.942331072667517e-05,
      "loss": 0.4347,
      "step": 290
    },
    {
      "epoch": 0.24,
      "grad_norm": 5.875,
      "learning_rate": 9.935606113978981e-05,
      "loss": 0.4404,
      "step": 300
    },
    {
      "epoch": 0.248,
      "grad_norm": 4.28125,
      "learning_rate": 9.92851289835351e-05,
      "loss": 0.4865,
      "step": 310
    },
    {
      "epoch": 0.256,
      "grad_norm": 6.46875,
      "learning_rate": 9.921051955032253e-05,
      "loss": 0.5393,
      "step": 320
    },
    {
      "epoch": 0.264,
      "grad_norm": 9.0625,
      "learning_rate": 9.913223840693375e-05,
      "loss": 0.5358,
      "step": 330
    },
    {
      "epoch": 0.272,
      "grad_norm": 5.0625,
      "learning_rate": 9.905029139410508e-05,
      "loss": 0.5756,
      "step": 340
    },
    {
      "epoch": 0.28,
      "grad_norm": 10.625,
      "learning_rate": 9.896468462609186e-05,
      "loss": 0.4554,
      "step": 350
    },
    {
      "epoch": 0.288,
      "grad_norm": 3.6875,
      "learning_rate": 9.887542449021214e-05,
      "loss": 0.3889,
      "step": 360
    },
    {
      "epoch": 0.296,
      "grad_norm": 6.25,
      "learning_rate": 9.878251764637023e-05,
      "loss": 0.4468,
      "step": 370
    },
    {
      "epoch": 0.304,
      "grad_norm": 5.5,
      "learning_rate": 9.868597102655968e-05,
      "loss": 0.4612,
      "step": 380
    },
    {
      "epoch": 0.312,
      "grad_norm": 5.4375,
      "learning_rate": 9.858579183434605e-05,
      "loss": 0.6069,
      "step": 390
    },
    {
      "epoch": 0.32,
      "grad_norm": 5.84375,
      "learning_rate": 9.848198754432959e-05,
      "loss": 0.4464,
      "step": 400
    },
    {
      "epoch": 0.328,
      "grad_norm": 5.125,
      "learning_rate": 9.837456590158738e-05,
      "loss": 0.5598,
      "step": 410
    },
    {
      "epoch": 0.336,
      "grad_norm": 3.671875,
      "learning_rate": 9.826353492109555e-05,
      "loss": 0.4534,
      "step": 420
    },
    {
      "epoch": 0.344,
      "grad_norm": 7.34375,
      "learning_rate": 9.814890288713121e-05,
      "loss": 0.5155,
      "step": 430
    },
    {
      "epoch": 0.352,
      "grad_norm": 6.875,
      "learning_rate": 9.803067835265436e-05,
      "loss": 0.484,
      "step": 440
    },
    {
      "epoch": 0.36,
      "grad_norm": 3.875,
      "learning_rate": 9.790887013866973e-05,
      "loss": 0.4077,
      "step": 450
    },
    {
      "epoch": 0.368,
      "grad_norm": 5.4375,
      "learning_rate": 9.778348733356868e-05,
      "loss": 0.3779,
      "step": 460
    },
    {
      "epoch": 0.376,
      "grad_norm": 5.34375,
      "learning_rate": 9.765453929245096e-05,
      "loss": 0.521,
      "step": 470
    },
    {
      "epoch": 0.384,
      "grad_norm": 4.8125,
      "learning_rate": 9.752203563642688e-05,
      "loss": 0.4114,
      "step": 480
    },
    {
      "epoch": 0.392,
      "grad_norm": 8.25,
      "learning_rate": 9.73859862518993e-05,
      "loss": 0.4361,
      "step": 490
    },
    {
      "epoch": 0.4,
      "grad_norm": 7.40625,
      "learning_rate": 9.724640128982605e-05,
      "loss": 0.518,
      "step": 500
    },
    {
      "epoch": 0.408,
      "grad_norm": 5.5625,
      "learning_rate": 9.710329116496259e-05,
      "loss": 0.4413,
      "step": 510
    },
    {
      "epoch": 0.416,
      "grad_norm": 5.46875,
      "learning_rate": 9.695666655508483e-05,
      "loss": 0.4276,
      "step": 520
    },
    {
      "epoch": 0.424,
      "grad_norm": 5.375,
      "learning_rate": 9.680653840019259e-05,
      "loss": 0.4476,
      "step": 530
    },
    {
      "epoch": 0.432,
      "grad_norm": 5.875,
      "learning_rate": 9.665291790169311e-05,
      "loss": 0.3562,
      "step": 540
    },
    {
      "epoch": 0.44,
      "grad_norm": 6.4375,
      "learning_rate": 9.649581652156559e-05,
      "loss": 0.4511,
      "step": 550
    },
    {
      "epoch": 0.448,
      "grad_norm": 5.1875,
      "learning_rate": 9.633524598150568e-05,
      "loss": 0.3985,
      "step": 560
    },
    {
      "epoch": 0.456,
      "grad_norm": 4.1875,
      "learning_rate": 9.617121826205116e-05,
      "loss": 0.5117,
      "step": 570
    },
    {
      "epoch": 0.464,
      "grad_norm": 5.5,
      "learning_rate": 9.600374560168783e-05,
      "loss": 0.4569,
      "step": 580
    },
    {
      "epoch": 0.472,
      "grad_norm": 4.5,
      "learning_rate": 9.583284049593652e-05,
      "loss": 0.449,
      "step": 590
    },
    {
      "epoch": 0.48,
      "grad_norm": 4.25,
      "learning_rate": 9.56585156964207e-05,
      "loss": 0.4769,
      "step": 600
    },
    {
      "epoch": 0.488,
      "grad_norm": 5.9375,
      "learning_rate": 9.548078420991506e-05,
      "loss": 0.5081,
      "step": 610
    },
    {
      "epoch": 0.496,
      "grad_norm": 7.4375,
      "learning_rate": 9.529965929737506e-05,
      "loss": 0.4803,
      "step": 620
    },
    {
      "epoch": 0.504,
      "grad_norm": 4.59375,
      "learning_rate": 9.511515447294748e-05,
      "loss": 0.5015,
      "step": 630
    },
    {
      "epoch": 0.512,
      "grad_norm": 6.71875,
      "learning_rate": 9.49272835029621e-05,
      "loss": 0.5174,
      "step": 640
    },
    {
      "epoch": 0.52,
      "grad_norm": 4.875,
      "learning_rate": 9.47360604049046e-05,
      "loss": 0.4957,
      "step": 650
    },
    {
      "epoch": 0.528,
      "grad_norm": 4.71875,
      "learning_rate": 9.454149944637064e-05,
      "loss": 0.379,
      "step": 660
    },
    {
      "epoch": 0.536,
      "grad_norm": 4.96875,
      "learning_rate": 9.434361514400132e-05,
      "loss": 0.4857,
      "step": 670
    },
    {
      "epoch": 0.544,
      "grad_norm": 5.71875,
      "learning_rate": 9.414242226240012e-05,
      "loss": 0.4595,
      "step": 680
    },
    {
      "epoch": 0.552,
      "grad_norm": 6.8125,
      "learning_rate": 9.393793581303116e-05,
      "loss": 0.4157,
      "step": 690
    },
    {
      "epoch": 0.56,
      "grad_norm": 5.03125,
      "learning_rate": 9.37301710530993e-05,
      "loss": 0.4432,
      "step": 700
    },
    {
      "epoch": 0.568,
      "grad_norm": 6.75,
      "learning_rate": 9.351914348441169e-05,
      "loss": 0.4695,
      "step": 710
    },
    {
      "epoch": 0.576,
      "grad_norm": 2.890625,
      "learning_rate": 9.330486885222114e-05,
      "loss": 0.3493,
      "step": 720
    },
    {
      "epoch": 0.584,
      "grad_norm": 3.4375,
      "learning_rate": 9.308736314405134e-05,
      "loss": 0.4304,
      "step": 730
    },
    {
      "epoch": 0.592,
      "grad_norm": 3.1875,
      "learning_rate": 9.286664258850402e-05,
      "loss": 0.5057,
      "step": 740
    },
    {
      "epoch": 0.6,
      "grad_norm": 4.375,
      "learning_rate": 9.264272365404805e-05,
      "loss": 0.4159,
      "step": 750
    },
    {
      "epoch": 0.608,
      "grad_norm": 4.3125,
      "learning_rate": 9.241562304779072e-05,
      "loss": 0.3647,
      "step": 760
    },
    {
      "epoch": 0.616,
      "grad_norm": 10.3125,
      "learning_rate": 9.21853577142312e-05,
      "loss": 0.4851,
      "step": 770
    },
    {
      "epoch": 0.624,
      "grad_norm": 6.3125,
      "learning_rate": 9.195194483399625e-05,
      "loss": 0.5071,
      "step": 780
    },
    {
      "epoch": 0.632,
      "grad_norm": 4.34375,
      "learning_rate": 9.17154018225583e-05,
      "loss": 0.3939,
      "step": 790
    },
    {
      "epoch": 0.64,
      "grad_norm": 4.03125,
      "learning_rate": 9.147574632893611e-05,
      "loss": 0.3762,
      "step": 800
    },
    {
      "epoch": 0.648,
      "grad_norm": 3.1875,
      "learning_rate": 9.12329962343779e-05,
      "loss": 0.4427,
      "step": 810
    },
    {
      "epoch": 0.656,
      "grad_norm": 3.890625,
      "learning_rate": 9.098716965102716e-05,
      "loss": 0.357,
      "step": 820
    },
    {
      "epoch": 0.664,
      "grad_norm": 8.25,
      "learning_rate": 9.073828492057133e-05,
      "loss": 0.4071,
      "step": 830
    },
    {
      "epoch": 0.672,
      "grad_norm": 5.78125,
      "learning_rate": 9.048636061287325e-05,
      "loss": 0.4037,
      "step": 840
    },
    {
      "epoch": 0.68,
      "grad_norm": 3.84375,
      "learning_rate": 9.023141552458559e-05,
      "loss": 0.3884,
      "step": 850
    },
    {
      "epoch": 0.688,
      "grad_norm": 3.453125,
      "learning_rate": 8.997346867774839e-05,
      "loss": 0.3641,
      "step": 860
    },
    {
      "epoch": 0.696,
      "grad_norm": 7.875,
      "learning_rate": 8.97125393183699e-05,
      "loss": 0.5387,
      "step": 870
    },
    {
      "epoch": 0.704,
      "grad_norm": 4.6875,
      "learning_rate": 8.94486469149904e-05,
      "loss": 0.4085,
      "step": 880
    },
    {
      "epoch": 0.712,
      "grad_norm": 6.03125,
      "learning_rate": 8.918181115722976e-05,
      "loss": 0.4055,
      "step": 890
    },
    {
      "epoch": 0.72,
      "grad_norm": 7.375,
      "learning_rate": 8.891205195431831e-05,
      "loss": 0.42,
      "step": 900
    },
    {
      "epoch": 0.728,
      "grad_norm": 3.21875,
      "learning_rate": 8.863938943361128e-05,
      "loss": 0.3372,
      "step": 910
    },
    {
      "epoch": 0.736,
      "grad_norm": 3.65625,
      "learning_rate": 8.836384393908721e-05,
      "loss": 0.4544,
      "step": 920
    },
    {
      "epoch": 0.744,
      "grad_norm": 3.703125,
      "learning_rate": 8.808543602982993e-05,
      "loss": 0.4979,
      "step": 930
    },
    {
      "epoch": 0.752,
      "grad_norm": 2.90625,
      "learning_rate": 8.780418647849458e-05,
      "loss": 0.3366,
      "step": 940
    },
    {
      "epoch": 0.76,
      "grad_norm": 4.53125,
      "learning_rate": 8.752011626975781e-05,
      "loss": 0.3778,
      "step": 950
    },
    {
      "epoch": 0.768,
      "grad_norm": 3.4375,
      "learning_rate": 8.723324659875201e-05,
      "loss": 0.4498,
      "step": 960
    },
    {
      "epoch": 0.776,
      "grad_norm": 4.90625,
      "learning_rate": 8.694359886948384e-05,
      "loss": 0.4232,
      "step": 970
    },
    {
      "epoch": 0.784,
      "grad_norm": 4.15625,
      "learning_rate": 8.665119469323737e-05,
      "loss": 0.3602,
      "step": 980
    },
    {
      "epoch": 0.792,
      "grad_norm": 5.375,
      "learning_rate": 8.635605588696148e-05,
      "loss": 0.4095,
      "step": 990
    },
    {
      "epoch": 0.8,
      "grad_norm": 3.5625,
      "learning_rate": 8.605820447164206e-05,
      "loss": 0.405,
      "step": 1000
    },
    {
      "epoch": 0.808,
      "grad_norm": 4.40625,
      "learning_rate": 8.575766267065905e-05,
      "loss": 0.3137,
      "step": 1010
    },
    {
      "epoch": 0.816,
      "grad_norm": 5.65625,
      "learning_rate": 8.54544529081283e-05,
      "loss": 0.3701,
      "step": 1020
    },
    {
      "epoch": 0.824,
      "grad_norm": 5.0,
      "learning_rate": 8.514859780722833e-05,
      "loss": 0.3759,
      "step": 1030
    },
    {
      "epoch": 0.832,
      "grad_norm": 4.9375,
      "learning_rate": 8.484012018851246e-05,
      "loss": 0.3032,
      "step": 1040
    },
    {
      "epoch": 0.84,
      "grad_norm": 4.375,
      "learning_rate": 8.452904306820618e-05,
      "loss": 0.4171,
      "step": 1050
    },
    {
      "epoch": 0.848,
      "grad_norm": 4.53125,
      "learning_rate": 8.421538965648966e-05,
      "loss": 0.374,
      "step": 1060
    },
    {
      "epoch": 0.856,
      "grad_norm": 4.3125,
      "learning_rate": 8.389918335576623e-05,
      "loss": 0.3358,
      "step": 1070
    },
    {
      "epoch": 0.864,
      "grad_norm": 4.21875,
      "learning_rate": 8.358044775891605e-05,
      "loss": 0.3586,
      "step": 1080
    },
    {
      "epoch": 0.872,
      "grad_norm": 4.15625,
      "learning_rate": 8.325920664753595e-05,
      "loss": 0.4036,
      "step": 1090
    },
    {
      "epoch": 0.88,
      "grad_norm": 4.625,
      "learning_rate": 8.293548399016491e-05,
      "loss": 0.3673,
      "step": 1100
    },
    {
      "epoch": 0.888,
      "grad_norm": 2.625,
      "learning_rate": 8.260930394049583e-05,
      "loss": 0.3388,
      "step": 1110
    },
    {
      "epoch": 0.896,
      "grad_norm": 4.78125,
      "learning_rate": 8.228069083557328e-05,
      "loss": 0.427,
      "step": 1120
    },
    {
      "epoch": 0.904,
      "grad_norm": 4.21875,
      "learning_rate": 8.194966919397767e-05,
      "loss": 0.3926,
      "step": 1130
    },
    {
      "epoch": 0.912,
      "grad_norm": 5.84375,
      "learning_rate": 8.161626371399591e-05,
      "loss": 0.3654,
      "step": 1140
    },
    {
      "epoch": 0.92,
      "grad_norm": 3.71875,
      "learning_rate": 8.128049927177854e-05,
      "loss": 0.4047,
      "step": 1150
    },
    {
      "epoch": 0.928,
      "grad_norm": 5.0,
      "learning_rate": 8.094240091948375e-05,
      "loss": 0.4114,
      "step": 1160
    },
    {
      "epoch": 0.936,
      "grad_norm": 6.625,
      "learning_rate": 8.06019938834081e-05,
      "loss": 0.4485,
      "step": 1170
    },
    {
      "epoch": 0.944,
      "grad_norm": 4.4375,
      "learning_rate": 8.025930356210439e-05,
      "loss": 0.3833,
      "step": 1180
    },
    {
      "epoch": 0.952,
      "grad_norm": 3.171875,
      "learning_rate": 7.991435552448657e-05,
      "loss": 0.3742,
      "step": 1190
    },
    {
      "epoch": 0.96,
      "grad_norm": 4.65625,
      "learning_rate": 7.956717550792199e-05,
      "loss": 0.3284,
      "step": 1200
    },
    {
      "epoch": 0.968,
      "grad_norm": 4.4375,
      "learning_rate": 7.921778941631113e-05,
      "loss": 0.3862,
      "step": 1210
    },
    {
      "epoch": 0.976,
      "grad_norm": 8.9375,
      "learning_rate": 7.886622331815477e-05,
      "loss": 0.4144,
      "step": 1220
    },
    {
      "epoch": 0.984,
      "grad_norm": 3.875,
      "learning_rate": 7.851250344460902e-05,
      "loss": 0.3654,
      "step": 1230
    },
    {
      "epoch": 0.992,
      "grad_norm": 2.59375,
      "learning_rate": 7.815665618752812e-05,
      "loss": 0.3808,
      "step": 1240
    },
    {
      "epoch": 1.0,
      "grad_norm": 5.0625,
      "learning_rate": 7.77987080974953e-05,
      "loss": 0.3482,
      "step": 1250
    },
    {
      "epoch": 1.008,
      "grad_norm": 3.28125,
      "learning_rate": 7.743868588184176e-05,
      "loss": 0.3312,
      "step": 1260
    },
    {
      "epoch": 1.016,
      "grad_norm": 4.375,
      "learning_rate": 7.707661640265401e-05,
      "loss": 0.37,
      "step": 1270
    },
    {
      "epoch": 1.024,
      "grad_norm": 4.40625,
      "learning_rate": 7.67125266747696e-05,
      "loss": 0.3253,
      "step": 1280
    },
    {
      "epoch": 1.032,
      "grad_norm": 4.0625,
      "learning_rate": 7.634644386376149e-05,
      "loss": 0.4361,
      "step": 1290
    },
    {
      "epoch": 1.04,
      "grad_norm": 3.953125,
      "learning_rate": 7.597839528391114e-05,
      "loss": 0.3981,
      "step": 1300
    },
    {
      "epoch": 1.048,
      "grad_norm": 7.6875,
      "learning_rate": 7.560840839617056e-05,
      "loss": 0.3634,
      "step": 1310
    },
    {
      "epoch": 1.056,
      "grad_norm": 4.03125,
      "learning_rate": 7.523651080611341e-05,
      "loss": 0.3653,
      "step": 1320
    },
    {
      "epoch": 1.064,
      "grad_norm": 3.09375,
      "learning_rate": 7.48627302618752e-05,
      "loss": 0.3433,
      "step": 1330
    },
    {
      "epoch": 1.072,
      "grad_norm": 4.21875,
      "learning_rate": 7.448709465208299e-05,
      "loss": 0.3587,
      "step": 1340
    },
    {
      "epoch": 1.08,
      "grad_norm": 5.375,
      "learning_rate": 7.410963200377458e-05,
      "loss": 0.346,
      "step": 1350
    },
    {
      "epoch": 1.088,
      "grad_norm": 4.0625,
      "learning_rate": 7.373037048030731e-05,
      "loss": 0.4562,
      "step": 1360
    },
    {
      "epoch": 1.096,
      "grad_norm": 3.75,
      "learning_rate": 7.334933837925675e-05,
      "loss": 0.4333,
      "step": 1370
    },
    {
      "epoch": 1.104,
      "grad_norm": 3.46875,
      "learning_rate": 7.296656413030531e-05,
      "loss": 0.306,
      "step": 1380
    },
    {
      "epoch": 1.112,
      "grad_norm": 5.25,
      "learning_rate": 7.25820762931211e-05,
      "loss": 0.4095,
      "step": 1390
    },
    {
      "epoch": 1.12,
      "grad_norm": 3.578125,
      "learning_rate": 7.219590355522697e-05,
      "loss": 0.369,
      "step": 1400
    },
    {
      "epoch": 1.1280000000000001,
      "grad_norm": 4.96875,
      "learning_rate": 7.180807472986009e-05,
      "loss": 0.3763,
      "step": 1410
    },
    {
      "epoch": 1.1360000000000001,
      "grad_norm": 5.25,
      "learning_rate": 7.141861875382215e-05,
      "loss": 0.4269,
      "step": 1420
    },
    {
      "epoch": 1.144,
      "grad_norm": 3.53125,
      "learning_rate": 7.102756468532027e-05,
      "loss": 0.4017,
      "step": 1430
    },
    {
      "epoch": 1.152,
      "grad_norm": 4.6875,
      "learning_rate": 7.063494170179898e-05,
      "loss": 0.3601,
      "step": 1440
    },
    {
      "epoch": 1.16,
      "grad_norm": 3.8125,
      "learning_rate": 7.024077909776309e-05,
      "loss": 0.3678,
      "step": 1450
    },
    {
      "epoch": 1.168,
      "grad_norm": 5.25,
      "learning_rate": 6.984510628259212e-05,
      "loss": 0.3732,
      "step": 1460
    },
    {
      "epoch": 1.176,
      "grad_norm": 2.140625,
      "learning_rate": 6.94479527783459e-05,
      "loss": 0.3332,
      "step": 1470
    },
    {
      "epoch": 1.184,
      "grad_norm": 4.5,
      "learning_rate": 6.904934821756184e-05,
      "loss": 0.3887,
      "step": 1480
    },
    {
      "epoch": 1.192,
      "grad_norm": 2.296875,
      "learning_rate": 6.864932234104409e-05,
      "loss": 0.3196,
      "step": 1490
    },
    {
      "epoch": 1.2,
      "grad_norm": 4.15625,
      "learning_rate": 6.824790499564435e-05,
      "loss": 0.3256,
      "step": 1500
    },
    {
      "epoch": 1.208,
      "grad_norm": 3.6875,
      "learning_rate": 6.784512613203511e-05,
      "loss": 0.3074,
      "step": 1510
    },
    {
      "epoch": 1.216,
      "grad_norm": 3.0,
      "learning_rate": 6.744101580247481e-05,
      "loss": 0.35,
      "step": 1520
    },
    {
      "epoch": 1.224,
      "grad_norm": 3.828125,
      "learning_rate": 6.703560415856565e-05,
      "loss": 0.3731,
      "step": 1530
    },
    {
      "epoch": 1.232,
      "grad_norm": 5.625,
      "learning_rate": 6.662892144900388e-05,
      "loss": 0.3769,
      "step": 1540
    },
    {
      "epoch": 1.24,
      "grad_norm": 8.1875,
      "learning_rate": 6.62209980173229e-05,
      "loss": 0.4385,
      "step": 1550
    },
    {
      "epoch": 1.248,
      "grad_norm": 5.9375,
      "learning_rate": 6.581186429962922e-05,
      "loss": 0.3563,
      "step": 1560
    },
    {
      "epoch": 1.256,
      "grad_norm": 3.53125,
      "learning_rate": 6.54015508223316e-05,
      "loss": 0.3225,
      "step": 1570
    },
    {
      "epoch": 1.264,
      "grad_norm": 4.125,
      "learning_rate": 6.499008819986339e-05,
      "loss": 0.3246,
      "step": 1580
    },
    {
      "epoch": 1.272,
      "grad_norm": 4.34375,
      "learning_rate": 6.457750713239828e-05,
      "loss": 0.304,
      "step": 1590
    },
    {
      "epoch": 1.28,
      "grad_norm": 4.75,
      "learning_rate": 6.41638384035597e-05,
      "loss": 0.383,
      "step": 1600
    },
    {
      "epoch": 1.288,
      "grad_norm": 4.46875,
      "learning_rate": 6.374911287812406e-05,
      "loss": 0.331,
      "step": 1610
    },
    {
      "epoch": 1.296,
      "grad_norm": 3.984375,
      "learning_rate": 6.333336149971776e-05,
      "loss": 0.3022,
      "step": 1620
    },
    {
      "epoch": 1.304,
      "grad_norm": 5.53125,
      "learning_rate": 6.291661528850844e-05,
      "loss": 0.3257,
      "step": 1630
    },
    {
      "epoch": 1.312,
      "grad_norm": 4.03125,
      "learning_rate": 6.249890533889054e-05,
      "loss": 0.3071,
      "step": 1640
    },
    {
      "epoch": 1.32,
      "grad_norm": 4.25,
      "learning_rate": 6.208026281716521e-05,
      "loss": 0.3833,
      "step": 1650
    },
    {
      "epoch": 1.328,
      "grad_norm": 3.109375,
      "learning_rate": 6.166071895921496e-05,
      "loss": 0.3378,
      "step": 1660
    },
    {
      "epoch": 1.336,
      "grad_norm": 3.5,
      "learning_rate": 6.124030506817309e-05,
      "loss": 0.31,
      "step": 1670
    },
    {
      "epoch": 1.3439999999999999,
      "grad_norm": 2.65625,
      "learning_rate": 6.0819052512088057e-05,
      "loss": 0.3139,
      "step": 1680
    },
    {
      "epoch": 1.3519999999999999,
      "grad_norm": 4.90625,
      "learning_rate": 6.039699272158305e-05,
      "loss": 0.4388,
      "step": 1690
    },
    {
      "epoch": 1.3599999999999999,
      "grad_norm": 5.59375,
      "learning_rate": 5.997415718751086e-05,
      "loss": 0.3989,
      "step": 1700
    },
    {
      "epoch": 1.3679999999999999,
      "grad_norm": 5.5,
      "learning_rate": 5.955057745860435e-05,
      "loss": 0.3977,
      "step": 1710
    },
    {
      "epoch": 1.376,
      "grad_norm": 6.25,
      "learning_rate": 5.9126285139122406e-05,
      "loss": 0.3527,
      "step": 1720
    },
    {
      "epoch": 1.384,
      "grad_norm": 3.28125,
      "learning_rate": 5.8701311886491947e-05,
      "loss": 0.4044,
      "step": 1730
    },
    {
      "epoch": 1.392,
      "grad_norm": 5.5625,
      "learning_rate": 5.827568940894593e-05,
      "loss": 0.374,
      "step": 1740
    },
    {
      "epoch": 1.4,
      "grad_norm": 3.625,
      "learning_rate": 5.7849449463157435e-05,
      "loss": 0.3479,
      "step": 1750
    },
    {
      "epoch": 1.408,
      "grad_norm": 4.84375,
      "learning_rate": 5.742262385187028e-05,
      "loss": 0.3666,
      "step": 1760
    },
    {
      "epoch": 1.416,
      "grad_norm": 5.4375,
      "learning_rate": 5.699524442152613e-05,
      "loss": 0.3707,
      "step": 1770
    },
    {
      "epoch": 1.424,
      "grad_norm": 4.4375,
      "learning_rate": 5.656734305988839e-05,
      "loss": 0.3847,
      "step": 1780
    },
    {
      "epoch": 1.432,
      "grad_norm": 3.96875,
      "learning_rate": 5.613895169366292e-05,
      "loss": 0.3515,
      "step": 1790
    },
    {
      "epoch": 1.44,
      "grad_norm": 3.734375,
      "learning_rate": 5.571010228611597e-05,
      "loss": 0.3763,
      "step": 1800
    },
    {
      "epoch": 1.448,
      "grad_norm": 5.71875,
      "learning_rate": 5.528082683468934e-05,
      "loss": 0.3548,
      "step": 1810
    },
    {
      "epoch": 1.456,
      "grad_norm": 5.03125,
      "learning_rate": 5.485115736861288e-05,
      "loss": 0.3903,
      "step": 1820
    },
    {
      "epoch": 1.464,
      "grad_norm": 4.375,
      "learning_rate": 5.442112594651484e-05,
      "loss": 0.257,
      "step": 1830
    },
    {
      "epoch": 1.472,
      "grad_norm": 5.46875,
      "learning_rate": 5.399076465402979e-05,
      "loss": 0.3424,
      "step": 1840
    },
    {
      "epoch": 1.48,
      "grad_norm": 3.40625,
      "learning_rate": 5.356010560140475e-05,
      "loss": 0.3317,
      "step": 1850
    },
    {
      "epoch": 1.488,
      "grad_norm": 5.0,
      "learning_rate": 5.312918092110325e-05,
      "loss": 0.2753,
      "step": 1860
    },
    {
      "epoch": 1.496,
      "grad_norm": 3.84375,
      "learning_rate": 5.269802276540795e-05,
      "loss": 0.3318,
      "step": 1870
    },
    {
      "epoch": 1.504,
      "grad_norm": 5.71875,
      "learning_rate": 5.226666330402164e-05,
      "loss": 0.3836,
      "step": 1880
    },
    {
      "epoch": 1.512,
      "grad_norm": 3.671875,
      "learning_rate": 5.1835134721666956e-05,
      "loss": 0.3498,
      "step": 1890
    },
    {
      "epoch": 1.52,
      "grad_norm": 2.90625,
      "learning_rate": 5.1403469215685094e-05,
      "loss": 0.4228,
      "step": 1900
    },
    {
      "epoch": 1.528,
      "grad_norm": 4.4375,
      "learning_rate": 5.097169899363342e-05,
      "loss": 0.3703,
      "step": 1910
    },
    {
      "epoch": 1.536,
      "grad_norm": 3.90625,
      "learning_rate": 5.053985627088238e-05,
      "loss": 0.3816,
      "step": 1920
    },
    {
      "epoch": 1.544,
      "grad_norm": 2.71875,
      "learning_rate": 5.010797326821189e-05,
      "loss": 0.3842,
      "step": 1930
    },
    {
      "epoch": 1.552,
      "grad_norm": 3.875,
      "learning_rate": 4.9676082209407254e-05,
      "loss": 0.3848,
      "step": 1940
    },
    {
      "epoch": 1.56,
      "grad_norm": 3.484375,
      "learning_rate": 4.924421531885481e-05,
      "loss": 0.3416,
      "step": 1950
    },
    {
      "epoch": 1.568,
      "grad_norm": 4.1875,
      "learning_rate": 4.881240481913773e-05,
      "loss": 0.3407,
      "step": 1960
    },
    {
      "epoch": 1.576,
      "grad_norm": 3.921875,
      "learning_rate": 4.838068292863164e-05,
      "loss": 0.3319,
      "step": 1970
    },
    {
      "epoch": 1.584,
      "grad_norm": 4.4375,
      "learning_rate": 4.7949081859100896e-05,
      "loss": 0.3979,
      "step": 1980
    },
    {
      "epoch": 1.592,
      "grad_norm": 4.84375,
      "learning_rate": 4.7517633813295114e-05,
      "loss": 0.4184,
      "step": 1990
    },
    {
      "epoch": 1.6,
      "grad_norm": 3.890625,
      "learning_rate": 4.708637098254644e-05,
      "loss": 0.3959,
      "step": 2000
    },
    {
      "epoch": 1.608,
      "grad_norm": 4.8125,
      "learning_rate": 4.6655325544367715e-05,
      "loss": 0.313,
      "step": 2010
    },
    {
      "epoch": 1.616,
      "grad_norm": 4.21875,
      "learning_rate": 4.6224529660051593e-05,
      "loss": 0.3012,
      "step": 2020
    },
    {
      "epoch": 1.624,
      "grad_norm": 3.734375,
      "learning_rate": 4.579401547227096e-05,
      "loss": 0.2532,
      "step": 2030
    },
    {
      "epoch": 1.6320000000000001,
      "grad_norm": 2.953125,
      "learning_rate": 4.53638151026807e-05,
      "loss": 0.2714,
      "step": 2040
    },
    {
      "epoch": 1.6400000000000001,
      "grad_norm": 4.1875,
      "learning_rate": 4.493396064952093e-05,
      "loss": 0.3468,
      "step": 2050
    },
    {
      "epoch": 1.6480000000000001,
      "grad_norm": 5.125,
      "learning_rate": 4.450448418522221e-05,
      "loss": 0.4547,
      "step": 2060
    },
    {
      "epoch": 1.6560000000000001,
      "grad_norm": 3.5,
      "learning_rate": 4.4075417754012475e-05,
      "loss": 0.2839,
      "step": 2070
    },
    {
      "epoch": 1.6640000000000001,
      "grad_norm": 5.84375,
      "learning_rate": 4.364679336952609e-05,
      "loss": 0.3426,
      "step": 2080
    },
    {
      "epoch": 1.6720000000000002,
      "grad_norm": 4.84375,
      "learning_rate": 4.321864301241535e-05,
      "loss": 0.3325,
      "step": 2090
    },
    {
      "epoch": 1.6800000000000002,
      "grad_norm": 4.46875,
      "learning_rate": 4.279099862796427e-05,
      "loss": 0.314,
      "step": 2100
    },
    {
      "epoch": 1.688,
      "grad_norm": 2.8125,
      "learning_rate": 4.23638921237051e-05,
      "loss": 0.4189,
      "step": 2110
    },
    {
      "epoch": 1.696,
      "grad_norm": 3.59375,
      "learning_rate": 4.1937355367037516e-05,
      "loss": 0.3436,
      "step": 2120
    },
    {
      "epoch": 1.704,
      "grad_norm": 6.21875,
      "learning_rate": 4.151142018285112e-05,
      "loss": 0.3681,
      "step": 2130
    },
    {
      "epoch": 1.712,
      "grad_norm": 4.0625,
      "learning_rate": 4.1086118351150785e-05,
      "loss": 0.3716,
      "step": 2140
    },
    {
      "epoch": 1.72,
      "grad_norm": 2.703125,
      "learning_rate": 4.066148160468543e-05,
      "loss": 0.2761,
      "step": 2150
    },
    {
      "epoch": 1.728,
      "grad_norm": 3.765625,
      "learning_rate": 4.023754162658051e-05,
      "loss": 0.2904,
      "step": 2160
    },
    {
      "epoch": 1.736,
      "grad_norm": 5.4375,
      "learning_rate": 3.981433004797395e-05,
      "loss": 0.3563,
      "step": 2170
    },
    {
      "epoch": 1.744,
      "grad_norm": 4.125,
      "learning_rate": 3.939187844565616e-05,
      "loss": 0.3248,
      "step": 2180
    },
    {
      "epoch": 1.752,
      "grad_norm": 5.71875,
      "learning_rate": 3.897021833971386e-05,
      "loss": 0.3246,
      "step": 2190
    },
    {
      "epoch": 1.76,
      "grad_norm": 4.21875,
      "learning_rate": 3.8549381191178516e-05,
      "loss": 0.4073,
      "step": 2200
    },
    {
      "epoch": 1.768,
      "grad_norm": 4.4375,
      "learning_rate": 3.8129398399678814e-05,
      "loss": 0.4147,
      "step": 2210
    },
    {
      "epoch": 1.776,
      "grad_norm": 4.4375,
      "learning_rate": 3.771030130109785e-05,
      "loss": 0.2378,
      "step": 2220
    },
    {
      "epoch": 1.784,
      "grad_norm": 3.65625,
      "learning_rate": 3.729212116523518e-05,
      "loss": 0.3305,
      "step": 2230
    },
    {
      "epoch": 1.792,
      "grad_norm": 3.09375,
      "learning_rate": 3.6874889193473646e-05,
      "loss": 0.3865,
      "step": 2240
    },
    {
      "epoch": 1.8,
      "grad_norm": 4.5,
      "learning_rate": 3.64586365164514e-05,
      "loss": 0.3443,
      "step": 2250
    },
    {
      "epoch": 1.808,
      "grad_norm": 2.71875,
      "learning_rate": 3.604339419173912e-05,
      "loss": 0.2762,
      "step": 2260
    },
    {
      "epoch": 1.8159999999999998,
      "grad_norm": 5.15625,
      "learning_rate": 3.5629193201522794e-05,
      "loss": 0.3787,
      "step": 2270
    },
    {
      "epoch": 1.8239999999999998,
      "grad_norm": 5.53125,
      "learning_rate": 3.521606445029208e-05,
      "loss": 0.4157,
      "step": 2280
    },
    {
      "epoch": 1.8319999999999999,
      "grad_norm": 5.34375,
      "learning_rate": 3.480403876253432e-05,
      "loss": 0.3345,
      "step": 2290
    },
    {
      "epoch": 1.8399999999999999,
      "grad_norm": 3.3125,
      "learning_rate": 3.4393146880434845e-05,
      "loss": 0.3111,
      "step": 2300
    },
    {
      "epoch": 1.8479999999999999,
      "grad_norm": 4.03125,
      "learning_rate": 3.398341946158311e-05,
      "loss": 0.3763,
      "step": 2310
    },
    {
      "epoch": 1.8559999999999999,
      "grad_norm": 3.703125,
      "learning_rate": 3.357488707668529e-05,
      "loss": 0.3246,
      "step": 2320
    },
    {
      "epoch": 1.8639999999999999,
      "grad_norm": 4.65625,
      "learning_rate": 3.316758020728327e-05,
      "loss": 0.3852,
      "step": 2330
    },
    {
      "epoch": 1.8719999999999999,
      "grad_norm": 3.859375,
      "learning_rate": 3.276152924348046e-05,
      "loss": 0.3295,
      "step": 2340
    },
    {
      "epoch": 1.88,
      "grad_norm": 5.15625,
      "learning_rate": 3.2356764481674254e-05,
      "loss": 0.3567,
      "step": 2350
    },
    {
      "epoch": 1.888,
      "grad_norm": 3.421875,
      "learning_rate": 3.1953316122295554e-05,
      "loss": 0.3091,
      "step": 2360
    },
    {
      "epoch": 1.896,
      "grad_norm": 3.265625,
      "learning_rate": 3.1551214267555416e-05,
      "loss": 0.3847,
      "step": 2370
    },
    {
      "epoch": 1.904,
      "grad_norm": 6.09375,
      "learning_rate": 3.1150488919199124e-05,
      "loss": 0.3958,
      "step": 2380
    },
    {
      "epoch": 1.912,
      "grad_norm": 4.03125,
      "learning_rate": 3.075116997626764e-05,
      "loss": 0.384,
      "step": 2390
    },
    {
      "epoch": 1.92,
      "grad_norm": 4.0625,
      "learning_rate": 3.0353287232866736e-05,
      "loss": 0.3349,
      "step": 2400
    },
    {
      "epoch": 1.928,
      "grad_norm": 4.375,
      "learning_rate": 2.995687037594408e-05,
      "loss": 0.3801,
      "step": 2410
    },
    {
      "epoch": 1.936,
      "grad_norm": 5.09375,
      "learning_rate": 2.9561948983074174e-05,
      "loss": 0.3281,
      "step": 2420
    },
    {
      "epoch": 1.944,
      "grad_norm": 6.8125,
      "learning_rate": 2.916855252025149e-05,
      "loss": 0.3549,
      "step": 2430
    },
    {
      "epoch": 1.952,
      "grad_norm": 4.125,
      "learning_rate": 2.877671033969193e-05,
      "loss": 0.4092,
      "step": 2440
    },
    {
      "epoch": 1.96,
      "grad_norm": 4.21875,
      "learning_rate": 2.8386451677642878e-05,
      "loss": 0.3866,
      "step": 2450
    },
    {
      "epoch": 1.968,
      "grad_norm": 3.078125,
      "learning_rate": 2.7997805652201714e-05,
      "loss": 0.3484,
      "step": 2460
    },
    {
      "epoch": 1.976,
      "grad_norm": 4.09375,
      "learning_rate": 2.7610801261143283e-05,
      "loss": 0.3496,
      "step": 2470
    },
    {
      "epoch": 1.984,
      "grad_norm": 4.6875,
      "learning_rate": 2.7225467379756314e-05,
      "loss": 0.3691,
      "step": 2480
    },
    {
      "epoch": 1.992,
      "grad_norm": 5.5,
      "learning_rate": 2.6841832758689002e-05,
      "loss": 0.3698,
      "step": 2490
    },
    {
      "epoch": 2.0,
      "grad_norm": 3.875,
      "learning_rate": 2.645992602180377e-05,
      "loss": 0.3577,
      "step": 2500
    },
    {
      "epoch": 2.008,
      "grad_norm": 4.40625,
      "learning_rate": 2.607977566404164e-05,
      "loss": 0.2871,
      "step": 2510
    },
    {
      "epoch": 2.016,
      "grad_norm": 4.0625,
      "learning_rate": 2.570141004929612e-05,
      "loss": 0.3426,
      "step": 2520
    },
    {
      "epoch": 2.024,
      "grad_norm": 3.5,
      "learning_rate": 2.5324857408296994e-05,
      "loss": 0.2656,
      "step": 2530
    },
    {
      "epoch": 2.032,
      "grad_norm": 3.578125,
      "learning_rate": 2.4950145836503836e-05,
      "loss": 0.3473,
      "step": 2540
    },
    {
      "epoch": 2.04,
      "grad_norm": 3.953125,
      "learning_rate": 2.4577303292009822e-05,
      "loss": 0.3588,
      "step": 2550
    },
    {
      "epoch": 2.048,
      "grad_norm": 4.75,
      "learning_rate": 2.4206357593455743e-05,
      "loss": 0.3953,
      "step": 2560
    },
    {
      "epoch": 2.056,
      "grad_norm": 3.96875,
      "learning_rate": 2.383733641795428e-05,
      "loss": 0.3209,
      "step": 2570
    },
    {
      "epoch": 2.064,
      "grad_norm": 2.515625,
      "learning_rate": 2.3470267299025068e-05,
      "loss": 0.3299,
      "step": 2580
    },
    {
      "epoch": 2.072,
      "grad_norm": 3.375,
      "learning_rate": 2.3105177624540252e-05,
      "loss": 0.2311,
      "step": 2590
    },
    {
      "epoch": 2.08,
      "grad_norm": 4.46875,
      "learning_rate": 2.274209463468117e-05,
      "loss": 0.3035,
      "step": 2600
    },
    {
      "epoch": 2.088,
      "grad_norm": 2.578125,
      "learning_rate": 2.2381045419905655e-05,
      "loss": 0.3344,
      "step": 2610
    },
    {
      "epoch": 2.096,
      "grad_norm": 5.28125,
      "learning_rate": 2.2022056918927037e-05,
      "loss": 0.2794,
      "step": 2620
    },
    {
      "epoch": 2.104,
      "grad_norm": 5.25,
      "learning_rate": 2.166515591670394e-05,
      "loss": 0.3416,
      "step": 2630
    },
    {
      "epoch": 2.112,
      "grad_norm": 3.96875,
      "learning_rate": 2.1310369042441985e-05,
      "loss": 0.3152,
      "step": 2640
    },
    {
      "epoch": 2.12,
      "grad_norm": 3.203125,
      "learning_rate": 2.0957722767606774e-05,
      "loss": 0.3015,
      "step": 2650
    },
    {
      "epoch": 2.128,
      "grad_norm": 3.78125,
      "learning_rate": 2.0607243403948863e-05,
      "loss": 0.3843,
      "step": 2660
    },
    {
      "epoch": 2.136,
      "grad_norm": 6.34375,
      "learning_rate": 2.0258957101540625e-05,
      "loss": 0.3299,
      "step": 2670
    },
    {
      "epoch": 2.144,
      "grad_norm": 4.1875,
      "learning_rate": 1.9912889846825038e-05,
      "loss": 0.3636,
      "step": 2680
    },
    {
      "epoch": 2.152,
      "grad_norm": 3.6875,
      "learning_rate": 1.956906746067683e-05,
      "loss": 0.3596,
      "step": 2690
    },
    {
      "epoch": 2.16,
      "grad_norm": 2.984375,
      "learning_rate": 1.922751559647591e-05,
      "loss": 0.3796,
      "step": 2700
    },
    {
      "epoch": 2.168,
      "grad_norm": 3.21875,
      "learning_rate": 1.888825973819336e-05,
      "loss": 0.3175,
      "step": 2710
    },
    {
      "epoch": 2.176,
      "grad_norm": 4.03125,
      "learning_rate": 1.8551325198489887e-05,
      "loss": 0.2928,
      "step": 2720
    },
    {
      "epoch": 2.184,
      "grad_norm": 4.1875,
      "learning_rate": 1.8216737116827378e-05,
      "loss": 0.2791,
      "step": 2730
    },
    {
      "epoch": 2.192,
      "grad_norm": 5.71875,
      "learning_rate": 1.7884520457592984e-05,
      "loss": 0.3925,
      "step": 2740
    },
    {
      "epoch": 2.2,
      "grad_norm": 3.28125,
      "learning_rate": 1.755470000823667e-05,
      "loss": 0.2967,
      "step": 2750
    },
    {
      "epoch": 2.208,
      "grad_norm": 2.203125,
      "learning_rate": 1.7227300377421574e-05,
      "loss": 0.2475,
      "step": 2760
    },
    {
      "epoch": 2.216,
      "grad_norm": 3.515625,
      "learning_rate": 1.6902345993188017e-05,
      "loss": 0.34,
      "step": 2770
    },
    {
      "epoch": 2.224,
      "grad_norm": 3.453125,
      "learning_rate": 1.6579861101130896e-05,
      "loss": 0.3418,
      "step": 2780
    },
    {
      "epoch": 2.232,
      "grad_norm": 6.5,
      "learning_rate": 1.6259869762590503e-05,
      "loss": 0.4639,
      "step": 2790
    },
    {
      "epoch": 2.24,
      "grad_norm": 5.0625,
      "learning_rate": 1.5942395852857466e-05,
      "loss": 0.4252,
      "step": 2800
    },
    {
      "epoch": 2.248,
      "grad_norm": 4.71875,
      "learning_rate": 1.5627463059391173e-05,
      "loss": 0.3562,
      "step": 2810
    },
    {
      "epoch": 2.2560000000000002,
      "grad_norm": 3.78125,
      "learning_rate": 1.531509488005257e-05,
      "loss": 0.2792,
      "step": 2820
    },
    {
      "epoch": 2.2640000000000002,
      "grad_norm": 5.25,
      "learning_rate": 1.5005314621350709e-05,
      "loss": 0.2659,
      "step": 2830
    },
    {
      "epoch": 2.2720000000000002,
      "grad_norm": 4.03125,
      "learning_rate": 1.4698145396704044e-05,
      "loss": 0.2647,
      "step": 2840
    },
    {
      "epoch": 2.2800000000000002,
      "grad_norm": 5.3125,
      "learning_rate": 1.4393610124715696e-05,
      "loss": 0.2826,
      "step": 2850
    },
    {
      "epoch": 2.288,
      "grad_norm": 4.59375,
      "learning_rate": 1.4091731527463526e-05,
      "loss": 0.2643,
      "step": 2860
    },
    {
      "epoch": 2.296,
      "grad_norm": 5.75,
      "learning_rate": 1.3792532128804803e-05,
      "loss": 0.3758,
      "step": 2870
    },
    {
      "epoch": 2.304,
      "grad_norm": 6.25,
      "learning_rate": 1.3496034252695599e-05,
      "loss": 0.3,
      "step": 2880
    },
    {
      "epoch": 2.312,
      "grad_norm": 5.1875,
      "learning_rate": 1.3202260021525158e-05,
      "loss": 0.3376,
      "step": 2890
    },
    {
      "epoch": 2.32,
      "grad_norm": 4.90625,
      "learning_rate": 1.2911231354465303e-05,
      "loss": 0.3686,
      "step": 2900
    },
    {
      "epoch": 2.328,
      "grad_norm": 3.859375,
      "learning_rate": 1.262296996583504e-05,
      "loss": 0.3372,
      "step": 2910
    },
    {
      "epoch": 2.336,
      "grad_norm": 3.421875,
      "learning_rate": 1.2337497363480317e-05,
      "loss": 0.3071,
      "step": 2920
    },
    {
      "epoch": 2.344,
      "grad_norm": 4.15625,
      "learning_rate": 1.2054834847169316e-05,
      "loss": 0.3724,
      "step": 2930
    },
    {
      "epoch": 2.352,
      "grad_norm": 4.21875,
      "learning_rate": 1.1775003507003236e-05,
      "loss": 0.2919,
      "step": 2940
    },
    {
      "epoch": 2.36,
      "grad_norm": 4.0625,
      "learning_rate": 1.1498024221842735e-05,
      "loss": 0.2496,
      "step": 2950
    },
    {
      "epoch": 2.368,
      "grad_norm": 5.28125,
      "learning_rate": 1.1223917657750033e-05,
      "loss": 0.265,
      "step": 2960
    },
    {
      "epoch": 2.376,
      "grad_norm": 6.03125,
      "learning_rate": 1.095270426644705e-05,
      "loss": 0.3083,
      "step": 2970
    },
    {
      "epoch": 2.384,
      "grad_norm": 4.25,
      "learning_rate": 1.0684404283789385e-05,
      "loss": 0.3392,
      "step": 2980
    },
    {
      "epoch": 2.392,
      "grad_norm": 3.765625,
      "learning_rate": 1.0419037728256564e-05,
      "loss": 0.3743,
      "step": 2990
    },
    {
      "epoch": 2.4,
      "grad_norm": 4.5625,
      "learning_rate": 1.015662439945832e-05,
      "loss": 0.3846,
      "step": 3000
    },
    {
      "epoch": 2.408,
      "grad_norm": 4.28125,
      "learning_rate": 9.89718387665734e-06,
      "loss": 0.2818,
      "step": 3010
    },
    {
      "epoch": 2.416,
      "grad_norm": 4.1875,
      "learning_rate": 9.640735517308435e-06,
      "loss": 0.3442,
      "step": 3020
    },
    {
      "epoch": 2.424,
      "grad_norm": 5.28125,
      "learning_rate": 9.387298455614191e-06,
      "loss": 0.2982,
      "step": 3030
    },
    {
      "epoch": 2.432,
      "grad_norm": 7.5,
      "learning_rate": 9.136891601097347e-06,
      "loss": 0.3924,
      "step": 3040
    },
    {
      "epoch": 2.44,
      "grad_norm": 3.15625,
      "learning_rate": 8.889533637189895e-06,
      "loss": 0.2838,
      "step": 3050
    },
    {
      "epoch": 2.448,
      "grad_norm": 6.125,
      "learning_rate": 8.645243019839112e-06,
      "loss": 0.3035,
      "step": 3060
    },
    {
      "epoch": 2.456,
      "grad_norm": 3.609375,
      "learning_rate": 8.404037976130458e-06,
      "loss": 0.3713,
      "step": 3070
    },
    {
      "epoch": 2.464,
      "grad_norm": 4.84375,
      "learning_rate": 8.16593650292764e-06,
      "loss": 0.3242,
      "step": 3080
    },
    {
      "epoch": 2.472,
      "grad_norm": 3.21875,
      "learning_rate": 7.930956365529818e-06,
      "loss": 0.3214,
      "step": 3090
    },
    {
      "epoch": 2.48,
      "grad_norm": 2.953125,
      "learning_rate": 7.699115096346139e-06,
      "loss": 0.3072,
      "step": 3100
    },
    {
      "epoch": 2.488,
      "grad_norm": 4.5,
      "learning_rate": 7.4704299935875185e-06,
      "loss": 0.2528,
      "step": 3110
    },
    {
      "epoch": 2.496,
      "grad_norm": 4.28125,
      "learning_rate": 7.244918119976035e-06,
      "loss": 0.3366,
      "step": 3120
    },
    {
      "epoch": 2.504,
      "grad_norm": 5.53125,
      "learning_rate": 7.022596301471868e-06,
      "loss": 0.3603,
      "step": 3130
    },
    {
      "epoch": 2.512,
      "grad_norm": 4.28125,
      "learning_rate": 6.803481126017808e-06,
      "loss": 0.2996,
      "step": 3140
    },
    {
      "epoch": 2.52,
      "grad_norm": 3.90625,
      "learning_rate": 6.587588942301626e-06,
      "loss": 0.3519,
      "step": 3150
    },
    {
      "epoch": 2.528,
      "grad_norm": 5.3125,
      "learning_rate": 6.374935858536257e-06,
      "loss": 0.2668,
      "step": 3160
    },
    {
      "epoch": 2.536,
      "grad_norm": 5.25,
      "learning_rate": 6.165537741257971e-06,
      "loss": 0.3093,
      "step": 3170
    },
    {
      "epoch": 2.544,
      "grad_norm": 5.0,
      "learning_rate": 5.959410214142419e-06,
      "loss": 0.3223,
      "step": 3180
    },
    {
      "epoch": 2.552,
      "grad_norm": 5.15625,
      "learning_rate": 5.756568656839056e-06,
      "loss": 0.4137,
      "step": 3190
    },
    {
      "epoch": 2.56,
      "grad_norm": 5.78125,
      "learning_rate": 5.557028203823522e-06,
      "loss": 0.3785,
      "step": 3200
    },
    {
      "epoch": 2.568,
      "grad_norm": 3.59375,
      "learning_rate": 5.360803743268494e-06,
      "loss": 0.3343,
      "step": 3210
    },
    {
      "epoch": 2.576,
      "grad_norm": 4.96875,
      "learning_rate": 5.167909915932801e-06,
      "loss": 0.3217,
      "step": 3220
    },
    {
      "epoch": 2.584,
      "grad_norm": 3.6875,
      "learning_rate": 4.9783611140690415e-06,
      "loss": 0.3157,
      "step": 3230
    },
    {
      "epoch": 2.592,
      "grad_norm": 3.84375,
      "learning_rate": 4.7921714803498165e-06,
      "loss": 0.2983,
      "step": 3240
    },
    {
      "epoch": 2.6,
      "grad_norm": 5.3125,
      "learning_rate": 4.609354906812374e-06,
      "loss": 0.3362,
      "step": 3250
    },
    {
      "epoch": 2.608,
      "grad_norm": 3.953125,
      "learning_rate": 4.429925033822252e-06,
      "loss": 0.3844,
      "step": 3260
    },
    {
      "epoch": 2.616,
      "grad_norm": 4.28125,
      "learning_rate": 4.253895249055412e-06,
      "loss": 0.2974,
      "step": 3270
    },
    {
      "epoch": 2.624,
      "grad_norm": 3.734375,
      "learning_rate": 4.0812786864994566e-06,
      "loss": 0.3442,
      "step": 3280
    },
    {
      "epoch": 2.632,
      "grad_norm": 3.171875,
      "learning_rate": 3.912088225473537e-06,
      "loss": 0.3572,
      "step": 3290
    },
    {
      "epoch": 2.64,
      "grad_norm": 4.59375,
      "learning_rate": 3.7463364896675735e-06,
      "loss": 0.3092,
      "step": 3300
    },
    {
      "epoch": 2.648,
      "grad_norm": 4.03125,
      "learning_rate": 3.584035846200201e-06,
      "loss": 0.3093,
      "step": 3310
    },
    {
      "epoch": 2.656,
      "grad_norm": 3.8125,
      "learning_rate": 3.425198404696178e-06,
      "loss": 0.3035,
      "step": 3320
    },
    {
      "epoch": 2.664,
      "grad_norm": 4.90625,
      "learning_rate": 3.2698360163827325e-06,
      "loss": 0.3166,
      "step": 3330
    },
    {
      "epoch": 2.672,
      "grad_norm": 4.03125,
      "learning_rate": 3.1179602732053947e-06,
      "loss": 0.2739,
      "step": 3340
    },
    {
      "epoch": 2.68,
      "grad_norm": 6.4375,
      "learning_rate": 2.969582506963098e-06,
      "loss": 0.3551,
      "step": 3350
    },
    {
      "epoch": 2.6879999999999997,
      "grad_norm": 4.625,
      "learning_rate": 2.824713788462602e-06,
      "loss": 0.3293,
      "step": 3360
    },
    {
      "epoch": 2.6959999999999997,
      "grad_norm": 3.71875,
      "learning_rate": 2.6833649266925943e-06,
      "loss": 0.3278,
      "step": 3370
    },
    {
      "epoch": 2.7039999999999997,
      "grad_norm": 3.984375,
      "learning_rate": 2.5455464680171126e-06,
      "loss": 0.2763,
      "step": 3380
    },
    {
      "epoch": 2.7119999999999997,
      "grad_norm": 6.125,
      "learning_rate": 2.411268695388719e-06,
      "loss": 0.3378,
      "step": 3390
    },
    {
      "epoch": 2.7199999999999998,
      "grad_norm": 5.09375,
      "learning_rate": 2.28054162758119e-06,
      "loss": 0.2644,
      "step": 3400
    },
    {
      "epoch": 2.7279999999999998,
      "grad_norm": 4.09375,
      "learning_rate": 2.1533750184420832e-06,
      "loss": 0.3154,
      "step": 3410
    },
    {
      "epoch": 2.7359999999999998,
      "grad_norm": 5.28125,
      "learning_rate": 2.0297783561649244e-06,
      "loss": 0.2217,
      "step": 3420
    },
    {
      "epoch": 2.7439999999999998,
      "grad_norm": 5.59375,
      "learning_rate": 1.9097608625812726e-06,
      "loss": 0.3446,
      "step": 3430
    },
    {
      "epoch": 2.752,
      "grad_norm": 5.25,
      "learning_rate": 1.7933314924726886e-06,
      "loss": 0.387,
      "step": 3440
    },
    {
      "epoch": 2.76,
      "grad_norm": 5.1875,
      "learning_rate": 1.6804989329025521e-06,
      "loss": 0.3531,
      "step": 3450
    },
    {
      "epoch": 2.768,
      "grad_norm": 5.9375,
      "learning_rate": 1.5712716025679587e-06,
      "loss": 0.2906,
      "step": 3460
    },
    {
      "epoch": 2.776,
      "grad_norm": 5.28125,
      "learning_rate": 1.4656576511715204e-06,
      "loss": 0.2759,
      "step": 3470
    },
    {
      "epoch": 2.784,
      "grad_norm": 4.0625,
      "learning_rate": 1.3636649588133432e-06,
      "loss": 0.3646,
      "step": 3480
    },
    {
      "epoch": 2.792,
      "grad_norm": 4.4375,
      "learning_rate": 1.265301135403052e-06,
      "loss": 0.3467,
      "step": 3490
    },
    {
      "epoch": 2.8,
      "grad_norm": 2.71875,
      "learning_rate": 1.1705735200920053e-06,
      "loss": 0.2817,
      "step": 3500
    },
    {
      "epoch": 2.808,
      "grad_norm": 5.4375,
      "learning_rate": 1.0794891807256956e-06,
      "loss": 0.3304,
      "step": 3510
    },
    {
      "epoch": 2.816,
      "grad_norm": 3.53125,
      "learning_rate": 9.920549133164314e-07,
      "loss": 0.3544,
      "step": 3520
    },
    {
      "epoch": 2.824,
      "grad_norm": 6.21875,
      "learning_rate": 9.08277241536215e-07,
      "loss": 0.3082,
      "step": 3530
    },
    {
      "epoch": 2.832,
      "grad_norm": 3.984375,
      "learning_rate": 8.281624162300494e-07,
      "loss": 0.2201,
      "step": 3540
    },
    {
      "epoch": 2.84,
      "grad_norm": 6.9375,
      "learning_rate": 7.517164149495326e-07,
      "loss": 0.2885,
      "step": 3550
    },
    {
      "epoch": 2.848,
      "grad_norm": 5.625,
      "learning_rate": 6.789449415068316e-07,
      "loss": 0.2716,
      "step": 3560
    },
    {
      "epoch": 2.856,
      "grad_norm": 5.46875,
      "learning_rate": 6.098534255491561e-07,
      "loss": 0.2723,
      "step": 3570
    },
    {
      "epoch": 2.864,
      "grad_norm": 2.765625,
      "learning_rate": 5.44447022153588e-07,
      "loss": 0.3245,
      "step": 3580
    },
    {
      "epoch": 2.872,
      "grad_norm": 5.34375,
      "learning_rate": 4.827306114425056e-07,
      "loss": 0.2905,
      "step": 3590
    },
    {
      "epoch": 2.88,
      "grad_norm": 5.59375,
      "learning_rate": 4.2470879821941423e-07,
      "loss": 0.3986,
      "step": 3600
    },
    {
      "epoch": 2.888,
      "grad_norm": 3.203125,
      "learning_rate": 3.703859116254038e-07,
      "loss": 0.3328,
      "step": 3610
    },
    {
      "epoch": 2.896,
      "grad_norm": 5.875,
      "learning_rate": 3.197660048161133e-07,
      "loss": 0.2893,
      "step": 3620
    },
    {
      "epoch": 2.904,
      "grad_norm": 4.875,
      "learning_rate": 2.728528546593667e-07,
      "loss": 0.3573,
      "step": 3630
    },
    {
      "epoch": 2.912,
      "grad_norm": 3.921875,
      "learning_rate": 2.2964996145330986e-07,
      "loss": 0.2721,
      "step": 3640
    },
    {
      "epoch": 2.92,
      "grad_norm": 3.296875,
      "learning_rate": 1.9016054866528576e-07,
      "loss": 0.2943,
      "step": 3650
    },
    {
      "epoch": 2.928,
      "grad_norm": 3.84375,
      "learning_rate": 1.5438756269130495e-07,
      "loss": 0.3179,
      "step": 3660
    },
    {
      "epoch": 2.936,
      "grad_norm": 3.453125,
      "learning_rate": 1.223336726362323e-07,
      "loss": 0.3203,
      "step": 3670
    },
    {
      "epoch": 2.944,
      "grad_norm": 3.34375,
      "learning_rate": 9.400127011461312e-08,
      "loss": 0.3184,
      "step": 3680
    },
    {
      "epoch": 2.952,
      "grad_norm": 4.625,
      "learning_rate": 6.939246907222696e-08,
      "loss": 0.3581,
      "step": 3690
    },
    {
      "epoch": 2.96,
      "grad_norm": 4.5,
      "learning_rate": 4.850910562839151e-08,
      "loss": 0.3222,
      "step": 3700
    },
    {
      "epoch": 2.968,
      "grad_norm": 6.59375,
      "learning_rate": 3.135273793893889e-08,
      "loss": 0.2907,
      "step": 3710
    },
    {
      "epoch": 2.976,
      "grad_norm": 3.5625,
      "learning_rate": 1.7924646079964248e-08,
      "loss": 0.3589,
      "step": 3720
    },
    {
      "epoch": 2.984,
      "grad_norm": 4.625,
      "learning_rate": 8.225831952324292e-09,
      "loss": 0.3172,
      "step": 3730
    },
    {
      "epoch": 2.992,
      "grad_norm": 5.75,
      "learning_rate": 2.257019206874933e-09,
      "loss": 0.29,
      "step": 3740
    },
    {
      "epoch": 3.0,
      "grad_norm": 3.109375,
      "learning_rate": 1.8653190470008242e-11,
      "loss": 0.2897,
      "step": 3750
    },
    {
      "epoch": 3.0,
      "step": 3750,
      "total_flos": 6.386198055566157e+17,
      "train_loss": 0.40305233942667645,
      "train_runtime": 8900.4049,
      "train_samples_per_second": 6.741,
      "train_steps_per_second": 0.421
    }
  ],
  "logging_steps": 10,
  "max_steps": 3750,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 6.386198055566157e+17,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}