{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.99904,
  "eval_steps": 500,
  "global_step": 3124,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0064,
      "grad_norm": 1.6328125,
      "learning_rate": 2.7000000000000002e-05,
      "loss": 1.239,
      "step": 10
    },
    {
      "epoch": 0.0128,
      "grad_norm": 2.328125,
      "learning_rate": 2.99993816906306e-05,
      "loss": 1.1046,
      "step": 20
    },
    {
      "epoch": 0.0192,
      "grad_norm": 1.78125,
      "learning_rate": 2.999724439035231e-05,
      "loss": 1.1791,
      "step": 30
    },
    {
      "epoch": 0.0256,
      "grad_norm": 1.78125,
      "learning_rate": 2.9993580683196947e-05,
      "loss": 1.213,
      "step": 40
    },
    {
      "epoch": 0.032,
      "grad_norm": 1.875,
      "learning_rate": 2.9988390942053532e-05,
      "loss": 1.2061,
      "step": 50
    },
    {
      "epoch": 0.0384,
      "grad_norm": 2.125,
      "learning_rate": 2.998167569512954e-05,
      "loss": 1.1253,
      "step": 60
    },
    {
      "epoch": 0.0448,
      "grad_norm": 2.53125,
      "learning_rate": 2.997343562589714e-05,
      "loss": 1.1766,
      "step": 70
    },
    {
      "epoch": 0.0512,
      "grad_norm": 2.0,
      "learning_rate": 2.9963671573023613e-05,
      "loss": 1.1442,
      "step": 80
    },
    {
      "epoch": 0.0576,
      "grad_norm": 3.171875,
      "learning_rate": 2.9952384530286028e-05,
      "loss": 1.0799,
      "step": 90
    },
    {
      "epoch": 0.064,
      "grad_norm": 2.09375,
      "learning_rate": 2.9939575646470072e-05,
      "loss": 1.108,
      "step": 100
    },
    {
      "epoch": 0.0704,
      "grad_norm": 2.65625,
      "learning_rate": 2.9925246225253124e-05,
      "loss": 1.1166,
      "step": 110
    },
    {
      "epoch": 0.0768,
      "grad_norm": 1.9375,
      "learning_rate": 2.9909397725071577e-05,
      "loss": 1.1037,
      "step": 120
    },
    {
      "epoch": 0.0832,
      "grad_norm": 2.359375,
      "learning_rate": 2.9892031758972416e-05,
      "loss": 1.1854,
      "step": 130
    },
    {
      "epoch": 0.0896,
      "grad_norm": 1.6171875,
      "learning_rate": 2.9873150094449012e-05,
      "loss": 1.1581,
      "step": 140
    },
    {
      "epoch": 0.096,
      "grad_norm": 3.859375,
      "learning_rate": 2.9852754653261236e-05,
      "loss": 1.1411,
      "step": 150
    },
    {
      "epoch": 0.1024,
      "grad_norm": 2.796875,
      "learning_rate": 2.9830847511239886e-05,
      "loss": 1.2032,
      "step": 160
    },
    {
      "epoch": 0.1088,
      "grad_norm": 1.75,
      "learning_rate": 2.980743089807539e-05,
      "loss": 1.1435,
      "step": 170
    },
    {
      "epoch": 0.1152,
      "grad_norm": 2.203125,
      "learning_rate": 2.9782507197090873e-05,
      "loss": 1.105,
      "step": 180
    },
    {
      "epoch": 0.1216,
      "grad_norm": 2.78125,
      "learning_rate": 2.9756078944999583e-05,
      "loss": 1.1362,
      "step": 190
    },
    {
      "epoch": 0.128,
      "grad_norm": 1.9453125,
      "learning_rate": 2.972814883164671e-05,
      "loss": 1.0945,
      "step": 200
    },
    {
      "epoch": 0.1344,
      "grad_norm": 1.9609375,
      "learning_rate": 2.9698719699735635e-05,
      "loss": 1.138,
      "step": 210
    },
    {
      "epoch": 0.1408,
      "grad_norm": 2.609375,
      "learning_rate": 2.9667794544538547e-05,
      "loss": 1.1012,
      "step": 220
    },
    {
      "epoch": 0.1472,
      "grad_norm": 2.21875,
      "learning_rate": 2.963537651359166e-05,
      "loss": 1.1816,
      "step": 230
    },
    {
      "epoch": 0.1536,
      "grad_norm": 2.078125,
      "learning_rate": 2.9601468906374795e-05,
      "loss": 1.1537,
      "step": 240
    },
    {
      "epoch": 0.16,
      "grad_norm": 1.921875,
      "learning_rate": 2.9566075173975597e-05,
      "loss": 1.1143,
      "step": 250
    },
    {
      "epoch": 0.1664,
      "grad_norm": 1.703125,
      "learning_rate": 2.9529198918738284e-05,
      "loss": 1.1162,
      "step": 260
    },
    {
      "epoch": 0.1728,
      "grad_norm": 2.296875,
      "learning_rate": 2.949084389389699e-05,
      "loss": 1.1628,
      "step": 270
    },
    {
      "epoch": 0.1792,
      "grad_norm": 2.15625,
      "learning_rate": 2.9451014003193773e-05,
      "loss": 1.1051,
      "step": 280
    },
    {
      "epoch": 0.1856,
      "grad_norm": 2.4375,
      "learning_rate": 2.9409713300481292e-05,
      "loss": 1.117,
      "step": 290
    },
    {
      "epoch": 0.192,
      "grad_norm": 2.484375,
      "learning_rate": 2.9366945989310225e-05,
      "loss": 1.1111,
      "step": 300
    },
    {
      "epoch": 0.1984,
      "grad_norm": 2.234375,
      "learning_rate": 2.9322716422501393e-05,
      "loss": 1.1349,
      "step": 310
    },
    {
      "epoch": 0.2048,
      "grad_norm": 1.921875,
      "learning_rate": 2.9277029101702786e-05,
      "loss": 1.0544,
      "step": 320
    },
    {
      "epoch": 0.2112,
      "grad_norm": 1.8046875,
      "learning_rate": 2.9229888676931357e-05,
      "loss": 1.1074,
      "step": 330
    },
    {
      "epoch": 0.2176,
      "grad_norm": 1.828125,
      "learning_rate": 2.9181299946099754e-05,
      "loss": 1.1687,
      "step": 340
    },
    {
      "epoch": 0.224,
      "grad_norm": 1.71875,
      "learning_rate": 2.9131267854527993e-05,
      "loss": 1.1373,
      "step": 350
    },
    {
      "epoch": 0.2304,
      "grad_norm": 2.0625,
      "learning_rate": 2.9079797494440136e-05,
      "loss": 1.1449,
      "step": 360
    },
    {
      "epoch": 0.2368,
      "grad_norm": 2.703125,
      "learning_rate": 2.9026894104445985e-05,
      "loss": 1.0586,
      "step": 370
    },
    {
      "epoch": 0.2432,
      "grad_norm": 2.21875,
      "learning_rate": 2.8972563069007933e-05,
      "loss": 1.1321,
      "step": 380
    },
    {
      "epoch": 0.2496,
      "grad_norm": 2.234375,
      "learning_rate": 2.891680991789291e-05,
      "loss": 1.0835,
      "step": 390
    },
    {
      "epoch": 0.256,
      "grad_norm": 2.03125,
      "learning_rate": 2.8859640325609582e-05,
      "loss": 1.0869,
      "step": 400
    },
    {
      "epoch": 0.2624,
      "grad_norm": 1.78125,
      "learning_rate": 2.8801060110830807e-05,
      "loss": 1.0757,
      "step": 410
    },
    {
      "epoch": 0.2688,
      "grad_norm": 2.046875,
      "learning_rate": 2.8741075235801406e-05,
      "loss": 1.077,
      "step": 420
    },
    {
      "epoch": 0.2752,
      "grad_norm": 2.0625,
      "learning_rate": 2.8679691805731334e-05,
      "loss": 1.1234,
      "step": 430
    },
    {
      "epoch": 0.2816,
      "grad_norm": 1.9296875,
      "learning_rate": 2.8616916068174298e-05,
      "loss": 1.1914,
      "step": 440
    },
    {
      "epoch": 0.288,
      "grad_norm": 2.265625,
      "learning_rate": 2.8552754412391885e-05,
      "loss": 1.175,
      "step": 450
    },
    {
      "epoch": 0.2944,
      "grad_norm": 2.75,
      "learning_rate": 2.8487213368703274e-05,
      "loss": 1.1524,
      "step": 460
    },
    {
      "epoch": 0.3008,
      "grad_norm": 1.9375,
      "learning_rate": 2.8420299607820574e-05,
      "loss": 1.0483,
      "step": 470
    },
    {
      "epoch": 0.3072,
      "grad_norm": 2.28125,
      "learning_rate": 2.8352019940169903e-05,
      "loss": 1.1059,
      "step": 480
    },
    {
      "epoch": 0.3136,
      "grad_norm": 2.359375,
      "learning_rate": 2.8282381315198205e-05,
      "loss": 1.2019,
      "step": 490
    },
    {
      "epoch": 0.32,
      "grad_norm": 1.8125,
      "learning_rate": 2.8211390820665962e-05,
      "loss": 1.1448,
      "step": 500
    },
    {
      "epoch": 0.3264,
      "grad_norm": 1.9375,
      "learning_rate": 2.8139055681925785e-05,
      "loss": 1.166,
      "step": 510
    },
    {
      "epoch": 0.3328,
      "grad_norm": 1.9375,
      "learning_rate": 2.8065383261187055e-05,
      "loss": 1.1071,
      "step": 520
    },
    {
      "epoch": 0.3392,
      "grad_norm": 1.8125,
      "learning_rate": 2.7990381056766583e-05,
      "loss": 1.0663,
      "step": 530
    },
    {
      "epoch": 0.3456,
      "grad_norm": 2.359375,
      "learning_rate": 2.7914056702325426e-05,
      "loss": 1.0808,
      "step": 540
    },
    {
      "epoch": 0.352,
      "grad_norm": 1.9609375,
      "learning_rate": 2.783641796609197e-05,
      "loss": 1.0989,
      "step": 550
    },
    {
      "epoch": 0.3584,
      "grad_norm": 1.78125,
      "learning_rate": 2.7757472750071263e-05,
      "loss": 1.1476,
      "step": 560
    },
    {
      "epoch": 0.3648,
      "grad_norm": 2.5625,
      "learning_rate": 2.7677229089240768e-05,
      "loss": 1.1165,
      "step": 570
    },
    {
      "epoch": 0.3712,
      "grad_norm": 2.03125,
      "learning_rate": 2.7595695150732566e-05,
      "loss": 1.1196,
      "step": 580
    },
    {
      "epoch": 0.3776,
      "grad_norm": 2.015625,
      "learning_rate": 2.75128792330021e-05,
      "loss": 1.1399,
      "step": 590
    },
    {
      "epoch": 0.384,
      "grad_norm": 2.328125,
      "learning_rate": 2.7428789764983587e-05,
      "loss": 1.1192,
      "step": 600
    },
    {
      "epoch": 0.3904,
      "grad_norm": 2.21875,
      "learning_rate": 2.7343435305232116e-05,
      "loss": 1.1318,
      "step": 610
    },
    {
      "epoch": 0.3968,
      "grad_norm": 1.796875,
      "learning_rate": 2.7256824541052566e-05,
      "loss": 1.2052,
      "step": 620
    },
    {
      "epoch": 0.4032,
      "grad_norm": 1.984375,
      "learning_rate": 2.7168966287615415e-05,
      "loss": 1.1994,
      "step": 630
    },
    {
      "epoch": 0.4096,
      "grad_norm": 2.0,
      "learning_rate": 2.7079869487059554e-05,
      "loss": 1.0983,
      "step": 640
    },
    {
      "epoch": 0.416,
      "grad_norm": 2.359375,
      "learning_rate": 2.6989543207582145e-05,
      "loss": 1.0591,
      "step": 650
    },
    {
      "epoch": 0.4224,
      "grad_norm": 2.09375,
      "learning_rate": 2.6897996642515685e-05,
      "loss": 1.1056,
      "step": 660
    },
    {
      "epoch": 0.4288,
      "grad_norm": 2.28125,
      "learning_rate": 2.6805239109392308e-05,
      "loss": 1.1212,
      "step": 670
    },
    {
      "epoch": 0.4352,
      "grad_norm": 1.84375,
      "learning_rate": 2.671128004899545e-05,
      "loss": 1.1195,
      "step": 680
    },
    {
      "epoch": 0.4416,
      "grad_norm": 2.078125,
      "learning_rate": 2.661612902439898e-05,
      "loss": 1.139,
      "step": 690
    },
    {
      "epoch": 0.448,
      "grad_norm": 2.0625,
      "learning_rate": 2.651979571999388e-05,
      "loss": 1.1123,
      "step": 700
    },
    {
      "epoch": 0.4544,
      "grad_norm": 1.8515625,
      "learning_rate": 2.6422289940502575e-05,
      "loss": 1.1357,
      "step": 710
    },
    {
      "epoch": 0.4608,
      "grad_norm": 2.375,
      "learning_rate": 2.6323621609981003e-05,
      "loss": 1.136,
      "step": 720
    },
    {
      "epoch": 0.4672,
      "grad_norm": 1.8671875,
      "learning_rate": 2.6223800770808576e-05,
      "loss": 1.1279,
      "step": 730
    },
    {
      "epoch": 0.4736,
      "grad_norm": 2.265625,
      "learning_rate": 2.6122837582666065e-05,
      "loss": 1.1178,
      "step": 740
    },
    {
      "epoch": 0.48,
      "grad_norm": 1.96875,
      "learning_rate": 2.6020742321501548e-05,
      "loss": 1.1197,
      "step": 750
    },
    {
      "epoch": 0.4864,
      "grad_norm": 2.140625,
      "learning_rate": 2.5917525378484542e-05,
      "loss": 1.1162,
      "step": 760
    },
    {
      "epoch": 0.4928,
      "grad_norm": 1.828125,
      "learning_rate": 2.581319725894839e-05,
      "loss": 1.089,
      "step": 770
    },
    {
      "epoch": 0.4992,
      "grad_norm": 1.9765625,
      "learning_rate": 2.5707768581321067e-05,
      "loss": 1.1866,
      "step": 780
    },
    {
      "epoch": 0.5056,
      "grad_norm": 2.109375,
      "learning_rate": 2.560125007604439e-05,
      "loss": 1.152,
      "step": 790
    },
    {
      "epoch": 0.512,
      "grad_norm": 2.984375,
      "learning_rate": 2.5493652584481944e-05,
      "loss": 1.1716,
      "step": 800
    },
    {
      "epoch": 0.5184,
      "grad_norm": 2.3125,
      "learning_rate": 2.53849870578156e-05,
      "loss": 1.0954,
      "step": 810
    },
    {
      "epoch": 0.5248,
      "grad_norm": 2.578125,
      "learning_rate": 2.5275264555930956e-05,
      "loss": 1.0561,
      "step": 820
    },
    {
      "epoch": 0.5312,
      "grad_norm": 1.96875,
      "learning_rate": 2.5164496246291653e-05,
      "loss": 1.1518,
      "step": 830
    },
    {
      "epoch": 0.5376,
      "grad_norm": 2.609375,
      "learning_rate": 2.505269340280277e-05,
      "loss": 1.1361,
      "step": 840
    },
    {
      "epoch": 0.544,
      "grad_norm": 2.53125,
      "learning_rate": 2.4939867404663354e-05,
      "loss": 1.1038,
      "step": 850
    },
    {
      "epoch": 0.5504,
      "grad_norm": 2.546875,
      "learning_rate": 2.4826029735208273e-05,
      "loss": 1.2142,
      "step": 860
    },
    {
      "epoch": 0.5568,
      "grad_norm": 1.7421875,
      "learning_rate": 2.471119198073946e-05,
      "loss": 1.139,
      "step": 870
    },
    {
      "epoch": 0.5632,
      "grad_norm": 2.5625,
      "learning_rate": 2.4595365829346638e-05,
      "loss": 1.0655,
      "step": 880
    },
    {
      "epoch": 0.5696,
      "grad_norm": 2.65625,
      "learning_rate": 2.447856306971775e-05,
      "loss": 1.0564,
      "step": 890
    },
    {
      "epoch": 0.576,
      "grad_norm": 2.9375,
      "learning_rate": 2.436079558993909e-05,
      "loss": 1.1195,
      "step": 900
    },
    {
      "epoch": 0.5824,
      "grad_norm": 2.921875,
      "learning_rate": 2.424207537628537e-05,
      "loss": 1.0753,
      "step": 910
    },
    {
      "epoch": 0.5888,
      "grad_norm": 1.7421875,
      "learning_rate": 2.4122414511999727e-05,
      "loss": 1.1333,
      "step": 920
    },
    {
      "epoch": 0.5952,
      "grad_norm": 2.46875,
      "learning_rate": 2.4001825176063943e-05,
      "loss": 1.0526,
      "step": 930
    },
    {
      "epoch": 0.6016,
      "grad_norm": 2.15625,
      "learning_rate": 2.3880319641958862e-05,
      "loss": 1.0355,
      "step": 940
    },
    {
      "epoch": 0.608,
      "grad_norm": 2.171875,
      "learning_rate": 2.375791027641521e-05,
      "loss": 1.1634,
      "step": 950
    },
    {
      "epoch": 0.6144,
      "grad_norm": 2.1875,
      "learning_rate": 2.3634609538154882e-05,
      "loss": 1.0915,
      "step": 960
    },
    {
      "epoch": 0.6208,
      "grad_norm": 2.203125,
      "learning_rate": 2.3510429976622975e-05,
      "loss": 1.0805,
      "step": 970
    },
    {
      "epoch": 0.6272,
      "grad_norm": 1.9375,
      "learning_rate": 2.338538423071045e-05,
      "loss": 1.0966,
      "step": 980
    },
    {
      "epoch": 0.6336,
      "grad_norm": 1.765625,
      "learning_rate": 2.32594850274678e-05,
      "loss": 1.0045,
      "step": 990
    },
    {
      "epoch": 0.64,
      "grad_norm": 2.890625,
      "learning_rate": 2.3132745180809675e-05,
      "loss": 1.0359,
      "step": 1000
    },
    {
      "epoch": 0.6464,
      "grad_norm": 1.8984375,
      "learning_rate": 2.3005177590210713e-05,
      "loss": 1.1221,
      "step": 1010
    },
    {
      "epoch": 0.6528,
      "grad_norm": 1.828125,
      "learning_rate": 2.287679523939261e-05,
      "loss": 1.1291,
      "step": 1020
    },
    {
      "epoch": 0.6592,
      "grad_norm": 2.046875,
      "learning_rate": 2.2747611195002687e-05,
      "loss": 1.0904,
      "step": 1030
    },
    {
      "epoch": 0.6656,
      "grad_norm": 1.890625,
      "learning_rate": 2.2617638605283966e-05,
      "loss": 1.1363,
      "step": 1040
    },
    {
      "epoch": 0.672,
      "grad_norm": 2.046875,
      "learning_rate": 2.248689069873693e-05,
      "loss": 1.1025,
      "step": 1050
    },
    {
      "epoch": 0.6784,
      "grad_norm": 2.75,
      "learning_rate": 2.235538078277317e-05,
      "loss": 1.096,
      "step": 1060
    },
    {
      "epoch": 0.6848,
      "grad_norm": 1.8515625,
      "learning_rate": 2.222312224236093e-05,
      "loss": 1.084,
      "step": 1070
    },
    {
      "epoch": 0.6912,
      "grad_norm": 2.296875,
      "learning_rate": 2.2090128538662844e-05,
      "loss": 1.1673,
      "step": 1080
    },
    {
      "epoch": 0.6976,
      "grad_norm": 2.328125,
      "learning_rate": 2.1956413207665835e-05,
      "loss": 1.1464,
      "step": 1090
    },
    {
      "epoch": 0.704,
      "grad_norm": 2.34375,
      "learning_rate": 2.1821989858803438e-05,
      "loss": 1.0856,
      "step": 1100
    },
    {
      "epoch": 0.7104,
      "grad_norm": 1.59375,
      "learning_rate": 2.1686872173570655e-05,
      "loss": 1.1303,
      "step": 1110
    },
    {
      "epoch": 0.7168,
      "grad_norm": 1.9296875,
      "learning_rate": 2.1551073904131452e-05,
      "loss": 1.1097,
      "step": 1120
    },
    {
      "epoch": 0.7232,
      "grad_norm": 2.59375,
      "learning_rate": 2.141460887191909e-05,
      "loss": 1.1261,
      "step": 1130
    },
    {
      "epoch": 0.7296,
      "grad_norm": 1.9921875,
      "learning_rate": 2.1277490966229375e-05,
      "loss": 1.1666,
      "step": 1140
    },
    {
      "epoch": 0.736,
      "grad_norm": 2.078125,
      "learning_rate": 2.1139734142807027e-05,
      "loss": 1.1462,
      "step": 1150
    },
    {
      "epoch": 0.7424,
      "grad_norm": 2.234375,
      "learning_rate": 2.100135242242528e-05,
      "loss": 1.0577,
      "step": 1160
    },
    {
      "epoch": 0.7488,
      "grad_norm": 1.9765625,
      "learning_rate": 2.0862359889458835e-05,
      "loss": 1.1435,
      "step": 1170
    },
    {
      "epoch": 0.7552,
      "grad_norm": 1.640625,
      "learning_rate": 2.07227706904504e-05,
      "loss": 1.101,
      "step": 1180
    },
    {
      "epoch": 0.7616,
      "grad_norm": 2.59375,
      "learning_rate": 2.0582599032670846e-05,
      "loss": 1.0775,
      "step": 1190
    },
    {
      "epoch": 0.768,
      "grad_norm": 1.828125,
      "learning_rate": 2.0441859182673195e-05,
      "loss": 1.0275,
      "step": 1200
    },
    {
      "epoch": 0.7744,
      "grad_norm": 1.9296875,
      "learning_rate": 2.0300565464840613e-05,
      "loss": 1.1269,
      "step": 1210
    },
    {
      "epoch": 0.7808,
      "grad_norm": 1.7734375,
      "learning_rate": 2.0158732259928453e-05,
      "loss": 0.993,
      "step": 1220
    },
    {
      "epoch": 0.7872,
      "grad_norm": 2.015625,
      "learning_rate": 2.0016374003600604e-05,
      "loss": 1.126,
      "step": 1230
    },
    {
      "epoch": 0.7936,
      "grad_norm": 2.234375,
      "learning_rate": 1.9873505184960255e-05,
      "loss": 1.0847,
      "step": 1240
    },
    {
      "epoch": 0.8,
      "grad_norm": 2.125,
      "learning_rate": 1.9730140345075187e-05,
      "loss": 1.0819,
      "step": 1250
    },
    {
      "epoch": 0.8064,
      "grad_norm": 1.9375,
      "learning_rate": 1.9586294075497825e-05,
      "loss": 1.0683,
      "step": 1260
    },
    {
      "epoch": 0.8128,
      "grad_norm": 1.8046875,
      "learning_rate": 1.9441981016780074e-05,
      "loss": 1.1951,
      "step": 1270
    },
    {
      "epoch": 0.8192,
      "grad_norm": 1.7578125,
      "learning_rate": 1.929721585698326e-05,
      "loss": 1.0735,
      "step": 1280
    },
    {
      "epoch": 0.8256,
      "grad_norm": 2.421875,
      "learning_rate": 1.9152013330183175e-05,
      "loss": 1.0937,
      "step": 1290
    },
    {
      "epoch": 0.832,
      "grad_norm": 1.78125,
      "learning_rate": 1.9006388214970442e-05,
      "loss": 1.1384,
      "step": 1300
    },
    {
      "epoch": 0.8384,
      "grad_norm": 1.84375,
      "learning_rate": 1.8860355332946405e-05,
      "loss": 1.0922,
      "step": 1310
    },
    {
      "epoch": 0.8448,
      "grad_norm": 1.890625,
      "learning_rate": 1.871392954721454e-05,
      "loss": 1.1578,
      "step": 1320
    },
    {
      "epoch": 0.8512,
      "grad_norm": 1.84375,
      "learning_rate": 1.856712576086774e-05,
      "loss": 1.1425,
      "step": 1330
    },
    {
      "epoch": 0.8576,
      "grad_norm": 2.21875,
      "learning_rate": 1.8419958915471483e-05,
      "loss": 1.0089,
      "step": 1340
    },
    {
      "epoch": 0.864,
      "grad_norm": 1.8828125,
      "learning_rate": 1.8272443989543102e-05,
      "loss": 1.1178,
      "step": 1350
    },
    {
      "epoch": 0.8704,
      "grad_norm": 1.7890625,
      "learning_rate": 1.8124595997027255e-05,
      "loss": 1.1146,
      "step": 1360
    },
    {
      "epoch": 0.8768,
      "grad_norm": 2.15625,
      "learning_rate": 1.7976429985767842e-05,
      "loss": 1.1538,
      "step": 1370
    },
    {
      "epoch": 0.8832,
      "grad_norm": 1.9140625,
      "learning_rate": 1.782796103597645e-05,
      "loss": 1.1488,
      "step": 1380
    },
    {
      "epoch": 0.8896,
      "grad_norm": 2.1875,
      "learning_rate": 1.7679204258697482e-05,
      "loss": 1.0986,
      "step": 1390
    },
    {
      "epoch": 0.896,
      "grad_norm": 2.328125,
      "learning_rate": 1.7530174794270187e-05,
      "loss": 1.1119,
      "step": 1400
    },
    {
      "epoch": 0.9024,
      "grad_norm": 2.5625,
      "learning_rate": 1.738088781078768e-05,
      "loss": 1.0861,
      "step": 1410
    },
    {
      "epoch": 0.9088,
      "grad_norm": 1.84375,
      "learning_rate": 1.723135850255315e-05,
      "loss": 1.0417,
      "step": 1420
    },
    {
      "epoch": 0.9152,
      "grad_norm": 1.78125,
      "learning_rate": 1.7081602088533396e-05,
      "loss": 1.0948,
      "step": 1430
    },
    {
      "epoch": 0.9216,
      "grad_norm": 1.984375,
      "learning_rate": 1.693163381080984e-05,
      "loss": 1.1347,
      "step": 1440
    },
    {
      "epoch": 0.928,
      "grad_norm": 1.6796875,
      "learning_rate": 1.6781468933027233e-05,
      "loss": 1.0932,
      "step": 1450
    },
    {
      "epoch": 0.9344,
      "grad_norm": 1.96875,
      "learning_rate": 1.6631122738840098e-05,
      "loss": 1.1218,
      "step": 1460
    },
    {
      "epoch": 0.9408,
      "grad_norm": 2.671875,
      "learning_rate": 1.6480610530357198e-05,
      "loss": 1.0991,
      "step": 1470
    },
    {
      "epoch": 0.9472,
      "grad_norm": 1.984375,
      "learning_rate": 1.6329947626584088e-05,
      "loss": 1.0969,
      "step": 1480
    },
    {
      "epoch": 0.9536,
      "grad_norm": 1.75,
      "learning_rate": 1.617914936186395e-05,
      "loss": 1.0271,
      "step": 1490
    },
    {
      "epoch": 0.96,
      "grad_norm": 1.890625,
      "learning_rate": 1.6028231084316914e-05,
      "loss": 1.0578,
      "step": 1500
    },
    {
      "epoch": 0.9664,
      "grad_norm": 2.421875,
      "learning_rate": 1.5877208154277893e-05,
      "loss": 1.033,
      "step": 1510
    },
    {
      "epoch": 0.9728,
      "grad_norm": 1.8515625,
      "learning_rate": 1.5726095942733252e-05,
      "loss": 1.0599,
      "step": 1520
    },
    {
      "epoch": 0.9792,
      "grad_norm": 3.4375,
      "learning_rate": 1.5574909829756355e-05,
      "loss": 1.1695,
      "step": 1530
    },
    {
      "epoch": 0.9856,
      "grad_norm": 2.28125,
      "learning_rate": 1.5423665202942176e-05,
      "loss": 1.0135,
      "step": 1540
    },
    {
      "epoch": 0.992,
      "grad_norm": 2.03125,
      "learning_rate": 1.5272377455841203e-05,
      "loss": 1.0969,
      "step": 1550
    },
    {
      "epoch": 0.9984,
      "grad_norm": 2.09375,
      "learning_rate": 1.5121061986392642e-05,
      "loss": 1.1522,
      "step": 1560
    },
    {
      "epoch": 1.00448,
      "grad_norm": 2.6875,
      "learning_rate": 1.4969734195357282e-05,
      "loss": 1.0479,
      "step": 1570
    },
    {
      "epoch": 1.01088,
      "grad_norm": 1.78125,
      "learning_rate": 1.481840948474997e-05,
      "loss": 1.01,
      "step": 1580
    },
    {
      "epoch": 1.01728,
      "grad_norm": 1.7109375,
      "learning_rate": 1.466710325627205e-05,
      "loss": 1.0202,
      "step": 1590
    },
    {
      "epoch": 1.02368,
      "grad_norm": 1.984375,
      "learning_rate": 1.4515830909743756e-05,
      "loss": 1.0148,
      "step": 1600
    },
    {
      "epoch": 1.03008,
      "grad_norm": 1.8125,
      "learning_rate": 1.436460784153685e-05,
      "loss": 0.8966,
      "step": 1610
    },
    {
      "epoch": 1.03648,
      "grad_norm": 2.15625,
      "learning_rate": 1.4213449443007595e-05,
      "loss": 0.9647,
      "step": 1620
    },
    {
      "epoch": 1.04288,
      "grad_norm": 2.140625,
      "learning_rate": 1.4062371098930216e-05,
      "loss": 0.9537,
      "step": 1630
    },
    {
      "epoch": 1.04928,
      "grad_norm": 1.890625,
      "learning_rate": 1.3911388185931075e-05,
      "loss": 0.95,
      "step": 1640
    },
    {
      "epoch": 1.05568,
      "grad_norm": 1.8828125,
      "learning_rate": 1.3760516070923629e-05,
      "loss": 1.0116,
      "step": 1650
    },
    {
      "epoch": 1.06208,
      "grad_norm": 2.296875,
      "learning_rate": 1.360977010954442e-05,
      "loss": 1.01,
      "step": 1660
    },
    {
      "epoch": 1.06848,
      "grad_norm": 1.8203125,
      "learning_rate": 1.3459165644590173e-05,
      "loss": 1.0077,
      "step": 1670
    },
    {
      "epoch": 1.07488,
      "grad_norm": 1.5546875,
      "learning_rate": 1.3308718004456223e-05,
      "loss": 1.0607,
      "step": 1680
    },
    {
      "epoch": 1.08128,
      "grad_norm": 1.84375,
      "learning_rate": 1.3158442501576419e-05,
      "loss": 0.9339,
      "step": 1690
    },
    {
      "epoch": 1.08768,
      "grad_norm": 2.296875,
      "learning_rate": 1.3008354430864615e-05,
      "loss": 0.9697,
      "step": 1700
    },
    {
      "epoch": 1.09408,
      "grad_norm": 2.5,
      "learning_rate": 1.2858469068157982e-05,
      "loss": 0.9857,
      "step": 1710
    },
    {
      "epoch": 1.10048,
      "grad_norm": 1.671875,
      "learning_rate": 1.270880166866224e-05,
      "loss": 1.0362,
      "step": 1720
    },
    {
      "epoch": 1.10688,
      "grad_norm": 2.125,
      "learning_rate": 1.2559367465398994e-05,
      "loss": 1.0572,
      "step": 1730
    },
    {
      "epoch": 1.11328,
      "grad_norm": 2.265625,
      "learning_rate": 1.2410181667655342e-05,
      "loss": 1.0424,
      "step": 1740
    },
    {
      "epoch": 1.11968,
      "grad_norm": 1.921875,
      "learning_rate": 1.2261259459435866e-05,
      "loss": 0.9826,
      "step": 1750
    },
    {
      "epoch": 1.12608,
      "grad_norm": 1.7578125,
      "learning_rate": 1.2112615997917243e-05,
      "loss": 1.06,
      "step": 1760
    },
    {
      "epoch": 1.13248,
      "grad_norm": 1.6953125,
      "learning_rate": 1.1964266411905538e-05,
      "loss": 0.9296,
      "step": 1770
    },
    {
      "epoch": 1.13888,
      "grad_norm": 2.609375,
      "learning_rate": 1.1816225800296402e-05,
      "loss": 0.9885,
      "step": 1780
    },
    {
      "epoch": 1.14528,
      "grad_norm": 2.0625,
      "learning_rate": 1.166850923053836e-05,
      "loss": 1.0868,
      "step": 1790
    },
    {
      "epoch": 1.15168,
      "grad_norm": 1.7265625,
      "learning_rate": 1.1521131737099206e-05,
      "loss": 1.0495,
      "step": 1800
    },
    {
      "epoch": 1.15808,
      "grad_norm": 1.8828125,
      "learning_rate": 1.1374108319935852e-05,
      "loss": 0.9584,
      "step": 1810
    },
    {
      "epoch": 1.16448,
      "grad_norm": 2.0625,
      "learning_rate": 1.1227453942967615e-05,
      "loss": 0.9654,
      "step": 1820
    },
    {
      "epoch": 1.17088,
      "grad_norm": 1.6796875,
      "learning_rate": 1.1081183532553197e-05,
      "loss": 1.039,
      "step": 1830
    },
    {
      "epoch": 1.17728,
      "grad_norm": 1.75,
      "learning_rate": 1.0935311975971545e-05,
      "loss": 1.0529,
      "step": 1840
    },
    {
      "epoch": 1.18368,
      "grad_norm": 1.59375,
      "learning_rate": 1.0789854119906566e-05,
      "loss": 0.9898,
      "step": 1850
    },
    {
      "epoch": 1.19008,
      "grad_norm": 2.65625,
      "learning_rate": 1.06448247689361e-05,
      "loss": 0.9998,
      "step": 1860
    },
    {
      "epoch": 1.19648,
      "grad_norm": 1.9453125,
      "learning_rate": 1.050023868402509e-05,
      "loss": 0.9979,
      "step": 1870
    },
    {
      "epoch": 1.20288,
      "grad_norm": 2.078125,
      "learning_rate": 1.035611058102322e-05,
      "loss": 0.9595,
      "step": 1880
    },
    {
      "epoch": 1.20928,
      "grad_norm": 1.6953125,
      "learning_rate": 1.0212455129167196e-05,
      "loss": 1.0216,
      "step": 1890
    },
    {
      "epoch": 1.21568,
      "grad_norm": 3.015625,
      "learning_rate": 1.0069286949587653e-05,
      "loss": 0.9082,
      "step": 1900
    },
    {
      "epoch": 1.22208,
      "grad_norm": 2.0625,
      "learning_rate": 9.926620613821094e-06,
      "loss": 0.9402,
      "step": 1910
    },
    {
      "epoch": 1.22848,
      "grad_norm": 3.015625,
      "learning_rate": 9.784470642326767e-06,
      "loss": 1.0484,
      "step": 1920
    },
    {
      "epoch": 1.23488,
      "grad_norm": 2.359375,
      "learning_rate": 9.642851503008788e-06,
      "loss": 1.0093,
      "step": 1930
    },
    {
      "epoch": 1.24128,
      "grad_norm": 1.6640625,
      "learning_rate": 9.50177760974366e-06,
      "loss": 0.9979,
      "step": 1940
    },
    {
      "epoch": 1.24768,
      "grad_norm": 2.265625,
      "learning_rate": 9.361263320913159e-06,
      "loss": 0.957,
      "step": 1950
    },
    {
      "epoch": 1.25408,
      "grad_norm": 2.140625,
      "learning_rate": 9.22132293794303e-06,
      "loss": 1.0589,
      "step": 1960
    },
    {
      "epoch": 1.26048,
      "grad_norm": 1.96875,
      "learning_rate": 9.081970703847344e-06,
      "loss": 1.0065,
      "step": 1970
    },
    {
      "epoch": 1.26688,
      "grad_norm": 2.421875,
      "learning_rate": 8.943220801778871e-06,
      "loss": 0.9914,
      "step": 1980
    },
    {
      "epoch": 1.27328,
      "grad_norm": 1.8984375,
      "learning_rate": 8.805087353585561e-06,
      "loss": 0.9532,
      "step": 1990
    },
    {
      "epoch": 1.27968,
      "grad_norm": 2.1875,
      "learning_rate": 8.667584418373177e-06,
      "loss": 0.961,
      "step": 2000
    },
    {
      "epoch": 1.2860800000000001,
      "grad_norm": 1.984375,
      "learning_rate": 8.530725991074432e-06,
      "loss": 0.9762,
      "step": 2010
    },
    {
      "epoch": 1.29248,
      "grad_norm": 2.171875,
      "learning_rate": 8.39452600102455e-06,
      "loss": 1.045,
      "step": 2020
    },
    {
      "epoch": 1.29888,
      "grad_norm": 2.15625,
      "learning_rate": 8.25899831054357e-06,
      "loss": 0.9369,
      "step": 2030
    },
    {
      "epoch": 1.30528,
      "grad_norm": 2.953125,
      "learning_rate": 8.124156713525451e-06,
      "loss": 0.9506,
      "step": 2040
    },
    {
      "epoch": 1.31168,
      "grad_norm": 2.625,
      "learning_rate": 7.990014934034132e-06,
      "loss": 0.9693,
      "step": 2050
    },
    {
      "epoch": 1.31808,
      "grad_norm": 1.703125,
      "learning_rate": 7.856586624906731e-06,
      "loss": 0.9546,
      "step": 2060
    },
    {
      "epoch": 1.3244799999999999,
      "grad_norm": 1.625,
      "learning_rate": 7.723885366363937e-06,
      "loss": 0.9469,
      "step": 2070
    },
    {
      "epoch": 1.33088,
      "grad_norm": 2.390625,
      "learning_rate": 7.591924664627853e-06,
      "loss": 1.0108,
      "step": 2080
    },
    {
      "epoch": 1.33728,
      "grad_norm": 2.203125,
      "learning_rate": 7.460717950547333e-06,
      "loss": 0.9485,
      "step": 2090
    },
    {
      "epoch": 1.34368,
      "grad_norm": 2.875,
      "learning_rate": 7.330278578230995e-06,
      "loss": 0.9319,
      "step": 2100
    },
    {
      "epoch": 1.35008,
      "grad_norm": 2.453125,
      "learning_rate": 7.200619823688085e-06,
      "loss": 0.9649,
      "step": 2110
    },
    {
      "epoch": 1.35648,
      "grad_norm": 2.140625,
      "learning_rate": 7.071754883477204e-06,
      "loss": 0.8863,
      "step": 2120
    },
    {
      "epoch": 1.36288,
      "grad_norm": 2.53125,
      "learning_rate": 6.943696873363214e-06,
      "loss": 0.9181,
      "step": 2130
    },
    {
      "epoch": 1.36928,
      "grad_norm": 2.65625,
      "learning_rate": 6.816458826982305e-06,
      "loss": 0.9467,
      "step": 2140
    },
    {
      "epoch": 1.37568,
      "grad_norm": 2.046875,
      "learning_rate": 6.690053694515455e-06,
      "loss": 0.9795,
      "step": 2150
    },
    {
      "epoch": 1.38208,
      "grad_norm": 1.7578125,
      "learning_rate": 6.564494341370372e-06,
      "loss": 1.011,
      "step": 2160
    },
    {
      "epoch": 1.38848,
      "grad_norm": 2.4375,
      "learning_rate": 6.4397935468720524e-06,
      "loss": 0.9757,
      "step": 2170
    },
    {
      "epoch": 1.3948800000000001,
      "grad_norm": 1.8984375,
      "learning_rate": 6.315964002962113e-06,
      "loss": 1.0207,
      "step": 2180
    },
    {
      "epoch": 1.40128,
      "grad_norm": 3.34375,
      "learning_rate": 6.193018312907032e-06,
      "loss": 0.9519,
      "step": 2190
    },
    {
      "epoch": 1.40768,
      "grad_norm": 1.9609375,
      "learning_rate": 6.070968990015372e-06,
      "loss": 1.0672,
      "step": 2200
    },
    {
      "epoch": 1.41408,
      "grad_norm": 2.421875,
      "learning_rate": 5.949828456364229e-06,
      "loss": 1.023,
      "step": 2210
    },
    {
      "epoch": 1.42048,
      "grad_norm": 2.46875,
      "learning_rate": 5.829609041534874e-06,
      "loss": 1.0135,
      "step": 2220
    },
    {
      "epoch": 1.42688,
      "grad_norm": 2.265625,
      "learning_rate": 5.710322981357886e-06,
      "loss": 0.9879,
      "step": 2230
    },
    {
      "epoch": 1.4332799999999999,
      "grad_norm": 1.75,
      "learning_rate": 5.591982416667802e-06,
      "loss": 1.0821,
      "step": 2240
    },
    {
      "epoch": 1.43968,
      "grad_norm": 2.3125,
      "learning_rate": 5.474599392067413e-06,
      "loss": 1.031,
      "step": 2250
    },
    {
      "epoch": 1.44608,
      "grad_norm": 2.21875,
      "learning_rate": 5.3581858547019095e-06,
      "loss": 1.0214,
      "step": 2260
    },
    {
      "epoch": 1.45248,
      "grad_norm": 2.609375,
      "learning_rate": 5.242753653042872e-06,
      "loss": 0.967,
      "step": 2270
    },
    {
      "epoch": 1.45888,
      "grad_norm": 1.765625,
      "learning_rate": 5.128314535682366e-06,
      "loss": 0.9439,
      "step": 2280
    },
    {
      "epoch": 1.46528,
      "grad_norm": 1.6640625,
      "learning_rate": 5.014880150137175e-06,
      "loss": 0.9451,
      "step": 2290
    },
    {
      "epoch": 1.47168,
      "grad_norm": 2.203125,
      "learning_rate": 4.902462041663321e-06,
      "loss": 1.0129,
      "step": 2300
    },
    {
      "epoch": 1.47808,
      "grad_norm": 2.234375,
      "learning_rate": 4.7910716520810185e-06,
      "loss": 1.0104,
      "step": 2310
    },
    {
      "epoch": 1.48448,
      "grad_norm": 1.6953125,
      "learning_rate": 4.680720318610107e-06,
      "loss": 0.9509,
      "step": 2320
    },
    {
      "epoch": 1.49088,
      "grad_norm": 2.25,
      "learning_rate": 4.571419272716175e-06,
      "loss": 1.0345,
      "step": 2330
    },
    {
      "epoch": 1.49728,
      "grad_norm": 2.390625,
      "learning_rate": 4.463179638967428e-06,
      "loss": 1.0448,
      "step": 2340
    },
    {
      "epoch": 1.5036800000000001,
      "grad_norm": 2.828125,
      "learning_rate": 4.356012433902436e-06,
      "loss": 1.0301,
      "step": 2350
    },
    {
      "epoch": 1.5100799999999999,
      "grad_norm": 1.671875,
      "learning_rate": 4.249928564908896e-06,
      "loss": 1.0551,
      "step": 2360
    },
    {
      "epoch": 1.51648,
      "grad_norm": 2.515625,
      "learning_rate": 4.144938829113458e-06,
      "loss": 1.0107,
      "step": 2370
    },
    {
      "epoch": 1.52288,
      "grad_norm": 2.5625,
      "learning_rate": 4.0410539122828275e-06,
      "loss": 0.9923,
      "step": 2380
    },
    {
      "epoch": 1.52928,
      "grad_norm": 2.09375,
      "learning_rate": 3.938284387736164e-06,
      "loss": 0.9977,
      "step": 2390
    },
    {
      "epoch": 1.5356800000000002,
      "grad_norm": 2.046875,
      "learning_rate": 3.8366407152689374e-06,
      "loss": 1.0079,
      "step": 2400
    },
    {
      "epoch": 1.54208,
      "grad_norm": 1.7890625,
      "learning_rate": 3.7361332400883608e-06,
      "loss": 1.0236,
      "step": 2410
    },
    {
      "epoch": 1.54848,
      "grad_norm": 1.796875,
      "learning_rate": 3.636772191760427e-06,
      "loss": 0.9698,
      "step": 2420
    },
    {
      "epoch": 1.55488,
      "grad_norm": 1.9453125,
      "learning_rate": 3.53856768316878e-06,
      "loss": 1.0645,
      "step": 2430
    },
    {
      "epoch": 1.56128,
      "grad_norm": 2.640625,
      "learning_rate": 3.441529709485424e-06,
      "loss": 0.9716,
      "step": 2440
    },
    {
      "epoch": 1.56768,
      "grad_norm": 2.609375,
      "learning_rate": 3.3456681471534207e-06,
      "loss": 0.9717,
      "step": 2450
    },
    {
      "epoch": 1.57408,
      "grad_norm": 1.953125,
      "learning_rate": 3.2509927528816896e-06,
      "loss": 1.0421,
      "step": 2460
    },
    {
      "epoch": 1.58048,
      "grad_norm": 1.8515625,
      "learning_rate": 3.1575131626519553e-06,
      "loss": 1.0386,
      "step": 2470
    },
    {
      "epoch": 1.5868799999999998,
      "grad_norm": 1.8828125,
      "learning_rate": 3.0652388907380187e-06,
      "loss": 0.9578,
      "step": 2480
    },
    {
      "epoch": 1.59328,
      "grad_norm": 2.515625,
      "learning_rate": 2.9741793287373987e-06,
      "loss": 0.9847,
      "step": 2490
    },
    {
      "epoch": 1.59968,
      "grad_norm": 2.78125,
      "learning_rate": 2.8843437446154625e-06,
      "loss": 1.0303,
      "step": 2500
    },
    {
      "epoch": 1.60608,
      "grad_norm": 2.34375,
      "learning_rate": 2.795741281762152e-06,
      "loss": 0.944,
      "step": 2510
    },
    {
      "epoch": 1.6124800000000001,
      "grad_norm": 1.7734375,
      "learning_rate": 2.708380958061358e-06,
      "loss": 1.0533,
      "step": 2520
    },
    {
      "epoch": 1.6188799999999999,
      "grad_norm": 1.7265625,
      "learning_rate": 2.6222716649730964e-06,
      "loss": 0.9802,
      "step": 2530
    },
    {
      "epoch": 1.62528,
      "grad_norm": 1.75,
      "learning_rate": 2.5374221666285474e-06,
      "loss": 0.9677,
      "step": 2540
    },
    {
      "epoch": 1.63168,
      "grad_norm": 1.859375,
      "learning_rate": 2.4538410989380394e-06,
      "loss": 1.0376,
      "step": 2550
    },
    {
      "epoch": 1.63808,
      "grad_norm": 2.546875,
      "learning_rate": 2.3715369687121103e-06,
      "loss": 0.9554,
      "step": 2560
    },
    {
      "epoch": 1.6444800000000002,
      "grad_norm": 2.609375,
      "learning_rate": 2.2905181527956694e-06,
      "loss": 0.9778,
      "step": 2570
    },
    {
      "epoch": 1.65088,
      "grad_norm": 2.125,
      "learning_rate": 2.2107928972154264e-06,
      "loss": 1.0311,
      "step": 2580
    },
    {
      "epoch": 1.65728,
      "grad_norm": 1.9765625,
      "learning_rate": 2.132369316340608e-06,
      "loss": 0.9612,
      "step": 2590
    },
    {
      "epoch": 1.66368,
      "grad_norm": 2.15625,
      "learning_rate": 2.0552553920570862e-06,
      "loss": 0.9553,
      "step": 2600
    },
    {
      "epoch": 1.67008,
      "grad_norm": 2.609375,
      "learning_rate": 1.9794589729550022e-06,
      "loss": 0.9915,
      "step": 2610
    },
    {
      "epoch": 1.67648,
      "grad_norm": 1.7734375,
      "learning_rate": 1.9049877735299203e-06,
      "loss": 0.9905,
      "step": 2620
    },
    {
      "epoch": 1.68288,
      "grad_norm": 1.8125,
      "learning_rate": 1.8318493733976672e-06,
      "loss": 0.9517,
      "step": 2630
    },
    {
      "epoch": 1.6892800000000001,
      "grad_norm": 1.6484375,
      "learning_rate": 1.760051216522882e-06,
      "loss": 1.0555,
      "step": 2640
    },
    {
      "epoch": 1.6956799999999999,
      "grad_norm": 1.828125,
      "learning_rate": 1.6896006104613759e-06,
      "loss": 0.9605,
      "step": 2650
    },
    {
      "epoch": 1.70208,
      "grad_norm": 1.984375,
      "learning_rate": 1.6205047256163813e-06,
      "loss": 0.9612,
      "step": 2660
    },
    {
      "epoch": 1.70848,
      "grad_norm": 2.578125,
      "learning_rate": 1.5527705945087422e-06,
      "loss": 1.0219,
      "step": 2670
    },
    {
      "epoch": 1.71488,
      "grad_norm": 2.25,
      "learning_rate": 1.4864051110611565e-06,
      "loss": 1.019,
      "step": 2680
    },
    {
      "epoch": 1.7212800000000001,
      "grad_norm": 1.75,
      "learning_rate": 1.4214150298965217e-06,
      "loss": 1.0037,
      "step": 2690
    },
    {
      "epoch": 1.7276799999999999,
      "grad_norm": 2.078125,
      "learning_rate": 1.357806965650447e-06,
      "loss": 0.9484,
      "step": 2700
    },
    {
      "epoch": 1.73408,
      "grad_norm": 1.6796875,
      "learning_rate": 1.2955873922980355e-06,
      "loss": 1.052,
      "step": 2710
    },
    {
      "epoch": 1.74048,
      "grad_norm": 1.953125,
      "learning_rate": 1.2347626424949483e-06,
      "loss": 1.0051,
      "step": 2720
    },
    {
      "epoch": 1.74688,
      "grad_norm": 2.609375,
      "learning_rate": 1.1753389069328862e-06,
      "loss": 0.9812,
      "step": 2730
    },
    {
      "epoch": 1.75328,
      "grad_norm": 2.1875,
      "learning_rate": 1.1173222337095074e-06,
      "loss": 0.9297,
      "step": 2740
    },
    {
      "epoch": 1.75968,
      "grad_norm": 2.078125,
      "learning_rate": 1.0607185277128435e-06,
      "loss": 0.996,
      "step": 2750
    },
    {
      "epoch": 1.76608,
      "grad_norm": 2.078125,
      "learning_rate": 1.0055335500203278e-06,
      "loss": 0.9412,
      "step": 2760
    },
    {
      "epoch": 1.77248,
      "grad_norm": 1.6484375,
      "learning_rate": 9.517729173124095e-07,
      "loss": 1.0159,
      "step": 2770
    },
    {
      "epoch": 1.77888,
      "grad_norm": 1.875,
      "learning_rate": 8.994421013009191e-07,
      "loss": 0.9885,
      "step": 2780
    },
    {
      "epoch": 1.78528,
      "grad_norm": 2.265625,
      "learning_rate": 8.485464281721416e-07,
      "loss": 1.1004,
      "step": 2790
    },
    {
      "epoch": 1.79168,
      "grad_norm": 1.828125,
      "learning_rate": 7.990910780447347e-07,
      "loss": 0.9443,
      "step": 2800
    },
    {
      "epoch": 1.7980800000000001,
      "grad_norm": 2.734375,
      "learning_rate": 7.510810844425009e-07,
      "loss": 0.945,
      "step": 2810
    },
    {
      "epoch": 1.8044799999999999,
      "grad_norm": 2.015625,
      "learning_rate": 7.045213337820633e-07,
      "loss": 1.1081,
      "step": 2820
    },
    {
      "epoch": 1.81088,
      "grad_norm": 2.71875,
      "learning_rate": 6.594165648755562e-07,
      "loss": 1.0107,
      "step": 2830
    },
    {
      "epoch": 1.81728,
      "grad_norm": 2.421875,
      "learning_rate": 6.157713684482896e-07,
      "loss": 1.0491,
      "step": 2840
    },
    {
      "epoch": 1.82368,
      "grad_norm": 1.953125,
      "learning_rate": 5.735901866715243e-07,
      "loss": 0.9906,
      "step": 2850
    },
    {
      "epoch": 1.8300800000000002,
      "grad_norm": 2.234375,
      "learning_rate": 5.328773127103403e-07,
      "loss": 1.0141,
      "step": 2860
    },
    {
      "epoch": 1.83648,
      "grad_norm": 1.671875,
      "learning_rate": 4.936368902866884e-07,
      "loss": 0.9968,
      "step": 2870
    },
    {
      "epoch": 1.84288,
      "grad_norm": 1.7734375,
      "learning_rate": 4.5587291325764667e-07,
      "loss": 0.9854,
      "step": 2880
    },
    {
      "epoch": 1.84928,
      "grad_norm": 1.6484375,
      "learning_rate": 4.195892252089267e-07,
      "loss": 1.0206,
      "step": 2890
    },
    {
      "epoch": 1.85568,
      "grad_norm": 2.3125,
      "learning_rate": 3.8478951906367744e-07,
      "loss": 0.9919,
      "step": 2900
    },
    {
      "epoch": 1.86208,
      "grad_norm": 2.234375,
      "learning_rate": 3.51477336706621e-07,
      "loss": 1.0347,
      "step": 2910
    },
    {
      "epoch": 1.86848,
      "grad_norm": 1.7421875,
      "learning_rate": 3.196560686235611e-07,
      "loss": 0.9982,
      "step": 2920
    },
    {
      "epoch": 1.87488,
      "grad_norm": 2.21875,
      "learning_rate": 2.893289535563093e-07,
      "loss": 0.978,
      "step": 2930
    },
    {
      "epoch": 1.8812799999999998,
      "grad_norm": 2.234375,
      "learning_rate": 2.6049907817304155e-07,
      "loss": 0.9441,
      "step": 2940
    },
    {
      "epoch": 1.88768,
      "grad_norm": 3.03125,
      "learning_rate": 2.331693767541432e-07,
      "loss": 0.9926,
      "step": 2950
    },
    {
      "epoch": 1.89408,
      "grad_norm": 1.6015625,
      "learning_rate": 2.0734263089356276e-07,
      "loss": 0.9828,
      "step": 2960
    },
    {
      "epoch": 1.90048,
      "grad_norm": 2.703125,
      "learning_rate": 1.830214692156934e-07,
      "loss": 1.025,
      "step": 2970
    },
    {
      "epoch": 1.9068800000000001,
      "grad_norm": 1.84375,
      "learning_rate": 1.6020836710784692e-07,
      "loss": 0.9925,
      "step": 2980
    },
    {
      "epoch": 1.9132799999999999,
      "grad_norm": 1.9609375,
      "learning_rate": 1.3890564646830194e-07,
      "loss": 0.9416,
      "step": 2990
    },
    {
      "epoch": 1.91968,
      "grad_norm": 1.921875,
      "learning_rate": 1.1911547546998636e-07,
      "loss": 1.0084,
      "step": 3000
    },
    {
      "epoch": 1.92608,
      "grad_norm": 2.40625,
      "learning_rate": 1.008398683398054e-07,
      "loss": 0.9642,
      "step": 3010
    },
    {
      "epoch": 1.93248,
      "grad_norm": 2.34375,
      "learning_rate": 8.408068515362411e-08,
      "loss": 1.0143,
      "step": 3020
    },
    {
      "epoch": 1.9388800000000002,
      "grad_norm": 2.125,
      "learning_rate": 6.883963164696527e-08,
      "loss": 0.9897,
      "step": 3030
    },
    {
      "epoch": 1.94528,
      "grad_norm": 2.109375,
      "learning_rate": 5.5118259041391784e-08,
      "loss": 0.9477,
      "step": 3040
    },
    {
      "epoch": 1.95168,
      "grad_norm": 1.9296875,
      "learning_rate": 4.291796388662949e-08,
      "loss": 1.0092,
      "step": 3050
    },
    {
      "epoch": 1.95808,
      "grad_norm": 2.046875,
      "learning_rate": 3.223998791842098e-08,
      "loss": 1.0761,
      "step": 3060
    },
    {
      "epoch": 1.96448,
      "grad_norm": 2.859375,
      "learning_rate": 2.3085417932148246e-08,
      "loss": 1.009,
      "step": 3070
    },
    {
      "epoch": 1.97088,
      "grad_norm": 1.734375,
      "learning_rate": 1.5455185672219595e-08,
      "loss": 0.9905,
      "step": 3080
    },
    {
      "epoch": 1.97728,
      "grad_norm": 2.34375,
      "learning_rate": 9.350067737233791e-09,
      "loss": 1.0572,
      "step": 3090
    },
    {
      "epoch": 1.98368,
      "grad_norm": 2.609375,
      "learning_rate": 4.7706855009416186e-09,
      "loss": 0.9107,
      "step": 3100
    },
    {
      "epoch": 1.9900799999999998,
      "grad_norm": 2.21875,
      "learning_rate": 1.717505049003143e-09,
      "loss": 1.0182,
      "step": 3110
    },
    {
      "epoch": 1.99648,
      "grad_norm": 1.9609375,
      "learning_rate": 1.9083713154732785e-10,
      "loss": 0.9777,
      "step": 3120
    }
  ],
  "logging_steps": 10,
  "max_steps": 3124,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 2,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 2.9759783028737065e+18,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}