{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 5.0, "eval_steps": 500, "global_step": 445, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.056617126680820945, "grad_norm": 9.345914802036008, "learning_rate": 3.555555555555556e-06, "loss": 1.6466, "step": 5 }, { "epoch": 0.11323425336164189, "grad_norm": 1.4064303624435393, "learning_rate": 8.000000000000001e-06, "loss": 1.4509, "step": 10 }, { "epoch": 0.16985138004246284, "grad_norm": 1.032369433205734, "learning_rate": 1.2444444444444446e-05, "loss": 1.3033, "step": 15 }, { "epoch": 0.22646850672328378, "grad_norm": 0.628468478208374, "learning_rate": 1.688888888888889e-05, "loss": 1.1903, "step": 20 }, { "epoch": 0.28308563340410475, "grad_norm": 0.5195360553152225, "learning_rate": 2.1333333333333335e-05, "loss": 1.158, "step": 25 }, { "epoch": 0.33970276008492567, "grad_norm": 0.4036705364419785, "learning_rate": 2.577777777777778e-05, "loss": 1.1391, "step": 30 }, { "epoch": 0.39631988676574664, "grad_norm": 0.3608252286544406, "learning_rate": 3.0222222222222225e-05, "loss": 1.1003, "step": 35 }, { "epoch": 0.45293701344656756, "grad_norm": 0.3983488339178684, "learning_rate": 3.466666666666667e-05, "loss": 1.1105, "step": 40 }, { "epoch": 0.5095541401273885, "grad_norm": 0.4214634639729354, "learning_rate": 3.9111111111111115e-05, "loss": 1.0798, "step": 45 }, { "epoch": 0.5661712668082095, "grad_norm": 0.3709760572158285, "learning_rate": 3.9990131207314634e-05, "loss": 1.0626, "step": 50 }, { "epoch": 0.6227883934890305, "grad_norm": 0.34629784433771854, "learning_rate": 3.995005592832541e-05, "loss": 1.0209, "step": 55 }, { "epoch": 0.6794055201698513, "grad_norm": 0.3360454606085404, "learning_rate": 3.98792191091036e-05, "loss": 1.0534, "step": 60 }, { "epoch": 0.7360226468506723, "grad_norm": 0.3500842170697645, "learning_rate": 3.97777299748901e-05, "loss": 1.0347, "step": 65 }, { "epoch": 0.7926397735314933, "grad_norm": 0.32535259833523167, "learning_rate": 3.964574501457378e-05, "loss": 1.0346, "step": 70 }, { "epoch": 0.8492569002123143, "grad_norm": 0.38007371928778144, "learning_rate": 3.948346773939699e-05, "loss": 1.0339, "step": 75 }, { "epoch": 0.9058740268931351, "grad_norm": 0.32408307517527646, "learning_rate": 3.9291148369155964e-05, "loss": 1.0467, "step": 80 }, { "epoch": 0.9624911535739561, "grad_norm": 0.33959050637050797, "learning_rate": 3.906908344638002e-05, "loss": 1.0389, "step": 85 }, { "epoch": 1.0113234253361643, "grad_norm": 0.7280037640897925, "learning_rate": 3.8817615379084514e-05, "loss": 0.9739, "step": 90 }, { "epoch": 1.0679405520169851, "grad_norm": 0.44724626906705445, "learning_rate": 3.853713191280242e-05, "loss": 0.9129, "step": 95 }, { "epoch": 1.124557678697806, "grad_norm": 0.38870656597286274, "learning_rate": 3.8228065532708905e-05, "loss": 0.8967, "step": 100 }, { "epoch": 1.181174805378627, "grad_norm": 0.3235345462787965, "learning_rate": 3.78908927967605e-05, "loss": 0.8757, "step": 105 }, { "epoch": 1.237791932059448, "grad_norm": 0.3660561663547892, "learning_rate": 3.7526133600877275e-05, "loss": 0.8733, "step": 110 }, { "epoch": 1.294409058740269, "grad_norm": 0.34757796373477073, "learning_rate": 3.7134350377301e-05, "loss": 0.8918, "step": 115 }, { "epoch": 1.3510261854210899, "grad_norm": 0.40142101730391505, "learning_rate": 3.671614722736541e-05, "loss": 0.8499, "step": 120 }, { "epoch": 1.4076433121019107, "grad_norm": 0.3697004670926489, "learning_rate": 3.627216899001575e-05, "loss": 0.8786, "step": 125 }, { "epoch": 1.4642604387827318, "grad_norm": 0.3590589590627629, "learning_rate": 3.580310024751381e-05, "loss": 0.8676, "step": 130 }, { "epoch": 1.5208775654635527, "grad_norm": 0.3216289514493238, "learning_rate": 3.530966426986177e-05, "loss": 0.8709, "step": 135 }, { "epoch": 1.5774946921443735, "grad_norm": 0.342086453518653, "learning_rate": 3.47926218995722e-05, "loss": 0.8808, "step": 140 }, { "epoch": 1.6341118188251946, "grad_norm": 0.33735873679395056, "learning_rate": 3.425277037850411e-05, "loss": 0.8786, "step": 145 }, { "epoch": 1.6907289455060157, "grad_norm": 0.3266369058468826, "learning_rate": 3.369094211857378e-05, "loss": 0.8836, "step": 150 }, { "epoch": 1.7473460721868364, "grad_norm": 0.33354982207230655, "learning_rate": 3.310800341823588e-05, "loss": 0.8932, "step": 155 }, { "epoch": 1.8039631988676574, "grad_norm": 0.37884881926456787, "learning_rate": 3.250485312671411e-05, "loss": 0.8775, "step": 160 }, { "epoch": 1.8605803255484785, "grad_norm": 0.3199962873732413, "learning_rate": 3.188242125804078e-05, "loss": 0.8764, "step": 165 }, { "epoch": 1.9171974522292994, "grad_norm": 0.33394262511510897, "learning_rate": 3.124166755704261e-05, "loss": 0.8533, "step": 170 }, { "epoch": 1.9738145789101202, "grad_norm": 0.32517398854793483, "learning_rate": 3.058358001948381e-05, "loss": 0.8666, "step": 175 }, { "epoch": 2.0226468506723285, "grad_norm": 0.7033789518091728, "learning_rate": 2.9909173368648154e-05, "loss": 0.8138, "step": 180 }, { "epoch": 2.079263977353149, "grad_norm": 0.4334348051751711, "learning_rate": 2.921948749070925e-05, "loss": 0.692, "step": 185 }, { "epoch": 2.1358811040339702, "grad_norm": 0.360965781029978, "learning_rate": 2.8515585831301456e-05, "loss": 0.693, "step": 190 }, { "epoch": 2.1924982307147913, "grad_norm": 0.38547350455159185, "learning_rate": 2.7798553755763768e-05, "loss": 0.6868, "step": 195 }, { "epoch": 2.249115357395612, "grad_norm": 0.3931880052281354, "learning_rate": 2.7069496875585145e-05, "loss": 0.6739, "step": 200 }, { "epoch": 2.305732484076433, "grad_norm": 0.3884784347387114, "learning_rate": 2.6329539343631725e-05, "loss": 0.7261, "step": 205 }, { "epoch": 2.362349610757254, "grad_norm": 0.3790375769004355, "learning_rate": 2.557982212078459e-05, "loss": 0.6769, "step": 210 }, { "epoch": 2.418966737438075, "grad_norm": 0.444577993172877, "learning_rate": 2.4821501216660778e-05, "loss": 0.66, "step": 215 }, { "epoch": 2.475583864118896, "grad_norm": 0.39931903647400696, "learning_rate": 2.405574590713025e-05, "loss": 0.6698, "step": 220 }, { "epoch": 2.532200990799717, "grad_norm": 0.3768609234065524, "learning_rate": 2.328373693137726e-05, "loss": 0.6692, "step": 225 }, { "epoch": 2.588818117480538, "grad_norm": 0.3776714663088506, "learning_rate": 2.2506664671286087e-05, "loss": 0.6834, "step": 230 }, { "epoch": 2.6454352441613587, "grad_norm": 0.39731405215740095, "learning_rate": 2.1725727315958473e-05, "loss": 0.6939, "step": 235 }, { "epoch": 2.7020523708421798, "grad_norm": 0.38550352150548167, "learning_rate": 2.0942129014192854e-05, "loss": 0.6717, "step": 240 }, { "epoch": 2.758669497523001, "grad_norm": 0.3747659422492226, "learning_rate": 2.0157078017774228e-05, "loss": 0.6854, "step": 245 }, { "epoch": 2.8152866242038215, "grad_norm": 0.3637528547775645, "learning_rate": 1.9371784818437436e-05, "loss": 0.6776, "step": 250 }, { "epoch": 2.8719037508846426, "grad_norm": 0.3637325673769254, "learning_rate": 1.8587460281376673e-05, "loss": 0.6709, "step": 255 }, { "epoch": 2.9285208775654636, "grad_norm": 0.36039226577068273, "learning_rate": 1.7805313778179095e-05, "loss": 0.6871, "step": 260 }, { "epoch": 2.9851380042462843, "grad_norm": 0.40362246776091015, "learning_rate": 1.702655132206154e-05, "loss": 0.699, "step": 265 }, { "epoch": 3.0339702760084926, "grad_norm": 0.6631696971565146, "learning_rate": 1.6252373708285505e-05, "loss": 0.6092, "step": 270 }, { "epoch": 3.0905874026893136, "grad_norm": 0.8389362601790664, "learning_rate": 1.548397466261793e-05, "loss": 0.5176, "step": 275 }, { "epoch": 3.1472045293701343, "grad_norm": 0.589268360466198, "learning_rate": 1.4722539000692548e-05, "loss": 0.5212, "step": 280 }, { "epoch": 3.2038216560509554, "grad_norm": 0.43915442527669857, "learning_rate": 1.3969240801110088e-05, "loss": 0.5016, "step": 285 }, { "epoch": 3.2604387827317765, "grad_norm": 0.4070781926669548, "learning_rate": 1.3225241595094173e-05, "loss": 0.5126, "step": 290 }, { "epoch": 3.3170559094125975, "grad_norm": 0.43081414786416466, "learning_rate": 1.2491688575494337e-05, "loss": 0.5159, "step": 295 }, { "epoch": 3.373673036093418, "grad_norm": 0.4088148942355722, "learning_rate": 1.1769712827897825e-05, "loss": 0.4991, "step": 300 }, { "epoch": 3.4302901627742393, "grad_norm": 0.4118948241938903, "learning_rate": 1.106042758657758e-05, "loss": 0.5254, "step": 305 }, { "epoch": 3.4869072894550603, "grad_norm": 0.3865644301037302, "learning_rate": 1.0364926517965693e-05, "loss": 0.5241, "step": 310 }, { "epoch": 3.543524416135881, "grad_norm": 0.3854752948085822, "learning_rate": 9.684282034299053e-06, "loss": 0.5157, "step": 315 }, { "epoch": 3.600141542816702, "grad_norm": 0.407604576350077, "learning_rate": 9.019543640037363e-06, "loss": 0.5156, "step": 320 }, { "epoch": 3.656758669497523, "grad_norm": 0.4638718605403921, "learning_rate": 8.37173631360339e-06, "loss": 0.5289, "step": 325 }, { "epoch": 3.713375796178344, "grad_norm": 0.3931730718525121, "learning_rate": 7.741858926940475e-06, "loss": 0.5188, "step": 330 }, { "epoch": 3.769992922859165, "grad_norm": 0.38253451527183935, "learning_rate": 7.130882705324422e-06, "loss": 0.5207, "step": 335 }, { "epoch": 3.826610049539986, "grad_norm": 0.38648720259429115, "learning_rate": 6.539749729804539e-06, "loss": 0.4995, "step": 340 }, { "epoch": 3.8832271762208066, "grad_norm": 0.38272964362705975, "learning_rate": 5.969371484582887e-06, "loss": 0.5127, "step": 345 }, { "epoch": 3.9398443029016277, "grad_norm": 0.49283524260007355, "learning_rate": 5.4206274515717735e-06, "loss": 0.5265, "step": 350 }, { "epoch": 3.9964614295824488, "grad_norm": 0.38126560684002103, "learning_rate": 4.89436375429633e-06, "loss": 0.51, "step": 355 }, { "epoch": 4.045293701344657, "grad_norm": 0.4987038179586576, "learning_rate": 4.391391853233404e-06, "loss": 0.4291, "step": 360 }, { "epoch": 4.101910828025478, "grad_norm": 0.6611623093102118, "learning_rate": 3.91248729459831e-06, "loss": 0.4191, "step": 365 }, { "epoch": 4.158527954706298, "grad_norm": 0.4432670456926395, "learning_rate": 3.4583885145087613e-06, "loss": 0.3965, "step": 370 }, { "epoch": 4.21514508138712, "grad_norm": 0.39526501856399826, "learning_rate": 3.0297957003699284e-06, "loss": 0.3958, "step": 375 }, { "epoch": 4.2717622080679405, "grad_norm": 0.4240316741877897, "learning_rate": 2.6273697112361786e-06, "loss": 0.4148, "step": 380 }, { "epoch": 4.328379334748761, "grad_norm": 0.38954657574403195, "learning_rate": 2.2517310588143372e-06, "loss": 0.4139, "step": 385 }, { "epoch": 4.384996461429583, "grad_norm": 0.37671075657493147, "learning_rate": 1.903458950679613e-06, "loss": 0.4046, "step": 390 }, { "epoch": 4.441613588110403, "grad_norm": 0.36250832068930156, "learning_rate": 1.5830903971794765e-06, "loss": 0.4209, "step": 395 }, { "epoch": 4.498230714791224, "grad_norm": 0.40212560342113157, "learning_rate": 1.2911193834026548e-06, "loss": 0.405, "step": 400 }, { "epoch": 4.5548478414720455, "grad_norm": 0.39658771890552336, "learning_rate": 1.027996107489908e-06, "loss": 0.4317, "step": 405 }, { "epoch": 4.611464968152866, "grad_norm": 0.36214089770988905, "learning_rate": 7.941262864611387e-07, "loss": 0.3987, "step": 410 }, { "epoch": 4.668082094833688, "grad_norm": 0.36943830762867996, "learning_rate": 5.898705306291508e-07, "loss": 0.4221, "step": 415 }, { "epoch": 4.724699221514508, "grad_norm": 0.37942562913876593, "learning_rate": 4.155437875646828e-07, "loss": 0.411, "step": 420 }, { "epoch": 4.781316348195329, "grad_norm": 0.4608507906679758, "learning_rate": 2.714148564700914e-07, "loss": 0.4161, "step": 425 }, { "epoch": 4.83793347487615, "grad_norm": 0.3739139558569457, "learning_rate": 1.577059737104447e-07, "loss": 0.4183, "step": 430 }, { "epoch": 4.894550601556971, "grad_norm": 0.3875991971455823, "learning_rate": 7.459247014117488e-08, "loss": 0.4124, "step": 435 }, { "epoch": 4.951167728237792, "grad_norm": 0.38268369764129956, "learning_rate": 2.220250076060193e-08, "loss": 0.4, "step": 440 }, { "epoch": 5.0, "grad_norm": 0.3971882274114809, "learning_rate": 6.168471042067303e-10, "loss": 0.3991, "step": 445 }, { "epoch": 5.0, "step": 445, "total_flos": 801400648892416.0, "train_loss": 0.72607141398312, "train_runtime": 41183.9838, "train_samples_per_second": 1.372, "train_steps_per_second": 0.011 } ], "logging_steps": 5, "max_steps": 445, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 801400648892416.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }