{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 3.0,
  "eval_steps": 1000,
  "global_step": 31479,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.047650814828933574,
      "grad_norm": 2.0100979804992676,
      "learning_rate": 0.00019682327901140442,
      "loss": 2.3566,
      "step": 500
    },
    {
      "epoch": 0.09530162965786715,
      "grad_norm": 1.877261996269226,
      "learning_rate": 0.00019364655802280888,
      "loss": 2.2178,
      "step": 1000
    },
    {
      "epoch": 0.09530162965786715,
      "eval_loss": 2.3788223266601562,
      "eval_runtime": 80.302,
      "eval_samples_per_second": 173.271,
      "eval_steps_per_second": 7.223,
      "step": 1000
    },
    {
      "epoch": 0.14295244448680072,
      "grad_norm": 1.7388309240341187,
      "learning_rate": 0.00019046983703421329,
      "loss": 2.1744,
      "step": 1500
    },
    {
      "epoch": 0.1906032593157343,
      "grad_norm": 1.8366143703460693,
      "learning_rate": 0.00018729311604561772,
      "loss": 2.163,
      "step": 2000
    },
    {
      "epoch": 0.1906032593157343,
      "eval_loss": 2.3654611110687256,
      "eval_runtime": 80.3504,
      "eval_samples_per_second": 173.166,
      "eval_steps_per_second": 7.218,
      "step": 2000
    },
    {
      "epoch": 0.23825407414466787,
      "grad_norm": 1.6628751754760742,
      "learning_rate": 0.00018411639505702213,
      "loss": 2.1515,
      "step": 2500
    },
    {
      "epoch": 0.28590488897360145,
      "grad_norm": 1.6291817426681519,
      "learning_rate": 0.0001809396740684266,
      "loss": 2.1196,
      "step": 3000
    },
    {
      "epoch": 0.28590488897360145,
      "eval_loss": 2.3521649837493896,
      "eval_runtime": 80.224,
      "eval_samples_per_second": 173.439,
      "eval_steps_per_second": 7.23,
      "step": 3000
    },
    {
      "epoch": 0.333555703802535,
      "grad_norm": 1.7604336738586426,
      "learning_rate": 0.000177762953079831,
      "loss": 2.1074,
      "step": 3500
    },
    {
      "epoch": 0.3812065186314686,
      "grad_norm": 1.34886634349823,
      "learning_rate": 0.00017458623209123543,
      "loss": 2.0926,
      "step": 4000
    },
    {
      "epoch": 0.3812065186314686,
      "eval_loss": 2.3420486450195312,
      "eval_runtime": 80.1942,
      "eval_samples_per_second": 173.504,
      "eval_steps_per_second": 7.232,
      "step": 4000
    },
    {
      "epoch": 0.42885733346040217,
      "grad_norm": 1.3510360717773438,
      "learning_rate": 0.00017140951110263986,
      "loss": 2.074,
      "step": 4500
    },
    {
      "epoch": 0.47650814828933574,
      "grad_norm": 1.272275447845459,
      "learning_rate": 0.0001682327901140443,
      "loss": 2.0752,
      "step": 5000
    },
    {
      "epoch": 0.47650814828933574,
      "eval_loss": 2.3270885944366455,
      "eval_runtime": 80.2619,
      "eval_samples_per_second": 173.358,
      "eval_steps_per_second": 7.226,
      "step": 5000
    },
    {
      "epoch": 0.5241589631182694,
      "grad_norm": 1.289753794670105,
      "learning_rate": 0.0001650560691254487,
      "loss": 2.0487,
      "step": 5500
    },
    {
      "epoch": 0.5718097779472029,
      "grad_norm": 1.1615971326828003,
      "learning_rate": 0.00016187934813685314,
      "loss": 2.0437,
      "step": 6000
    },
    {
      "epoch": 0.5718097779472029,
      "eval_loss": 2.3274528980255127,
      "eval_runtime": 80.2214,
      "eval_samples_per_second": 173.445,
      "eval_steps_per_second": 7.23,
      "step": 6000
    },
    {
      "epoch": 0.6194605927761365,
      "grad_norm": 1.3484673500061035,
      "learning_rate": 0.00015870262714825757,
      "loss": 2.0134,
      "step": 6500
    },
    {
      "epoch": 0.66711140760507,
      "grad_norm": 1.4737777709960938,
      "learning_rate": 0.000155525906159662,
      "loss": 2.0379,
      "step": 7000
    },
    {
      "epoch": 0.66711140760507,
      "eval_loss": 2.3164169788360596,
      "eval_runtime": 80.2177,
      "eval_samples_per_second": 173.453,
      "eval_steps_per_second": 7.23,
      "step": 7000
    },
    {
      "epoch": 0.7147622224340037,
      "grad_norm": 1.1502068042755127,
      "learning_rate": 0.00015234918517106642,
      "loss": 1.9916,
      "step": 7500
    },
    {
      "epoch": 0.7624130372629372,
      "grad_norm": 1.2299320697784424,
      "learning_rate": 0.00014917246418247085,
      "loss": 2.0068,
      "step": 8000
    },
    {
      "epoch": 0.7624130372629372,
      "eval_loss": 2.311408042907715,
      "eval_runtime": 80.2576,
      "eval_samples_per_second": 173.367,
      "eval_steps_per_second": 7.227,
      "step": 8000
    },
    {
      "epoch": 0.8100638520918708,
      "grad_norm": 1.2537345886230469,
      "learning_rate": 0.00014599574319387528,
      "loss": 1.9886,
      "step": 8500
    },
    {
      "epoch": 0.8577146669208043,
      "grad_norm": 1.0486429929733276,
      "learning_rate": 0.00014281902220527972,
      "loss": 1.9882,
      "step": 9000
    },
    {
      "epoch": 0.8577146669208043,
      "eval_loss": 2.304290294647217,
      "eval_runtime": 80.1372,
      "eval_samples_per_second": 173.627,
      "eval_steps_per_second": 7.238,
      "step": 9000
    },
    {
      "epoch": 0.905365481749738,
      "grad_norm": 1.1815516948699951,
      "learning_rate": 0.00013964230121668413,
      "loss": 1.9732,
      "step": 9500
    },
    {
      "epoch": 0.9530162965786715,
      "grad_norm": 1.2301689386367798,
      "learning_rate": 0.0001364655802280886,
      "loss": 1.9787,
      "step": 10000
    },
    {
      "epoch": 0.9530162965786715,
      "eval_loss": 2.2939772605895996,
      "eval_runtime": 80.1592,
      "eval_samples_per_second": 173.579,
      "eval_steps_per_second": 7.236,
      "step": 10000
    },
    {
      "epoch": 1.0006671114076051,
      "grad_norm": 1.497831106185913,
      "learning_rate": 0.000133288859239493,
      "loss": 1.9557,
      "step": 10500
    },
    {
      "epoch": 1.0483179262365387,
      "grad_norm": 1.3323341608047485,
      "learning_rate": 0.00013011213825089743,
      "loss": 1.7231,
      "step": 11000
    },
    {
      "epoch": 1.0483179262365387,
      "eval_loss": 2.313231945037842,
      "eval_runtime": 80.1199,
      "eval_samples_per_second": 173.665,
      "eval_steps_per_second": 7.239,
      "step": 11000
    },
    {
      "epoch": 1.0959687410654722,
      "grad_norm": 1.8000659942626953,
      "learning_rate": 0.00012693541726230184,
      "loss": 1.714,
      "step": 11500
    },
    {
      "epoch": 1.1436195558944058,
      "grad_norm": 1.2369180917739868,
      "learning_rate": 0.0001237586962737063,
      "loss": 1.7114,
      "step": 12000
    },
    {
      "epoch": 1.1436195558944058,
      "eval_loss": 2.313917875289917,
      "eval_runtime": 80.1492,
      "eval_samples_per_second": 173.601,
      "eval_steps_per_second": 7.237,
      "step": 12000
    },
    {
      "epoch": 1.1912703707233394,
      "grad_norm": 1.431038498878479,
      "learning_rate": 0.0001205819752851107,
      "loss": 1.7283,
      "step": 12500
    },
    {
      "epoch": 1.238921185552273,
      "grad_norm": 1.4570106267929077,
      "learning_rate": 0.00011740525429651514,
      "loss": 1.7033,
      "step": 13000
    },
    {
      "epoch": 1.238921185552273,
      "eval_loss": 2.310853958129883,
      "eval_runtime": 80.0945,
      "eval_samples_per_second": 173.72,
      "eval_steps_per_second": 7.241,
      "step": 13000
    },
    {
      "epoch": 1.2865720003812064,
      "grad_norm": 1.557187795639038,
      "learning_rate": 0.00011422853330791956,
      "loss": 1.7289,
      "step": 13500
    },
    {
      "epoch": 1.33422281521014,
      "grad_norm": 1.5775034427642822,
      "learning_rate": 0.000111051812319324,
      "loss": 1.7151,
      "step": 14000
    },
    {
      "epoch": 1.33422281521014,
      "eval_loss": 2.300920009613037,
      "eval_runtime": 80.1537,
      "eval_samples_per_second": 173.591,
      "eval_steps_per_second": 7.236,
      "step": 14000
    },
    {
      "epoch": 1.3818736300390737,
      "grad_norm": 1.2451566457748413,
      "learning_rate": 0.00010787509133072841,
      "loss": 1.7218,
      "step": 14500
    },
    {
      "epoch": 1.4295244448680071,
      "grad_norm": 1.650688886642456,
      "learning_rate": 0.00010469837034213286,
      "loss": 1.7202,
      "step": 15000
    },
    {
      "epoch": 1.4295244448680071,
      "eval_loss": 2.290478467941284,
      "eval_runtime": 80.1852,
      "eval_samples_per_second": 173.523,
      "eval_steps_per_second": 7.233,
      "step": 15000
    },
    {
      "epoch": 1.4771752596969407,
      "grad_norm": 1.4705020189285278,
      "learning_rate": 0.00010152164935353727,
      "loss": 1.721,
      "step": 15500
    },
    {
      "epoch": 1.5248260745258744,
      "grad_norm": 1.530394434928894,
      "learning_rate": 9.834492836494172e-05,
      "loss": 1.7261,
      "step": 16000
    },
    {
      "epoch": 1.5248260745258744,
      "eval_loss": 2.2944624423980713,
      "eval_runtime": 80.1122,
      "eval_samples_per_second": 173.682,
      "eval_steps_per_second": 7.24,
      "step": 16000
    },
    {
      "epoch": 1.572476889354808,
      "grad_norm": 1.667024850845337,
      "learning_rate": 9.516820737634614e-05,
      "loss": 1.7072,
      "step": 16500
    },
    {
      "epoch": 1.6201277041837416,
      "grad_norm": 1.4624521732330322,
      "learning_rate": 9.199148638775057e-05,
      "loss": 1.7091,
      "step": 17000
    },
    {
      "epoch": 1.6201277041837416,
      "eval_loss": 2.2861549854278564,
      "eval_runtime": 80.0947,
      "eval_samples_per_second": 173.719,
      "eval_steps_per_second": 7.241,
      "step": 17000
    },
    {
      "epoch": 1.6677785190126753,
      "grad_norm": 1.7141919136047363,
      "learning_rate": 8.881476539915499e-05,
      "loss": 1.7281,
      "step": 17500
    },
    {
      "epoch": 1.7154293338416087,
      "grad_norm": 1.367767333984375,
      "learning_rate": 8.563804441055943e-05,
      "loss": 1.7098,
      "step": 18000
    },
    {
      "epoch": 1.7154293338416087,
      "eval_loss": 2.2811758518218994,
      "eval_runtime": 80.1424,
      "eval_samples_per_second": 173.616,
      "eval_steps_per_second": 7.237,
      "step": 18000
    },
    {
      "epoch": 1.7630801486705423,
      "grad_norm": 1.530991792678833,
      "learning_rate": 8.246132342196385e-05,
      "loss": 1.6994,
      "step": 18500
    },
    {
      "epoch": 1.8107309634994757,
      "grad_norm": 1.4421322345733643,
      "learning_rate": 7.928460243336828e-05,
      "loss": 1.6943,
      "step": 19000
    },
    {
      "epoch": 1.8107309634994757,
      "eval_loss": 2.273425579071045,
      "eval_runtime": 80.1385,
      "eval_samples_per_second": 173.624,
      "eval_steps_per_second": 7.237,
      "step": 19000
    },
    {
      "epoch": 1.8583817783284093,
      "grad_norm": 1.5695687532424927,
      "learning_rate": 7.610788144477272e-05,
      "loss": 1.7,
      "step": 19500
    },
    {
      "epoch": 1.906032593157343,
      "grad_norm": 1.6507039070129395,
      "learning_rate": 7.293116045617714e-05,
      "loss": 1.7035,
      "step": 20000
    },
    {
      "epoch": 1.906032593157343,
      "eval_loss": 2.266268730163574,
      "eval_runtime": 80.1631,
      "eval_samples_per_second": 173.571,
      "eval_steps_per_second": 7.235,
      "step": 20000
    },
    {
      "epoch": 1.9536834079862766,
      "grad_norm": 1.41545832157135,
      "learning_rate": 6.975443946758157e-05,
      "loss": 1.6948,
      "step": 20500
    },
    {
      "epoch": 2.0013342228152102,
      "grad_norm": 1.3855451345443726,
      "learning_rate": 6.657771847898599e-05,
      "loss": 1.6776,
      "step": 21000
    },
    {
      "epoch": 2.0013342228152102,
      "eval_loss": 2.302978515625,
      "eval_runtime": 80.1675,
      "eval_samples_per_second": 173.562,
      "eval_steps_per_second": 7.235,
      "step": 21000
    },
    {
      "epoch": 2.048985037644144,
      "grad_norm": 1.3997050523757935,
      "learning_rate": 6.340099749039043e-05,
      "loss": 1.438,
      "step": 21500
    },
    {
      "epoch": 2.0966358524730775,
      "grad_norm": 1.4828859567642212,
      "learning_rate": 6.0224276501794854e-05,
      "loss": 1.4406,
      "step": 22000
    },
    {
      "epoch": 2.0966358524730775,
      "eval_loss": 2.3172175884246826,
      "eval_runtime": 80.1748,
      "eval_samples_per_second": 173.546,
      "eval_steps_per_second": 7.234,
      "step": 22000
    },
    {
      "epoch": 2.1442866673020107,
      "grad_norm": 1.8176885843276978,
      "learning_rate": 5.704755551319928e-05,
      "loss": 1.4555,
      "step": 22500
    },
    {
      "epoch": 2.1919374821309443,
      "grad_norm": 1.48106050491333,
      "learning_rate": 5.387083452460371e-05,
      "loss": 1.4659,
      "step": 23000
    },
    {
      "epoch": 2.1919374821309443,
      "eval_loss": 2.3182783126831055,
      "eval_runtime": 80.2101,
      "eval_samples_per_second": 173.47,
      "eval_steps_per_second": 7.231,
      "step": 23000
    },
    {
      "epoch": 2.239588296959878,
      "grad_norm": 1.6957001686096191,
      "learning_rate": 5.0694113536008136e-05,
      "loss": 1.448,
      "step": 23500
    },
    {
      "epoch": 2.2872391117888116,
      "grad_norm": 1.3845641613006592,
      "learning_rate": 4.7517392547412564e-05,
      "loss": 1.4608,
      "step": 24000
    },
    {
      "epoch": 2.2872391117888116,
      "eval_loss": 2.318488836288452,
      "eval_runtime": 80.1689,
      "eval_samples_per_second": 173.559,
      "eval_steps_per_second": 7.235,
      "step": 24000
    },
    {
      "epoch": 2.334889926617745,
      "grad_norm": 1.9913188219070435,
      "learning_rate": 4.434067155881699e-05,
      "loss": 1.439,
      "step": 24500
    },
    {
      "epoch": 2.382540741446679,
      "grad_norm": 1.8244202136993408,
      "learning_rate": 4.116395057022142e-05,
      "loss": 1.4423,
      "step": 25000
    },
    {
      "epoch": 2.382540741446679,
      "eval_loss": 2.3121349811553955,
      "eval_runtime": 80.1537,
      "eval_samples_per_second": 173.591,
      "eval_steps_per_second": 7.236,
      "step": 25000
    },
    {
      "epoch": 2.4301915562756125,
      "grad_norm": 1.347023606300354,
      "learning_rate": 3.7987229581625846e-05,
      "loss": 1.4506,
      "step": 25500
    },
    {
      "epoch": 2.477842371104546,
      "grad_norm": 1.49163019657135,
      "learning_rate": 3.481050859303028e-05,
      "loss": 1.4378,
      "step": 26000
    },
    {
      "epoch": 2.477842371104546,
      "eval_loss": 2.3090391159057617,
      "eval_runtime": 80.1708,
      "eval_samples_per_second": 173.554,
      "eval_steps_per_second": 7.235,
      "step": 26000
    },
    {
      "epoch": 2.5254931859334793,
      "grad_norm": 1.7945301532745361,
      "learning_rate": 3.163378760443471e-05,
      "loss": 1.4436,
      "step": 26500
    },
    {
      "epoch": 2.573144000762413,
      "grad_norm": 1.5082517862319946,
      "learning_rate": 2.8457066615839136e-05,
      "loss": 1.4277,
      "step": 27000
    },
    {
      "epoch": 2.573144000762413,
      "eval_loss": 2.3082542419433594,
      "eval_runtime": 80.1802,
      "eval_samples_per_second": 173.534,
      "eval_steps_per_second": 7.234,
      "step": 27000
    },
    {
      "epoch": 2.6207948155913465,
      "grad_norm": 1.4329321384429932,
      "learning_rate": 2.5280345627243563e-05,
      "loss": 1.4301,
      "step": 27500
    },
    {
      "epoch": 2.66844563042028,
      "grad_norm": 1.2606436014175415,
      "learning_rate": 2.2103624638647987e-05,
      "loss": 1.4251,
      "step": 28000
    },
    {
      "epoch": 2.66844563042028,
      "eval_loss": 2.2960703372955322,
      "eval_runtime": 80.1531,
      "eval_samples_per_second": 173.593,
      "eval_steps_per_second": 7.236,
      "step": 28000
    },
    {
      "epoch": 2.716096445249214,
      "grad_norm": 1.4542068243026733,
      "learning_rate": 1.8926903650052415e-05,
      "loss": 1.4248,
      "step": 28500
    },
    {
      "epoch": 2.7637472600781474,
      "grad_norm": 1.6642916202545166,
      "learning_rate": 1.5750182661456846e-05,
      "loss": 1.4219,
      "step": 29000
    },
    {
      "epoch": 2.7637472600781474,
      "eval_loss": 2.296442985534668,
      "eval_runtime": 80.1753,
      "eval_samples_per_second": 173.545,
      "eval_steps_per_second": 7.234,
      "step": 29000
    },
    {
      "epoch": 2.811398074907081,
      "grad_norm": 2.0301756858825684,
      "learning_rate": 1.2573461672861273e-05,
      "loss": 1.4281,
      "step": 29500
    },
    {
      "epoch": 2.8590488897360142,
      "grad_norm": 1.6031594276428223,
      "learning_rate": 9.3967406842657e-06,
      "loss": 1.434,
      "step": 30000
    },
    {
      "epoch": 2.8590488897360142,
      "eval_loss": 2.2933690547943115,
      "eval_runtime": 80.1482,
      "eval_samples_per_second": 173.603,
      "eval_steps_per_second": 7.237,
      "step": 30000
    },
    {
      "epoch": 2.9066997045649483,
      "grad_norm": 1.6658378839492798,
      "learning_rate": 6.22001969567013e-06,
      "loss": 1.4291,
      "step": 30500
    },
    {
      "epoch": 2.9543505193938815,
      "grad_norm": 1.589982032775879,
      "learning_rate": 3.0432987070745578e-06,
      "loss": 1.4279,
      "step": 31000
    },
    {
      "epoch": 2.9543505193938815,
      "eval_loss": 2.2906086444854736,
      "eval_runtime": 80.2746,
      "eval_samples_per_second": 173.33,
      "eval_steps_per_second": 7.225,
      "step": 31000
    }
  ],
  "logging_steps": 500,
  "max_steps": 31479,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 50000,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 4.934891962368e+16,
  "train_batch_size": 24,
  "trial_name": null,
  "trial_params": null
}