{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.9890909090909092, "eval_steps": 5, "global_step": 136, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.07272727272727272, "eval_loss": 0.8767483830451965, "eval_runtime": 30.8146, "eval_samples_per_second": 15.285, "eval_steps_per_second": 3.829, "step": 5 }, { "epoch": 0.14545454545454545, "grad_norm": 3.115849256515503, "learning_rate": 3.571428571428572e-05, "loss": 3.2126, "step": 10 }, { "epoch": 0.14545454545454545, "eval_loss": 0.779396116733551, "eval_runtime": 27.2217, "eval_samples_per_second": 17.302, "eval_steps_per_second": 4.335, "step": 10 }, { "epoch": 0.21818181818181817, "eval_loss": 0.7435232996940613, "eval_runtime": 27.2836, "eval_samples_per_second": 17.263, "eval_steps_per_second": 4.325, "step": 15 }, { "epoch": 0.2909090909090909, "grad_norm": 2.84679913520813, "learning_rate": 7.142857142857143e-05, "loss": 2.9019, "step": 20 }, { "epoch": 0.2909090909090909, "eval_loss": 0.6930269598960876, "eval_runtime": 27.2747, "eval_samples_per_second": 17.269, "eval_steps_per_second": 4.326, "step": 20 }, { "epoch": 0.36363636363636365, "eval_loss": 0.6732496619224548, "eval_runtime": 27.2843, "eval_samples_per_second": 17.263, "eval_steps_per_second": 4.325, "step": 25 }, { "epoch": 0.43636363636363634, "grad_norm": 2.5042567253112793, "learning_rate": 9.998342337571565e-05, "loss": 2.9314, "step": 30 }, { "epoch": 0.43636363636363634, "eval_loss": 0.6518906950950623, "eval_runtime": 27.2739, "eval_samples_per_second": 17.269, "eval_steps_per_second": 4.326, "step": 30 }, { "epoch": 0.509090909090909, "eval_loss": 0.6448661088943481, "eval_runtime": 27.279, "eval_samples_per_second": 17.266, "eval_steps_per_second": 4.326, "step": 35 }, { "epoch": 0.5818181818181818, "grad_norm": 2.148580312728882, "learning_rate": 9.940439480455386e-05, "loss": 2.3866, "step": 40 }, { "epoch": 0.5818181818181818, "eval_loss": 0.6359825730323792, "eval_runtime": 27.2872, "eval_samples_per_second": 17.261, "eval_steps_per_second": 4.324, "step": 40 }, { "epoch": 0.6545454545454545, "eval_loss": 0.6275559663772583, "eval_runtime": 27.2773, "eval_samples_per_second": 17.267, "eval_steps_per_second": 4.326, "step": 45 }, { "epoch": 0.7272727272727273, "grad_norm": 3.116907835006714, "learning_rate": 9.800749368358009e-05, "loss": 2.7002, "step": 50 }, { "epoch": 0.7272727272727273, "eval_loss": 0.6227777004241943, "eval_runtime": 27.2702, "eval_samples_per_second": 17.272, "eval_steps_per_second": 4.327, "step": 50 }, { "epoch": 0.8, "eval_loss": 0.6195926070213318, "eval_runtime": 27.263, "eval_samples_per_second": 17.276, "eval_steps_per_second": 4.328, "step": 55 }, { "epoch": 0.8727272727272727, "grad_norm": 4.004441738128662, "learning_rate": 9.581584522435024e-05, "loss": 2.5389, "step": 60 }, { "epoch": 0.8727272727272727, "eval_loss": 0.6146515011787415, "eval_runtime": 27.269, "eval_samples_per_second": 17.272, "eval_steps_per_second": 4.327, "step": 60 }, { "epoch": 0.9454545454545454, "eval_loss": 0.610045850276947, "eval_runtime": 27.3064, "eval_samples_per_second": 17.249, "eval_steps_per_second": 4.321, "step": 65 }, { "epoch": 1.029090909090909, "grad_norm": 2.843431234359741, "learning_rate": 9.286573140381662e-05, "loss": 2.4095, "step": 70 }, { "epoch": 1.029090909090909, "eval_loss": 0.6076003909111023, "eval_runtime": 27.2492, "eval_samples_per_second": 17.285, "eval_steps_per_second": 4.33, "step": 70 }, { "epoch": 1.1018181818181818, "eval_loss": 0.6160494089126587, "eval_runtime": 27.2598, "eval_samples_per_second": 17.278, "eval_steps_per_second": 4.329, "step": 75 }, { "epoch": 1.1745454545454546, "grad_norm": 4.093331336975098, "learning_rate": 8.920599032883554e-05, "loss": 2.0692, "step": 80 }, { "epoch": 1.1745454545454546, "eval_loss": 0.6218172311782837, "eval_runtime": 27.2772, "eval_samples_per_second": 17.267, "eval_steps_per_second": 4.326, "step": 80 }, { "epoch": 1.2472727272727273, "eval_loss": 0.6184111833572388, "eval_runtime": 27.3075, "eval_samples_per_second": 17.248, "eval_steps_per_second": 4.321, "step": 85 }, { "epoch": 1.32, "grad_norm": 2.453794240951538, "learning_rate": 8.489720773831717e-05, "loss": 1.8616, "step": 90 }, { "epoch": 1.32, "eval_loss": 0.6163813471794128, "eval_runtime": 27.2607, "eval_samples_per_second": 17.278, "eval_steps_per_second": 4.329, "step": 90 }, { "epoch": 1.3927272727272726, "eval_loss": 0.6139249801635742, "eval_runtime": 27.2587, "eval_samples_per_second": 17.279, "eval_steps_per_second": 4.329, "step": 95 }, { "epoch": 1.4654545454545453, "grad_norm": 3.4409964084625244, "learning_rate": 8.001071402741842e-05, "loss": 2.1618, "step": 100 }, { "epoch": 1.4654545454545453, "eval_loss": 0.6118062734603882, "eval_runtime": 27.245, "eval_samples_per_second": 17.288, "eval_steps_per_second": 4.331, "step": 100 }, { "epoch": 1.538181818181818, "eval_loss": 0.610755205154419, "eval_runtime": 27.2505, "eval_samples_per_second": 17.284, "eval_steps_per_second": 4.33, "step": 105 }, { "epoch": 1.6109090909090908, "grad_norm": 3.2959372997283936, "learning_rate": 7.462740339769324e-05, "loss": 2.0259, "step": 110 }, { "epoch": 1.6109090909090908, "eval_loss": 0.6130332350730896, "eval_runtime": 27.2607, "eval_samples_per_second": 17.278, "eval_steps_per_second": 4.329, "step": 110 }, { "epoch": 1.6836363636363636, "eval_loss": 0.6104254722595215, "eval_runtime": 27.2481, "eval_samples_per_second": 17.286, "eval_steps_per_second": 4.331, "step": 115 }, { "epoch": 1.7563636363636363, "grad_norm": 2.7095448970794678, "learning_rate": 6.883639468175927e-05, "loss": 1.8146, "step": 120 }, { "epoch": 1.7563636363636363, "eval_loss": 0.6095255613327026, "eval_runtime": 27.2419, "eval_samples_per_second": 17.29, "eval_steps_per_second": 4.332, "step": 120 }, { "epoch": 1.829090909090909, "eval_loss": 0.6098220944404602, "eval_runtime": 27.2413, "eval_samples_per_second": 17.29, "eval_steps_per_second": 4.332, "step": 125 }, { "epoch": 1.9018181818181819, "grad_norm": 3.7072970867156982, "learning_rate": 6.273355601206144e-05, "loss": 1.6977, "step": 130 }, { "epoch": 1.9018181818181819, "eval_loss": 0.6083605289459229, "eval_runtime": 27.262, "eval_samples_per_second": 17.277, "eval_steps_per_second": 4.328, "step": 130 }, { "epoch": 1.9745454545454546, "eval_loss": 0.6045976281166077, "eval_runtime": 27.2423, "eval_samples_per_second": 17.289, "eval_steps_per_second": 4.332, "step": 135 } ], "logging_steps": 10, "max_steps": 272, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 5.413948656205824e+16, "train_batch_size": 4, "trial_name": null, "trial_params": null }