{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.2,
  "eval_steps": 50,
  "global_step": 1000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.01,
      "grad_norm": 14.438831329345703,
      "learning_rate": 0.00015,
      "loss": 1.116,
      "step": 50
    },
    {
      "epoch": 0.01,
      "eval_loss": 0.8588868975639343,
      "eval_runtime": 680.2426,
      "eval_samples_per_second": 8.601,
      "eval_steps_per_second": 1.076,
      "step": 50
    },
    {
      "epoch": 0.02,
      "grad_norm": 1.0438328981399536,
      "learning_rate": 0.0003,
      "loss": 0.723,
      "step": 100
    },
    {
      "epoch": 0.02,
      "eval_loss": 0.8119707703590393,
      "eval_runtime": 681.2671,
      "eval_samples_per_second": 8.588,
      "eval_steps_per_second": 1.074,
      "step": 100
    },
    {
      "epoch": 0.03,
      "grad_norm": 0.8140527009963989,
      "learning_rate": 0.0002833333333333333,
      "loss": 0.6987,
      "step": 150
    },
    {
      "epoch": 0.03,
      "eval_loss": 0.7736836075782776,
      "eval_runtime": 681.2882,
      "eval_samples_per_second": 8.588,
      "eval_steps_per_second": 1.074,
      "step": 150
    },
    {
      "epoch": 0.04,
      "grad_norm": 0.7538554668426514,
      "learning_rate": 0.0002666666666666666,
      "loss": 0.6701,
      "step": 200
    },
    {
      "epoch": 0.04,
      "eval_loss": 0.7494252324104309,
      "eval_runtime": 681.1585,
      "eval_samples_per_second": 8.59,
      "eval_steps_per_second": 1.075,
      "step": 200
    },
    {
      "epoch": 0.05,
      "grad_norm": 0.8082383871078491,
      "learning_rate": 0.00025,
      "loss": 0.6585,
      "step": 250
    },
    {
      "epoch": 0.05,
      "eval_loss": 0.7359848022460938,
      "eval_runtime": 681.3996,
      "eval_samples_per_second": 8.587,
      "eval_steps_per_second": 1.074,
      "step": 250
    },
    {
      "epoch": 0.06,
      "grad_norm": 0.7390263080596924,
      "learning_rate": 0.0002333333333333333,
      "loss": 0.6451,
      "step": 300
    },
    {
      "epoch": 0.06,
      "eval_loss": 0.7315810322761536,
      "eval_runtime": 680.7597,
      "eval_samples_per_second": 8.595,
      "eval_steps_per_second": 1.075,
      "step": 300
    },
    {
      "epoch": 0.07,
      "grad_norm": 0.719605565071106,
      "learning_rate": 0.00021666666666666666,
      "loss": 0.6382,
      "step": 350
    },
    {
      "epoch": 0.07,
      "eval_loss": 0.6773383617401123,
      "eval_runtime": 680.8905,
      "eval_samples_per_second": 8.593,
      "eval_steps_per_second": 1.075,
      "step": 350
    },
    {
      "epoch": 0.08,
      "grad_norm": 0.8071795701980591,
      "learning_rate": 0.00019999999999999998,
      "loss": 0.6304,
      "step": 400
    },
    {
      "epoch": 0.08,
      "eval_loss": 0.6790196299552917,
      "eval_runtime": 680.4041,
      "eval_samples_per_second": 8.599,
      "eval_steps_per_second": 1.076,
      "step": 400
    },
    {
      "epoch": 0.09,
      "grad_norm": 0.6562775373458862,
      "learning_rate": 0.00018333333333333334,
      "loss": 0.6236,
      "step": 450
    },
    {
      "epoch": 0.09,
      "eval_loss": 0.667582631111145,
      "eval_runtime": 680.9279,
      "eval_samples_per_second": 8.593,
      "eval_steps_per_second": 1.075,
      "step": 450
    },
    {
      "epoch": 0.1,
      "grad_norm": 0.6858498454093933,
      "learning_rate": 0.00016666666666666666,
      "loss": 0.6127,
      "step": 500
    },
    {
      "epoch": 0.1,
      "eval_loss": 0.6611046195030212,
      "eval_runtime": 681.3448,
      "eval_samples_per_second": 8.587,
      "eval_steps_per_second": 1.074,
      "step": 500
    },
    {
      "epoch": 0.11,
      "grad_norm": 0.7050228714942932,
      "learning_rate": 0.00015,
      "loss": 0.6109,
      "step": 550
    },
    {
      "epoch": 0.11,
      "eval_loss": 0.6624142527580261,
      "eval_runtime": 680.8132,
      "eval_samples_per_second": 8.594,
      "eval_steps_per_second": 1.075,
      "step": 550
    },
    {
      "epoch": 0.12,
      "grad_norm": 0.7099040746688843,
      "learning_rate": 0.0001333333333333333,
      "loss": 0.6074,
      "step": 600
    },
    {
      "epoch": 0.12,
      "eval_loss": 0.6503908634185791,
      "eval_runtime": 680.9458,
      "eval_samples_per_second": 8.592,
      "eval_steps_per_second": 1.075,
      "step": 600
    },
    {
      "epoch": 0.13,
      "grad_norm": 0.7423191070556641,
      "learning_rate": 0.00011666666666666665,
      "loss": 0.5972,
      "step": 650
    },
    {
      "epoch": 0.13,
      "eval_loss": 0.6401746273040771,
      "eval_runtime": 680.6123,
      "eval_samples_per_second": 8.597,
      "eval_steps_per_second": 1.076,
      "step": 650
    },
    {
      "epoch": 0.14,
      "grad_norm": 0.6712120175361633,
      "learning_rate": 9.999999999999999e-05,
      "loss": 0.5912,
      "step": 700
    },
    {
      "epoch": 0.14,
      "eval_loss": 0.6332426071166992,
      "eval_runtime": 680.7596,
      "eval_samples_per_second": 8.595,
      "eval_steps_per_second": 1.075,
      "step": 700
    },
    {
      "epoch": 0.15,
      "grad_norm": 0.7194417715072632,
      "learning_rate": 8.333333333333333e-05,
      "loss": 0.5934,
      "step": 750
    },
    {
      "epoch": 0.15,
      "eval_loss": 0.6242749094963074,
      "eval_runtime": 679.731,
      "eval_samples_per_second": 8.608,
      "eval_steps_per_second": 1.077,
      "step": 750
    },
    {
      "epoch": 0.16,
      "grad_norm": 0.7845382690429688,
      "learning_rate": 6.666666666666666e-05,
      "loss": 0.5908,
      "step": 800
    },
    {
      "epoch": 0.16,
      "eval_loss": 0.6116130352020264,
      "eval_runtime": 681.1204,
      "eval_samples_per_second": 8.59,
      "eval_steps_per_second": 1.075,
      "step": 800
    },
    {
      "epoch": 0.17,
      "grad_norm": 0.7342799305915833,
      "learning_rate": 4.9999999999999996e-05,
      "loss": 0.5824,
      "step": 850
    },
    {
      "epoch": 0.17,
      "eval_loss": 0.6023569703102112,
      "eval_runtime": 680.663,
      "eval_samples_per_second": 8.596,
      "eval_steps_per_second": 1.075,
      "step": 850
    },
    {
      "epoch": 0.18,
      "grad_norm": 0.6745529174804688,
      "learning_rate": 3.333333333333333e-05,
      "loss": 0.5823,
      "step": 900
    },
    {
      "epoch": 0.18,
      "eval_loss": 0.5980194211006165,
      "eval_runtime": 680.2295,
      "eval_samples_per_second": 8.602,
      "eval_steps_per_second": 1.076,
      "step": 900
    },
    {
      "epoch": 0.19,
      "grad_norm": 0.7388492226600647,
      "learning_rate": 1.6666666666666664e-05,
      "loss": 0.5876,
      "step": 950
    },
    {
      "epoch": 0.19,
      "eval_loss": 0.5954298973083496,
      "eval_runtime": 680.8083,
      "eval_samples_per_second": 8.594,
      "eval_steps_per_second": 1.075,
      "step": 950
    },
    {
      "epoch": 0.2,
      "grad_norm": 0.8207566738128662,
      "learning_rate": 0.0,
      "loss": 0.5748,
      "step": 1000
    },
    {
      "epoch": 0.2,
      "eval_loss": 0.5945320725440979,
      "eval_runtime": 680.8349,
      "eval_samples_per_second": 8.594,
      "eval_steps_per_second": 1.075,
      "step": 1000
    }
  ],
  "logging_steps": 50,
  "max_steps": 1000,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 50,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 3.187924717080576e+16,
  "train_batch_size": 10,
  "trial_name": null,
  "trial_params": null
}