| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 0.35, | |
| "eval_steps": 100, | |
| "global_step": 700, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.0125, | |
| "grad_norm": 12.285691261291504, | |
| "learning_rate": 2.5555555555555554e-05, | |
| "loss": 19.1959, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 0.025, | |
| "grad_norm": 1.0500800609588623, | |
| "learning_rate": 5.333333333333333e-05, | |
| "loss": 7.7392, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.0375, | |
| "grad_norm": 0.8419748544692993, | |
| "learning_rate": 8.111111111111112e-05, | |
| "loss": 1.2479, | |
| "step": 75 | |
| }, | |
| { | |
| "epoch": 0.05, | |
| "grad_norm": 1.2464624643325806, | |
| "learning_rate": 0.00010888888888888889, | |
| "loss": 0.7071, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.05, | |
| "eval_loss": 0.5045744180679321, | |
| "eval_runtime": 690.6274, | |
| "eval_samples_per_second": 1.448, | |
| "eval_steps_per_second": 1.448, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.0625, | |
| "grad_norm": 0.8873828053474426, | |
| "learning_rate": 0.00013666666666666666, | |
| "loss": 0.4035, | |
| "step": 125 | |
| }, | |
| { | |
| "epoch": 0.075, | |
| "grad_norm": 0.9160810708999634, | |
| "learning_rate": 0.00016444444444444444, | |
| "loss": 0.2371, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.0875, | |
| "grad_norm": 0.7777899503707886, | |
| "learning_rate": 0.00019222222222222224, | |
| "loss": 0.1825, | |
| "step": 175 | |
| }, | |
| { | |
| "epoch": 0.1, | |
| "grad_norm": 0.9546622037887573, | |
| "learning_rate": 0.0001999952797459453, | |
| "loss": 0.1328, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.1, | |
| "eval_loss": 0.12972435355186462, | |
| "eval_runtime": 689.9886, | |
| "eval_samples_per_second": 1.449, | |
| "eval_steps_per_second": 1.449, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.1125, | |
| "grad_norm": 0.6794782876968384, | |
| "learning_rate": 0.00019997306349823693, | |
| "loss": 0.1162, | |
| "step": 225 | |
| }, | |
| { | |
| "epoch": 0.125, | |
| "grad_norm": 0.8062716126441956, | |
| "learning_rate": 0.00019993264143226513, | |
| "loss": 0.1048, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.1375, | |
| "grad_norm": 0.8031811118125916, | |
| "learning_rate": 0.00019987402090918067, | |
| "loss": 0.0988, | |
| "step": 275 | |
| }, | |
| { | |
| "epoch": 0.15, | |
| "grad_norm": 0.37140652537345886, | |
| "learning_rate": 0.000199797212604205, | |
| "loss": 0.0885, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.15, | |
| "eval_loss": 0.09123973548412323, | |
| "eval_runtime": 689.6116, | |
| "eval_samples_per_second": 1.45, | |
| "eval_steps_per_second": 1.45, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.1625, | |
| "grad_norm": 0.3891146779060364, | |
| "learning_rate": 0.0001997022305046862, | |
| "loss": 0.0797, | |
| "step": 325 | |
| }, | |
| { | |
| "epoch": 0.175, | |
| "grad_norm": 0.2549072504043579, | |
| "learning_rate": 0.00019958909190755187, | |
| "loss": 0.0795, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.1875, | |
| "grad_norm": 0.16435463726520538, | |
| "learning_rate": 0.0001994578174161592, | |
| "loss": 0.0774, | |
| "step": 375 | |
| }, | |
| { | |
| "epoch": 0.2, | |
| "grad_norm": 0.9256812930107117, | |
| "learning_rate": 0.00019930843093654305, | |
| "loss": 0.0737, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.2, | |
| "eval_loss": 0.0729941874742508, | |
| "eval_runtime": 690.0692, | |
| "eval_samples_per_second": 1.449, | |
| "eval_steps_per_second": 1.449, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.2125, | |
| "grad_norm": 0.2688423693180084, | |
| "learning_rate": 0.00019914095967306223, | |
| "loss": 0.0703, | |
| "step": 425 | |
| }, | |
| { | |
| "epoch": 0.225, | |
| "grad_norm": 0.24806413054466248, | |
| "learning_rate": 0.0001989554341234457, | |
| "loss": 0.0661, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.2375, | |
| "grad_norm": 0.2767386734485626, | |
| "learning_rate": 0.00019875188807323842, | |
| "loss": 0.0675, | |
| "step": 475 | |
| }, | |
| { | |
| "epoch": 0.25, | |
| "grad_norm": 0.120990090072155, | |
| "learning_rate": 0.00019853035858964906, | |
| "loss": 0.0666, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.25, | |
| "eval_loss": 0.06402209401130676, | |
| "eval_runtime": 690.2132, | |
| "eval_samples_per_second": 1.449, | |
| "eval_steps_per_second": 1.449, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.2625, | |
| "grad_norm": 0.17512480914592743, | |
| "learning_rate": 0.0001982908860147995, | |
| "loss": 0.0634, | |
| "step": 525 | |
| }, | |
| { | |
| "epoch": 0.275, | |
| "grad_norm": 0.21513459086418152, | |
| "learning_rate": 0.00019803351395837848, | |
| "loss": 0.0667, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.2875, | |
| "grad_norm": 0.18517647683620453, | |
| "learning_rate": 0.00019775828928969975, | |
| "loss": 0.0626, | |
| "step": 575 | |
| }, | |
| { | |
| "epoch": 0.3, | |
| "grad_norm": 0.15308107435703278, | |
| "learning_rate": 0.00019746526212916705, | |
| "loss": 0.0603, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.3, | |
| "eval_loss": 0.06020086258649826, | |
| "eval_runtime": 690.5491, | |
| "eval_samples_per_second": 1.448, | |
| "eval_steps_per_second": 1.448, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.3125, | |
| "grad_norm": 0.1161598339676857, | |
| "learning_rate": 0.00019715448583914659, | |
| "loss": 0.06, | |
| "step": 625 | |
| }, | |
| { | |
| "epoch": 0.325, | |
| "grad_norm": 0.13704052567481995, | |
| "learning_rate": 0.0001968260170142496, | |
| "loss": 0.0611, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 0.3375, | |
| "grad_norm": 0.1627456545829773, | |
| "learning_rate": 0.0001964799154710258, | |
| "loss": 0.0595, | |
| "step": 675 | |
| }, | |
| { | |
| "epoch": 0.35, | |
| "grad_norm": 0.14293187856674194, | |
| "learning_rate": 0.0001961162442370707, | |
| "loss": 0.0606, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.35, | |
| "eval_loss": 0.05934726819396019, | |
| "eval_runtime": 690.4657, | |
| "eval_samples_per_second": 1.448, | |
| "eval_steps_per_second": 1.448, | |
| "step": 700 | |
| } | |
| ], | |
| "logging_steps": 25, | |
| "max_steps": 6000, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 3, | |
| "save_steps": 100, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 1.42445094371328e+16, | |
| "train_batch_size": 1, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |