{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.5, "eval_steps": 500, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.02, "grad_norm": 6.0, "learning_rate": 3.8000000000000005e-06, "loss": 1.4753, "mean_token_accuracy": 0.6514762923121452, "num_tokens": 9165.0, "step": 20 }, { "epoch": 0.04, "grad_norm": 6.84375, "learning_rate": 7.800000000000002e-06, "loss": 1.3604, "mean_token_accuracy": 0.6717274159193038, "num_tokens": 19130.0, "step": 40 }, { "epoch": 0.06, "grad_norm": 4.90625, "learning_rate": 1.18e-05, "loss": 1.2917, "mean_token_accuracy": 0.691003431379795, "num_tokens": 30279.0, "step": 60 }, { "epoch": 0.08, "grad_norm": 30.625, "learning_rate": 1.58e-05, "loss": 0.9447, "mean_token_accuracy": 0.7457822680473327, "num_tokens": 42533.0, "step": 80 }, { "epoch": 0.1, "grad_norm": 9.875, "learning_rate": 1.98e-05, "loss": 1.3137, "mean_token_accuracy": 0.6690018624067307, "num_tokens": 48242.0, "step": 100 }, { "epoch": 0.12, "grad_norm": 18.0, "learning_rate": 1.988886498744505e-05, "loss": 1.0214, "mean_token_accuracy": 0.7285905957221985, "num_tokens": 59331.0, "step": 120 }, { "epoch": 0.14, "grad_norm": 3.515625, "learning_rate": 1.953454172319001e-05, "loss": 1.0517, "mean_token_accuracy": 0.7213401407003402, "num_tokens": 68979.0, "step": 140 }, { "epoch": 0.16, "grad_norm": 5.9375, "learning_rate": 1.894544639838025e-05, "loss": 1.1587, "mean_token_accuracy": 0.7021817207336426, "num_tokens": 76070.0, "step": 160 }, { "epoch": 0.18, "grad_norm": 7.0625, "learning_rate": 1.8136084495007874e-05, "loss": 1.0255, "mean_token_accuracy": 0.7250557497143746, "num_tokens": 85738.0, "step": 180 }, { "epoch": 0.2, "grad_norm": 18.875, "learning_rate": 1.7126385189252055e-05, "loss": 1.1949, "mean_token_accuracy": 0.6837209314107895, "num_tokens": 94928.0, "step": 200 }, { "epoch": 0.22, "grad_norm": 7.5, "learning_rate": 1.594121062902039e-05, "loss": 1.0857, "mean_token_accuracy": 0.7113691180944443, "num_tokens": 104773.0, "step": 220 }, { "epoch": 0.24, "grad_norm": 6.625, "learning_rate": 1.4609743745354625e-05, "loss": 0.9951, "mean_token_accuracy": 0.7261969804763794, "num_tokens": 114064.0, "step": 240 }, { "epoch": 0.26, "grad_norm": 19.125, "learning_rate": 1.3164769671815862e-05, "loss": 1.0099, "mean_token_accuracy": 0.7223263427615165, "num_tokens": 123805.0, "step": 260 }, { "epoch": 0.28, "grad_norm": 5.03125, "learning_rate": 1.164186846568863e-05, "loss": 1.0713, "mean_token_accuracy": 0.7073337942361831, "num_tokens": 134324.0, "step": 280 }, { "epoch": 0.3, "grad_norm": 11.5, "learning_rate": 1.0078539008887114e-05, "loss": 0.9601, "mean_token_accuracy": 0.7360554546117782, "num_tokens": 143153.0, "step": 300 }, { "epoch": 0.32, "grad_norm": 7.9375, "learning_rate": 8.51327566103077e-06, "loss": 0.9354, "mean_token_accuracy": 0.7383034735918045, "num_tokens": 154603.0, "step": 320 }, { "epoch": 0.34, "grad_norm": 6.875, "learning_rate": 6.984620400555044e-06, "loss": 0.9676, "mean_token_accuracy": 0.7273150086402893, "num_tokens": 167175.0, "step": 340 }, { "epoch": 0.36, "grad_norm": 17.625, "learning_rate": 5.53021379328879e-06, "loss": 1.085, "mean_token_accuracy": 0.715969854593277, "num_tokens": 176751.0, "step": 360 }, { "epoch": 0.38, "grad_norm": 6.0625, "learning_rate": 4.185868156801695e-06, "loss": 1.0635, "mean_token_accuracy": 0.7170247107744216, "num_tokens": 185233.0, "step": 380 }, { "epoch": 0.4, "grad_norm": 7.25, "learning_rate": 2.9846857422914434e-06, "loss": 1.0838, "mean_token_accuracy": 0.7044813245534897, "num_tokens": 193072.0, "step": 400 }, { "epoch": 0.42, "grad_norm": 6.21875, "learning_rate": 1.956243647299155e-06, "loss": 1.006, "mean_token_accuracy": 0.7172566086053849, "num_tokens": 204372.0, "step": 420 }, { "epoch": 0.44, "grad_norm": 11.8125, "learning_rate": 1.1258655294071686e-06, "loss": 0.9791, "mean_token_accuracy": 0.727970740199089, "num_tokens": 216681.0, "step": 440 }, { "epoch": 0.46, "grad_norm": 5.5, "learning_rate": 5.13998053744954e-07, "loss": 1.0866, "mean_token_accuracy": 0.7147636383771896, "num_tokens": 226708.0, "step": 460 }, { "epoch": 0.48, "grad_norm": 11.5, "learning_rate": 1.357074282350457e-07, "loss": 1.01, "mean_token_accuracy": 0.7146593451499939, "num_tokens": 237866.0, "step": 480 }, { "epoch": 0.5, "grad_norm": 6.90625, "learning_rate": 3.0842355210336515e-10, "loss": 1.1541, "mean_token_accuracy": 0.68363136947155, "num_tokens": 245487.0, "step": 500 } ], "logging_steps": 20, "max_steps": 500, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.0414470479035392e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }