| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 0.999559277214632, | |
| "eval_steps": 500, | |
| "global_step": 567, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "clip_ratio": 0.0, | |
| "clipped_completions_ratio": 0.0, | |
| "epoch": 0.0017628911414720142, | |
| "grad_norm": 0.6059878554390732, | |
| "kl": 0.0, | |
| "learning_rate": 1.7543859649122805e-08, | |
| "loss": 0.0179, | |
| "max_completion_length": 480.0, | |
| "max_terminated_completion_length": 480.0, | |
| "mean_completion_length": 269.765625, | |
| "mean_terminated_completion_length": 269.765625, | |
| "min_completion_length": 117.25, | |
| "min_terminated_completion_length": 117.25, | |
| "num_tokens": 153540.0, | |
| "reward": 0.5330176055431366, | |
| "reward_std": 0.3266839161515236, | |
| "rewards/format_reward/mean": 0.29296875, | |
| "rewards/format_reward/std": 0.4541962593793869, | |
| "rewards/qatch_metrics/mean": 0.5523437634110451, | |
| "rewards/qatch_metrics/std": 0.4678479805588722, | |
| "rewards/tag_count_reward/mean": 0.6845703125, | |
| "rewards/tag_count_reward/std": 0.25720928236842155, | |
| "step": 1 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "clipped_completions_ratio": 0.0009765625, | |
| "epoch": 0.00881445570736007, | |
| "grad_norm": 0.693657549234007, | |
| "kl": 0.00022602081298828125, | |
| "learning_rate": 8.771929824561403e-08, | |
| "loss": 0.063, | |
| "max_completion_length": 760.875, | |
| "max_terminated_completion_length": 537.8125, | |
| "mean_completion_length": 287.8251953125, | |
| "mean_terminated_completion_length": 284.0889148712158, | |
| "min_completion_length": 123.6875, | |
| "min_terminated_completion_length": 123.6875, | |
| "num_tokens": 822993.0, | |
| "reward": 0.366003917530179, | |
| "reward_std": 0.26989864744246006, | |
| "rewards/format_reward/mean": 0.3115234375, | |
| "rewards/format_reward/std": 0.45695164799690247, | |
| "rewards/qatch_metrics/mean": 0.35500976350158453, | |
| "rewards/qatch_metrics/std": 0.42219158448278904, | |
| "rewards/tag_count_reward/mean": 0.661865234375, | |
| "rewards/tag_count_reward/std": 0.30064977053552866, | |
| "step": 5 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "clipped_completions_ratio": 0.00078125, | |
| "epoch": 0.01762891141472014, | |
| "grad_norm": 0.6178086084314489, | |
| "kl": 0.00030994415283203125, | |
| "learning_rate": 1.7543859649122805e-07, | |
| "loss": 0.0434, | |
| "max_completion_length": 719.35, | |
| "max_terminated_completion_length": 541.05, | |
| "mean_completion_length": 288.9390625, | |
| "mean_terminated_completion_length": 285.9650680541992, | |
| "min_completion_length": 109.55, | |
| "min_terminated_completion_length": 109.55, | |
| "num_tokens": 1683731.0, | |
| "reward": 0.3988192930817604, | |
| "reward_std": 0.26640091091394424, | |
| "rewards/format_reward/mean": 0.2875, | |
| "rewards/format_reward/std": 0.4482169449329376, | |
| "rewards/qatch_metrics/mean": 0.3975424602627754, | |
| "rewards/qatch_metrics/std": 0.41136824041604997, | |
| "rewards/tag_count_reward/mean": 0.6431640625, | |
| "rewards/tag_count_reward/std": 0.30217186361551285, | |
| "step": 10 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "clipped_completions_ratio": 0.00234375, | |
| "epoch": 0.026443367122080213, | |
| "grad_norm": 0.5632927725681783, | |
| "kl": 0.00031604766845703123, | |
| "learning_rate": 2.631578947368421e-07, | |
| "loss": 0.0608, | |
| "max_completion_length": 1081.55, | |
| "max_terminated_completion_length": 586.1, | |
| "mean_completion_length": 304.02890625, | |
| "mean_terminated_completion_length": 295.2437545776367, | |
| "min_completion_length": 109.35, | |
| "min_terminated_completion_length": 109.35, | |
| "num_tokens": 2511848.0, | |
| "reward": 0.3817859634757042, | |
| "reward_std": 0.27212979570031165, | |
| "rewards/format_reward/mean": 0.3171875, | |
| "rewards/format_reward/std": 0.46050114631652833, | |
| "rewards/qatch_metrics/mean": 0.3732638031244278, | |
| "rewards/qatch_metrics/std": 0.4210278898477554, | |
| "rewards/tag_count_reward/mean": 0.655859375, | |
| "rewards/tag_count_reward/std": 0.3017656706273556, | |
| "step": 15 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "clipped_completions_ratio": 0.00234375, | |
| "epoch": 0.03525782282944028, | |
| "grad_norm": 0.6261671380195042, | |
| "kl": 0.0003917694091796875, | |
| "learning_rate": 3.508771929824561e-07, | |
| "loss": 0.0577, | |
| "max_completion_length": 1088.0, | |
| "max_terminated_completion_length": 559.35, | |
| "mean_completion_length": 294.12421875, | |
| "mean_terminated_completion_length": 285.19422454833983, | |
| "min_completion_length": 111.0, | |
| "min_terminated_completion_length": 111.0, | |
| "num_tokens": 3349911.0, | |
| "reward": 0.44279307052493094, | |
| "reward_std": 0.28929525390267374, | |
| "rewards/format_reward/mean": 0.36015625, | |
| "rewards/format_reward/std": 0.47752839177846906, | |
| "rewards/qatch_metrics/mean": 0.43892474174499513, | |
| "rewards/qatch_metrics/std": 0.42011758461594584, | |
| "rewards/tag_count_reward/mean": 0.673828125, | |
| "rewards/tag_count_reward/std": 0.30838444381952285, | |
| "step": 20 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "clipped_completions_ratio": 0.0, | |
| "epoch": 0.044072278536800354, | |
| "grad_norm": 0.714407937029021, | |
| "kl": 0.0008548736572265625, | |
| "learning_rate": 4.3859649122807013e-07, | |
| "loss": 0.0408, | |
| "max_completion_length": 582.0, | |
| "max_terminated_completion_length": 582.0, | |
| "mean_completion_length": 271.83125, | |
| "mean_terminated_completion_length": 271.83125, | |
| "min_completion_length": 108.1, | |
| "min_terminated_completion_length": 108.1, | |
| "num_tokens": 4160111.0, | |
| "reward": 0.42091118171811104, | |
| "reward_std": 0.2539832413196564, | |
| "rewards/format_reward/mean": 0.47265625, | |
| "rewards/format_reward/std": 0.49103155434131623, | |
| "rewards/qatch_metrics/mean": 0.3958674557507038, | |
| "rewards/qatch_metrics/std": 0.39918228760361674, | |
| "rewards/tag_count_reward/mean": 0.7431640625, | |
| "rewards/tag_count_reward/std": 0.2822920955717564, | |
| "step": 25 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "clipped_completions_ratio": 0.0015625, | |
| "epoch": 0.052886734244160426, | |
| "grad_norm": 0.6982184095943039, | |
| "kl": 0.002931976318359375, | |
| "learning_rate": 5.263157894736842e-07, | |
| "loss": 0.0763, | |
| "max_completion_length": 901.3, | |
| "max_terminated_completion_length": 546.6, | |
| "mean_completion_length": 251.38359375, | |
| "mean_terminated_completion_length": 245.34597091674806, | |
| "min_completion_length": 95.9, | |
| "min_terminated_completion_length": 95.9, | |
| "num_tokens": 4949066.0, | |
| "reward": 0.44982930943369864, | |
| "reward_std": 0.26805768758058546, | |
| "rewards/format_reward/mean": 0.67265625, | |
| "rewards/format_reward/std": 0.46562533974647524, | |
| "rewards/qatch_metrics/mean": 0.40089062303304673, | |
| "rewards/qatch_metrics/std": 0.416074800491333, | |
| "rewards/tag_count_reward/mean": 0.8361328125, | |
| "rewards/tag_count_reward/std": 0.24939355850219727, | |
| "step": 30 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "clipped_completions_ratio": 0.00078125, | |
| "epoch": 0.06170118995152049, | |
| "grad_norm": 0.6981838956357035, | |
| "kl": 0.005738067626953125, | |
| "learning_rate": 6.140350877192982e-07, | |
| "loss": 0.0481, | |
| "max_completion_length": 677.45, | |
| "max_terminated_completion_length": 491.75, | |
| "mean_completion_length": 215.7171875, | |
| "mean_terminated_completion_length": 212.69117126464843, | |
| "min_completion_length": 93.75, | |
| "min_terminated_completion_length": 93.75, | |
| "num_tokens": 5670128.0, | |
| "reward": 0.507285387814045, | |
| "reward_std": 0.2766963288187981, | |
| "rewards/format_reward/mean": 0.81953125, | |
| "rewards/format_reward/std": 0.3775612235069275, | |
| "rewards/qatch_metrics/mean": 0.4468752659857273, | |
| "rewards/qatch_metrics/std": 0.41472957879304884, | |
| "rewards/tag_count_reward/mean": 0.909765625, | |
| "rewards/tag_count_reward/std": 0.1821597468107939, | |
| "step": 35 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "clipped_completions_ratio": 0.0, | |
| "epoch": 0.07051564565888056, | |
| "grad_norm": 0.7068871695919721, | |
| "kl": 0.01212921142578125, | |
| "learning_rate": 7.017543859649122e-07, | |
| "loss": 0.0372, | |
| "max_completion_length": 455.0, | |
| "max_terminated_completion_length": 455.0, | |
| "mean_completion_length": 195.53359375, | |
| "mean_terminated_completion_length": 195.53359375, | |
| "min_completion_length": 88.55, | |
| "min_terminated_completion_length": 88.55, | |
| "num_tokens": 6395659.0, | |
| "reward": 0.42992587983608244, | |
| "reward_std": 0.24781498908996583, | |
| "rewards/format_reward/mean": 0.93671875, | |
| "rewards/format_reward/std": 0.22327023521065711, | |
| "rewards/qatch_metrics/mean": 0.3395497426390648, | |
| "rewards/qatch_metrics/std": 0.42959350496530535, | |
| "rewards/tag_count_reward/mean": 0.952734375, | |
| "rewards/tag_count_reward/std": 0.1316565966233611, | |
| "step": 40 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "clipped_completions_ratio": 0.0, | |
| "epoch": 0.07933010136624064, | |
| "grad_norm": 0.6318192986531537, | |
| "kl": 0.0152313232421875, | |
| "learning_rate": 7.894736842105263e-07, | |
| "loss": 0.0262, | |
| "max_completion_length": 561.0, | |
| "max_terminated_completion_length": 561.0, | |
| "mean_completion_length": 179.49296875, | |
| "mean_terminated_completion_length": 179.49296875, | |
| "min_completion_length": 89.35, | |
| "min_terminated_completion_length": 89.35, | |
| "num_tokens": 7123970.0, | |
| "reward": 0.4788477897644043, | |
| "reward_std": 0.22096828632056714, | |
| "rewards/format_reward/mean": 0.9734375, | |
| "rewards/format_reward/std": 0.13465526476502418, | |
| "rewards/qatch_metrics/mean": 0.39125704020261765, | |
| "rewards/qatch_metrics/std": 0.42074447572231294, | |
| "rewards/tag_count_reward/mean": 0.9787109375, | |
| "rewards/tag_count_reward/std": 0.07670327685773373, | |
| "step": 45 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "clipped_completions_ratio": 0.00078125, | |
| "epoch": 0.08814455707360071, | |
| "grad_norm": 0.824731355900669, | |
| "kl": 0.0237152099609375, | |
| "learning_rate": 8.771929824561403e-07, | |
| "loss": 0.0207, | |
| "max_completion_length": 580.35, | |
| "max_terminated_completion_length": 416.4, | |
| "mean_completion_length": 175.06328125, | |
| "mean_terminated_completion_length": 172.03970794677736, | |
| "min_completion_length": 86.15, | |
| "min_terminated_completion_length": 86.15, | |
| "num_tokens": 7825859.0, | |
| "reward": 0.5201250776648522, | |
| "reward_std": 0.26422476917505267, | |
| "rewards/format_reward/mean": 0.98046875, | |
| "rewards/format_reward/std": 0.09858547002077103, | |
| "rewards/qatch_metrics/mean": 0.43842838853597643, | |
| "rewards/qatch_metrics/std": 0.4300079450011253, | |
| "rewards/tag_count_reward/mean": 0.98828125, | |
| "rewards/tag_count_reward/std": 0.05498028658330441, | |
| "step": 50 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "clipped_completions_ratio": 0.00078125, | |
| "epoch": 0.09695901278096078, | |
| "grad_norm": 0.6613474760126196, | |
| "kl": 0.0172088623046875, | |
| "learning_rate": 9.649122807017545e-07, | |
| "loss": 0.0294, | |
| "max_completion_length": 602.3, | |
| "max_terminated_completion_length": 427.75, | |
| "mean_completion_length": 183.23203125, | |
| "mean_terminated_completion_length": 180.19742126464843, | |
| "min_completion_length": 89.5, | |
| "min_terminated_completion_length": 89.5, | |
| "num_tokens": 8528012.0, | |
| "reward": 0.553113266825676, | |
| "reward_std": 0.21849482469260692, | |
| "rewards/format_reward/mean": 0.97890625, | |
| "rewards/format_reward/std": 0.11011371463537216, | |
| "rewards/qatch_metrics/mean": 0.47743333876132965, | |
| "rewards/qatch_metrics/std": 0.44010845869779586, | |
| "rewards/tag_count_reward/mean": 0.9880859375, | |
| "rewards/tag_count_reward/std": 0.05850886330008507, | |
| "step": 55 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "clipped_completions_ratio": 0.0, | |
| "epoch": 0.10577346848832085, | |
| "grad_norm": 0.6522964666336608, | |
| "kl": 0.014208984375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0261, | |
| "max_completion_length": 485.3, | |
| "max_terminated_completion_length": 485.3, | |
| "mean_completion_length": 209.71796875, | |
| "mean_terminated_completion_length": 209.71796875, | |
| "min_completion_length": 94.15, | |
| "min_terminated_completion_length": 94.15, | |
| "num_tokens": 9233251.0, | |
| "reward": 0.4863431349396706, | |
| "reward_std": 0.2265178494155407, | |
| "rewards/format_reward/mean": 0.95078125, | |
| "rewards/format_reward/std": 0.2008387751877308, | |
| "rewards/qatch_metrics/mean": 0.4027749992907047, | |
| "rewards/qatch_metrics/std": 0.4363295495510101, | |
| "rewards/tag_count_reward/mean": 0.978125, | |
| "rewards/tag_count_reward/std": 0.08785357438027859, | |
| "step": 60 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "clipped_completions_ratio": 0.00078125, | |
| "epoch": 0.11458792419568092, | |
| "grad_norm": 0.5373906090936745, | |
| "kl": 0.010211181640625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0351, | |
| "max_completion_length": 659.45, | |
| "max_terminated_completion_length": 475.2, | |
| "mean_completion_length": 233.5375, | |
| "mean_terminated_completion_length": 230.51361694335938, | |
| "min_completion_length": 97.15, | |
| "min_terminated_completion_length": 97.15, | |
| "num_tokens": 10003955.0, | |
| "reward": 0.4962065383791924, | |
| "reward_std": 0.2374630995094776, | |
| "rewards/format_reward/mean": 0.9296875, | |
| "rewards/format_reward/std": 0.2426914632320404, | |
| "rewards/qatch_metrics/mean": 0.4170789122581482, | |
| "rewards/qatch_metrics/std": 0.4158875457942486, | |
| "rewards/tag_count_reward/mean": 0.9744140625, | |
| "rewards/tag_count_reward/std": 0.09585625268518924, | |
| "step": 65 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "clipped_completions_ratio": 0.00078125, | |
| "epoch": 0.12340237990304098, | |
| "grad_norm": 0.5390725877788498, | |
| "kl": 0.0112091064453125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0309, | |
| "max_completion_length": 699.8, | |
| "max_terminated_completion_length": 523.05, | |
| "mean_completion_length": 260.1734375, | |
| "mean_terminated_completion_length": 257.2174865722656, | |
| "min_completion_length": 105.05, | |
| "min_terminated_completion_length": 105.05, | |
| "num_tokens": 10821969.0, | |
| "reward": 0.4819660037755966, | |
| "reward_std": 0.24495334178209305, | |
| "rewards/format_reward/mean": 0.91171875, | |
| "rewards/format_reward/std": 0.2768270045518875, | |
| "rewards/qatch_metrics/mean": 0.4029333367943764, | |
| "rewards/qatch_metrics/std": 0.40151628404855727, | |
| "rewards/tag_count_reward/mean": 0.966015625, | |
| "rewards/tag_count_reward/std": 0.11766693852841854, | |
| "step": 70 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "clipped_completions_ratio": 0.0015625, | |
| "epoch": 0.13221683561040107, | |
| "grad_norm": 0.5095226818805257, | |
| "kl": 0.0107269287109375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0607, | |
| "max_completion_length": 914.05, | |
| "max_terminated_completion_length": 550.2, | |
| "mean_completion_length": 280.80703125, | |
| "mean_terminated_completion_length": 274.77569580078125, | |
| "min_completion_length": 106.75, | |
| "min_terminated_completion_length": 106.75, | |
| "num_tokens": 11687082.0, | |
| "reward": 0.5201233088970184, | |
| "reward_std": 0.23353515826165677, | |
| "rewards/format_reward/mean": 0.92109375, | |
| "rewards/format_reward/std": 0.26293098405003545, | |
| "rewards/qatch_metrics/mean": 0.446629437059164, | |
| "rewards/qatch_metrics/std": 0.39764614775776863, | |
| "rewards/tag_count_reward/mean": 0.967578125, | |
| "rewards/tag_count_reward/std": 0.11440482027828694, | |
| "step": 75 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "clipped_completions_ratio": 0.0, | |
| "epoch": 0.14103129131776113, | |
| "grad_norm": 0.5965131581598656, | |
| "kl": 0.01302490234375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0046, | |
| "max_completion_length": 535.1, | |
| "max_terminated_completion_length": 535.1, | |
| "mean_completion_length": 250.803125, | |
| "mean_terminated_completion_length": 250.803125, | |
| "min_completion_length": 103.9, | |
| "min_terminated_completion_length": 103.9, | |
| "num_tokens": 12474846.0, | |
| "reward": 0.602032545208931, | |
| "reward_std": 0.25395786538720133, | |
| "rewards/format_reward/mean": 0.95078125, | |
| "rewards/format_reward/std": 0.2034299425780773, | |
| "rewards/qatch_metrics/mean": 0.538696362823248, | |
| "rewards/qatch_metrics/std": 0.4360169142484665, | |
| "rewards/tag_count_reward/mean": 0.98125, | |
| "rewards/tag_count_reward/std": 0.08756073787808419, | |
| "step": 80 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "clipped_completions_ratio": 0.003125, | |
| "epoch": 0.1498457470251212, | |
| "grad_norm": 0.5743594639855633, | |
| "kl": 0.015057373046875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0265, | |
| "max_completion_length": 1039.05, | |
| "max_terminated_completion_length": 527.6, | |
| "mean_completion_length": 254.18125, | |
| "mean_terminated_completion_length": 245.13028564453126, | |
| "min_completion_length": 106.7, | |
| "min_terminated_completion_length": 106.7, | |
| "num_tokens": 13288870.0, | |
| "reward": 0.5227034568786622, | |
| "reward_std": 0.23749643117189406, | |
| "rewards/format_reward/mean": 0.95546875, | |
| "rewards/format_reward/std": 0.19548083767294883, | |
| "rewards/qatch_metrics/mean": 0.44466719292104245, | |
| "rewards/qatch_metrics/std": 0.412992362678051, | |
| "rewards/tag_count_reward/mean": 0.9837890625, | |
| "rewards/tag_count_reward/std": 0.08036210816353559, | |
| "step": 85 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "clipped_completions_ratio": 0.0015625, | |
| "epoch": 0.15866020273248127, | |
| "grad_norm": 0.6000853571666552, | |
| "kl": 0.016534423828125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0333, | |
| "max_completion_length": 880.85, | |
| "max_terminated_completion_length": 529.65, | |
| "mean_completion_length": 254.89921875, | |
| "mean_terminated_completion_length": 248.93614959716797, | |
| "min_completion_length": 105.55, | |
| "min_terminated_completion_length": 105.55, | |
| "num_tokens": 14081621.0, | |
| "reward": 0.5267416775226593, | |
| "reward_std": 0.19439699612557887, | |
| "rewards/format_reward/mean": 0.96328125, | |
| "rewards/format_reward/std": 0.1678739033639431, | |
| "rewards/qatch_metrics/mean": 0.4485104277729988, | |
| "rewards/qatch_metrics/std": 0.41193409264087677, | |
| "rewards/tag_count_reward/mean": 0.98359375, | |
| "rewards/tag_count_reward/std": 0.08247525915503502, | |
| "step": 90 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "clipped_completions_ratio": 0.0, | |
| "epoch": 0.16747465843984133, | |
| "grad_norm": 0.6063983354937539, | |
| "kl": 0.018707275390625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0261, | |
| "max_completion_length": 483.2, | |
| "max_terminated_completion_length": 483.2, | |
| "mean_completion_length": 244.6265625, | |
| "mean_terminated_completion_length": 244.6265625, | |
| "min_completion_length": 101.3, | |
| "min_terminated_completion_length": 101.3, | |
| "num_tokens": 14830791.0, | |
| "reward": 0.627327187359333, | |
| "reward_std": 0.22187515757977963, | |
| "rewards/format_reward/mean": 0.96953125, | |
| "rewards/format_reward/std": 0.1480187714099884, | |
| "rewards/qatch_metrics/mean": 0.5659846290946007, | |
| "rewards/qatch_metrics/std": 0.39404729604721067, | |
| "rewards/tag_count_reward/mean": 0.9857421875, | |
| "rewards/tag_count_reward/std": 0.06731439363211393, | |
| "step": 95 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "clipped_completions_ratio": 0.00078125, | |
| "epoch": 0.17628911414720141, | |
| "grad_norm": 0.4711194227560878, | |
| "kl": 0.0207275390625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.018, | |
| "max_completion_length": 716.1, | |
| "max_terminated_completion_length": 533.9, | |
| "mean_completion_length": 291.71640625, | |
| "mean_terminated_completion_length": 288.71960601806643, | |
| "min_completion_length": 114.7, | |
| "min_terminated_completion_length": 114.7, | |
| "num_tokens": 15697436.0, | |
| "reward": 0.5580606862902642, | |
| "reward_std": 0.22751006074249744, | |
| "rewards/format_reward/mean": 0.9609375, | |
| "rewards/format_reward/std": 0.1678817868232727, | |
| "rewards/qatch_metrics/mean": 0.485528651624918, | |
| "rewards/qatch_metrics/std": 0.40959695875644686, | |
| "rewards/tag_count_reward/mean": 0.9853515625, | |
| "rewards/tag_count_reward/std": 0.06472647916525602, | |
| "step": 100 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "clipped_completions_ratio": 0.0, | |
| "epoch": 0.18510356985456147, | |
| "grad_norm": 0.4968226526172856, | |
| "kl": 0.023779296875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0143, | |
| "max_completion_length": 581.7, | |
| "max_terminated_completion_length": 581.7, | |
| "mean_completion_length": 303.9078125, | |
| "mean_terminated_completion_length": 303.9078125, | |
| "min_completion_length": 129.2, | |
| "min_terminated_completion_length": 129.2, | |
| "num_tokens": 16533030.0, | |
| "reward": 0.5476276561617851, | |
| "reward_std": 0.2098201669752598, | |
| "rewards/format_reward/mean": 0.94296875, | |
| "rewards/format_reward/std": 0.2110932193696499, | |
| "rewards/qatch_metrics/mean": 0.47616121284663676, | |
| "rewards/qatch_metrics/std": 0.39544836431741714, | |
| "rewards/tag_count_reward/mean": 0.971875, | |
| "rewards/tag_count_reward/std": 0.10505668371915818, | |
| "step": 105 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "clipped_completions_ratio": 0.0015625, | |
| "epoch": 0.19391802556192156, | |
| "grad_norm": 0.4627573599511681, | |
| "kl": 0.02086181640625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0245, | |
| "max_completion_length": 859.2, | |
| "max_terminated_completion_length": 857.75, | |
| "mean_completion_length": 299.94453125, | |
| "mean_terminated_completion_length": 294.0992431640625, | |
| "min_completion_length": 120.65, | |
| "min_terminated_completion_length": 120.65, | |
| "num_tokens": 17365423.0, | |
| "reward": 0.6622235596179962, | |
| "reward_std": 0.22427483648061752, | |
| "rewards/format_reward/mean": 0.95390625, | |
| "rewards/format_reward/std": 0.1916600726544857, | |
| "rewards/qatch_metrics/mean": 0.6095322921872139, | |
| "rewards/qatch_metrics/std": 0.391602248698473, | |
| "rewards/tag_count_reward/mean": 0.974609375, | |
| "rewards/tag_count_reward/std": 0.1027901167050004, | |
| "step": 110 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "clipped_completions_ratio": 0.00078125, | |
| "epoch": 0.20273248126928162, | |
| "grad_norm": 0.5445734630838104, | |
| "kl": 0.02449951171875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0229, | |
| "max_completion_length": 668.95, | |
| "max_terminated_completion_length": 487.7, | |
| "mean_completion_length": 257.953125, | |
| "mean_terminated_completion_length": 254.92896881103516, | |
| "min_completion_length": 113.05, | |
| "min_terminated_completion_length": 113.05, | |
| "num_tokens": 18177315.0, | |
| "reward": 0.6399502992630005, | |
| "reward_std": 0.21523846834897994, | |
| "rewards/format_reward/mean": 0.97421875, | |
| "rewards/format_reward/std": 0.1374749183654785, | |
| "rewards/qatch_metrics/mean": 0.5802724003791809, | |
| "rewards/qatch_metrics/std": 0.39097184464335444, | |
| "rewards/tag_count_reward/mean": 0.9859375, | |
| "rewards/tag_count_reward/std": 0.06708128694444895, | |
| "step": 115 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "clipped_completions_ratio": 0.00078125, | |
| "epoch": 0.2115469369766417, | |
| "grad_norm": 0.5383517548619853, | |
| "kl": 0.023583984375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0218, | |
| "max_completion_length": 662.9, | |
| "max_terminated_completion_length": 490.35, | |
| "mean_completion_length": 226.99609375, | |
| "mean_terminated_completion_length": 223.9852066040039, | |
| "min_completion_length": 98.45, | |
| "min_terminated_completion_length": 98.45, | |
| "num_tokens": 18970766.0, | |
| "reward": 0.6096899516880512, | |
| "reward_std": 0.1784604934975505, | |
| "rewards/format_reward/mean": 0.9875, | |
| "rewards/format_reward/std": 0.08070731684565544, | |
| "rewards/qatch_metrics/mean": 0.5428682203404606, | |
| "rewards/qatch_metrics/std": 0.3807223953306675, | |
| "rewards/tag_count_reward/mean": 0.9900390625, | |
| "rewards/tag_count_reward/std": 0.05316873826086521, | |
| "step": 120 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "clipped_completions_ratio": 0.0, | |
| "epoch": 0.22036139268400176, | |
| "grad_norm": 0.6190508245172893, | |
| "kl": 0.023504638671875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0034, | |
| "max_completion_length": 443.6, | |
| "max_terminated_completion_length": 443.6, | |
| "mean_completion_length": 217.91875, | |
| "mean_terminated_completion_length": 217.91875, | |
| "min_completion_length": 92.4, | |
| "min_terminated_completion_length": 92.4, | |
| "num_tokens": 19707062.0, | |
| "reward": 0.6478345990180969, | |
| "reward_std": 0.1982966311275959, | |
| "rewards/format_reward/mean": 0.98828125, | |
| "rewards/format_reward/std": 0.07818891182541847, | |
| "rewards/qatch_metrics/mean": 0.5875489667057991, | |
| "rewards/qatch_metrics/std": 0.40511306300759314, | |
| "rewards/tag_count_reward/mean": 0.991796875, | |
| "rewards/tag_count_reward/std": 0.04425293505191803, | |
| "step": 125 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "clipped_completions_ratio": 0.0, | |
| "epoch": 0.22917584839136185, | |
| "grad_norm": 0.6061791997511771, | |
| "kl": 0.02451171875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0092, | |
| "max_completion_length": 454.6, | |
| "max_terminated_completion_length": 454.6, | |
| "mean_completion_length": 216.0890625, | |
| "mean_terminated_completion_length": 216.0890625, | |
| "min_completion_length": 92.95, | |
| "min_terminated_completion_length": 92.95, | |
| "num_tokens": 20424200.0, | |
| "reward": 0.6180633679032326, | |
| "reward_std": 0.20781082864850758, | |
| "rewards/format_reward/mean": 0.98984375, | |
| "rewards/format_reward/std": 0.06632362008094787, | |
| "rewards/qatch_metrics/mean": 0.5522252649068833, | |
| "rewards/qatch_metrics/std": 0.39786413311958313, | |
| "rewards/tag_count_reward/mean": 0.99375, | |
| "rewards/tag_count_reward/std": 0.03493789285421371, | |
| "step": 130 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "clipped_completions_ratio": 0.0015625, | |
| "epoch": 0.2379903040987219, | |
| "grad_norm": 0.6951125365759069, | |
| "kl": 0.02432861328125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0351, | |
| "max_completion_length": 651.5, | |
| "max_terminated_completion_length": 463.9, | |
| "mean_completion_length": 229.290625, | |
| "mean_terminated_completion_length": 226.12052459716796, | |
| "min_completion_length": 97.7, | |
| "min_terminated_completion_length": 97.7, | |
| "num_tokens": 21182988.0, | |
| "reward": 0.6133343994617462, | |
| "reward_std": 0.22021852899342775, | |
| "rewards/format_reward/mean": 0.984375, | |
| "rewards/format_reward/std": 0.10100396648049355, | |
| "rewards/qatch_metrics/mean": 0.5476153731346131, | |
| "rewards/qatch_metrics/std": 0.39921322241425516, | |
| "rewards/tag_count_reward/mean": 0.9884765625, | |
| "rewards/tag_count_reward/std": 0.0593818049877882, | |
| "step": 135 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "clipped_completions_ratio": 0.00078125, | |
| "epoch": 0.24680475980608196, | |
| "grad_norm": 0.5382587655260813, | |
| "kl": 0.0250244140625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0385, | |
| "max_completion_length": 669.15, | |
| "max_terminated_completion_length": 484.2, | |
| "mean_completion_length": 240.70078125, | |
| "mean_terminated_completion_length": 237.66231460571288, | |
| "min_completion_length": 106.3, | |
| "min_terminated_completion_length": 106.3, | |
| "num_tokens": 21958989.0, | |
| "reward": 0.6402581855654716, | |
| "reward_std": 0.23691350370645523, | |
| "rewards/format_reward/mean": 0.98125, | |
| "rewards/format_reward/std": 0.11141463369131088, | |
| "rewards/qatch_metrics/mean": 0.5797614596784115, | |
| "rewards/qatch_metrics/std": 0.4121564343571663, | |
| "rewards/tag_count_reward/mean": 0.98671875, | |
| "rewards/tag_count_reward/std": 0.07204583417624236, | |
| "step": 140 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "clipped_completions_ratio": 0.00078125, | |
| "epoch": 0.255619215513442, | |
| "grad_norm": 0.5648264834394017, | |
| "kl": 0.023236083984375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0306, | |
| "max_completion_length": 669.65, | |
| "max_terminated_completion_length": 490.85, | |
| "mean_completion_length": 244.3, | |
| "mean_terminated_completion_length": 241.30169982910155, | |
| "min_completion_length": 88.45, | |
| "min_terminated_completion_length": 88.45, | |
| "num_tokens": 22715181.0, | |
| "reward": 0.6530531153082848, | |
| "reward_std": 0.17919475596863776, | |
| "rewards/format_reward/mean": 0.9765625, | |
| "rewards/format_reward/std": 0.12738077864050865, | |
| "rewards/qatch_metrics/mean": 0.5954002693295479, | |
| "rewards/qatch_metrics/std": 0.3696783661842346, | |
| "rewards/tag_count_reward/mean": 0.9861328125, | |
| "rewards/tag_count_reward/std": 0.06725587993860245, | |
| "step": 145 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "clipped_completions_ratio": 0.00234375, | |
| "epoch": 0.26443367122080214, | |
| "grad_norm": 0.5988459814152964, | |
| "kl": 0.024114990234375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0259, | |
| "max_completion_length": 1085.25, | |
| "max_terminated_completion_length": 543.2, | |
| "mean_completion_length": 265.02890625, | |
| "mean_terminated_completion_length": 256.03471221923826, | |
| "min_completion_length": 92.45, | |
| "min_terminated_completion_length": 92.45, | |
| "num_tokens": 23532306.0, | |
| "reward": 0.6515118405222893, | |
| "reward_std": 0.20926398932933807, | |
| "rewards/format_reward/mean": 0.97734375, | |
| "rewards/format_reward/std": 0.13359498009085655, | |
| "rewards/qatch_metrics/mean": 0.5935869842767716, | |
| "rewards/qatch_metrics/std": 0.39746817499399184, | |
| "rewards/tag_count_reward/mean": 0.9845703125, | |
| "rewards/tag_count_reward/std": 0.07609451431781053, | |
| "step": 150 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "clipped_completions_ratio": 0.0015625, | |
| "epoch": 0.2732481269281622, | |
| "grad_norm": 0.6981556644172091, | |
| "kl": 0.0252197265625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0164, | |
| "max_completion_length": 873.35, | |
| "max_terminated_completion_length": 518.6, | |
| "mean_completion_length": 264.40546875, | |
| "mean_terminated_completion_length": 258.4989471435547, | |
| "min_completion_length": 104.0, | |
| "min_terminated_completion_length": 104.0, | |
| "num_tokens": 24348921.0, | |
| "reward": 0.6014397010207176, | |
| "reward_std": 0.2217434547841549, | |
| "rewards/format_reward/mean": 0.96640625, | |
| "rewards/format_reward/std": 0.15999660789966583, | |
| "rewards/qatch_metrics/mean": 0.5365627646446228, | |
| "rewards/qatch_metrics/std": 0.38931548669934274, | |
| "rewards/tag_count_reward/mean": 0.9744140625, | |
| "rewards/tag_count_reward/std": 0.10206393301486968, | |
| "step": 155 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "clipped_completions_ratio": 0.0, | |
| "epoch": 0.28206258263552225, | |
| "grad_norm": 0.5347628975159343, | |
| "kl": 0.0234619140625, | |
| "learning_rate": 1e-06, | |
| "loss": -0.0042, | |
| "max_completion_length": 503.3, | |
| "max_terminated_completion_length": 503.3, | |
| "mean_completion_length": 259.58828125, | |
| "mean_terminated_completion_length": 259.58828125, | |
| "min_completion_length": 109.4, | |
| "min_terminated_completion_length": 109.4, | |
| "num_tokens": 25159306.0, | |
| "reward": 0.6513503938913345, | |
| "reward_std": 0.1974081691354513, | |
| "rewards/format_reward/mean": 0.98046875, | |
| "rewards/format_reward/std": 0.12139622867107391, | |
| "rewards/qatch_metrics/mean": 0.5932362079620361, | |
| "rewards/qatch_metrics/std": 0.37430901676416395, | |
| "rewards/tag_count_reward/mean": 0.9810546875, | |
| "rewards/tag_count_reward/std": 0.08590763248503208, | |
| "step": 160 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "clipped_completions_ratio": 0.00078125, | |
| "epoch": 0.2908770383428823, | |
| "grad_norm": 0.5447821275980702, | |
| "kl": 0.02672119140625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0203, | |
| "max_completion_length": 666.15, | |
| "max_terminated_completion_length": 491.35, | |
| "mean_completion_length": 269.428125, | |
| "mean_terminated_completion_length": 266.4439239501953, | |
| "min_completion_length": 111.55, | |
| "min_terminated_completion_length": 111.55, | |
| "num_tokens": 25950894.0, | |
| "reward": 0.684382463991642, | |
| "reward_std": 0.21234923861920835, | |
| "rewards/format_reward/mean": 0.97109375, | |
| "rewards/format_reward/std": 0.12414052337408066, | |
| "rewards/qatch_metrics/mean": 0.6333382874727249, | |
| "rewards/qatch_metrics/std": 0.40051559060812, | |
| "rewards/tag_count_reward/mean": 0.9787109375, | |
| "rewards/tag_count_reward/std": 0.09124095235019922, | |
| "step": 165 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "clipped_completions_ratio": 0.0015625, | |
| "epoch": 0.2996914940502424, | |
| "grad_norm": 0.5525215823918918, | |
| "kl": 0.025628662109375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0261, | |
| "max_completion_length": 894.35, | |
| "max_terminated_completion_length": 531.0, | |
| "mean_completion_length": 275.015625, | |
| "mean_terminated_completion_length": 269.0089302062988, | |
| "min_completion_length": 113.2, | |
| "min_terminated_completion_length": 113.2, | |
| "num_tokens": 26805554.0, | |
| "reward": 0.6457803517580032, | |
| "reward_std": 0.2149766854941845, | |
| "rewards/format_reward/mean": 0.96328125, | |
| "rewards/format_reward/std": 0.16744527816772461, | |
| "rewards/qatch_metrics/mean": 0.5888776123523712, | |
| "rewards/qatch_metrics/std": 0.38463765680789946, | |
| "rewards/tag_count_reward/mean": 0.978125, | |
| "rewards/tag_count_reward/std": 0.09678596928715706, | |
| "step": 170 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "clipped_completions_ratio": 0.0015625, | |
| "epoch": 0.3085059497576025, | |
| "grad_norm": 0.5616464568383206, | |
| "kl": 0.0274169921875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0308, | |
| "max_completion_length": 682.7, | |
| "max_terminated_completion_length": 488.6, | |
| "mean_completion_length": 251.5890625, | |
| "mean_terminated_completion_length": 248.31122436523438, | |
| "min_completion_length": 109.7, | |
| "min_terminated_completion_length": 109.7, | |
| "num_tokens": 27594084.0, | |
| "reward": 0.6236591964960099, | |
| "reward_std": 0.18435912374407054, | |
| "rewards/format_reward/mean": 0.98125, | |
| "rewards/format_reward/std": 0.10607657507061959, | |
| "rewards/qatch_metrics/mean": 0.5601757816970349, | |
| "rewards/qatch_metrics/std": 0.39382868334650994, | |
| "rewards/tag_count_reward/mean": 0.9876953125, | |
| "rewards/tag_count_reward/std": 0.06476121675223112, | |
| "step": 175 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "clipped_completions_ratio": 0.0, | |
| "epoch": 0.31732040546496254, | |
| "grad_norm": 0.552375525389694, | |
| "kl": 0.02796630859375, | |
| "learning_rate": 1e-06, | |
| "loss": -0.0025, | |
| "max_completion_length": 511.4, | |
| "max_terminated_completion_length": 511.4, | |
| "mean_completion_length": 241.58046875, | |
| "mean_terminated_completion_length": 241.58046875, | |
| "min_completion_length": 102.85, | |
| "min_terminated_completion_length": 102.85, | |
| "num_tokens": 28374923.0, | |
| "reward": 0.6719952240586281, | |
| "reward_std": 0.19265095554292203, | |
| "rewards/format_reward/mean": 0.98203125, | |
| "rewards/format_reward/std": 0.10228126645088195, | |
| "rewards/qatch_metrics/mean": 0.6168349057435989, | |
| "rewards/qatch_metrics/std": 0.38533141911029817, | |
| "rewards/tag_count_reward/mean": 0.9896484375, | |
| "rewards/tag_count_reward/std": 0.06360597647726536, | |
| "step": 180 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "clipped_completions_ratio": 0.00234375, | |
| "epoch": 0.3261348611723226, | |
| "grad_norm": 0.5920475807376582, | |
| "kl": 0.02803955078125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0524, | |
| "max_completion_length": 1080.8, | |
| "max_terminated_completion_length": 536.85, | |
| "mean_completion_length": 257.246875, | |
| "mean_terminated_completion_length": 248.25642623901368, | |
| "min_completion_length": 100.1, | |
| "min_terminated_completion_length": 100.1, | |
| "num_tokens": 29166343.0, | |
| "reward": 0.6265309870243072, | |
| "reward_std": 0.1748633362352848, | |
| "rewards/format_reward/mean": 0.98203125, | |
| "rewards/format_reward/std": 0.09919186681509018, | |
| "rewards/qatch_metrics/mean": 0.5634854272007942, | |
| "rewards/qatch_metrics/std": 0.38963339626789095, | |
| "rewards/tag_count_reward/mean": 0.9873046875, | |
| "rewards/tag_count_reward/std": 0.05825631488114595, | |
| "step": 185 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "clipped_completions_ratio": 0.0015625, | |
| "epoch": 0.33494931687968266, | |
| "grad_norm": 0.4498383537736612, | |
| "kl": 0.0280517578125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0178, | |
| "max_completion_length": 646.2, | |
| "max_terminated_completion_length": 464.2, | |
| "mean_completion_length": 236.459375, | |
| "mean_terminated_completion_length": 230.44894104003907, | |
| "min_completion_length": 103.75, | |
| "min_terminated_completion_length": 103.75, | |
| "num_tokens": 29917475.0, | |
| "reward": 0.6826104655861854, | |
| "reward_std": 0.15590712875127793, | |
| "rewards/format_reward/mean": 0.98046875, | |
| "rewards/format_reward/std": 0.10486338511109353, | |
| "rewards/qatch_metrics/mean": 0.6296221412718296, | |
| "rewards/qatch_metrics/std": 0.3715696230530739, | |
| "rewards/tag_count_reward/mean": 0.9876953125, | |
| "rewards/tag_count_reward/std": 0.05578752104192972, | |
| "step": 190 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "clipped_completions_ratio": 0.0015625, | |
| "epoch": 0.34376377258704277, | |
| "grad_norm": 0.5792251614999063, | |
| "kl": 0.03179931640625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0323, | |
| "max_completion_length": 655.9, | |
| "max_terminated_completion_length": 471.25, | |
| "mean_completion_length": 232.86328125, | |
| "mean_terminated_completion_length": 229.77679901123048, | |
| "min_completion_length": 97.25, | |
| "min_terminated_completion_length": 97.25, | |
| "num_tokens": 30667060.0, | |
| "reward": 0.6336304128170014, | |
| "reward_std": 0.19524292927235365, | |
| "rewards/format_reward/mean": 0.98046875, | |
| "rewards/format_reward/std": 0.12639724016189574, | |
| "rewards/qatch_metrics/mean": 0.5721593797206879, | |
| "rewards/qatch_metrics/std": 0.3995146706700325, | |
| "rewards/tag_count_reward/mean": 0.9849609375, | |
| "rewards/tag_count_reward/std": 0.07332022842019796, | |
| "step": 195 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "clipped_completions_ratio": 0.00078125, | |
| "epoch": 0.35257822829440283, | |
| "grad_norm": 0.4997911084130837, | |
| "kl": 0.026470947265625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.009, | |
| "max_completion_length": 668.7, | |
| "max_terminated_completion_length": 488.95, | |
| "mean_completion_length": 263.603125, | |
| "mean_terminated_completion_length": 260.62204895019534, | |
| "min_completion_length": 113.3, | |
| "min_terminated_completion_length": 113.3, | |
| "num_tokens": 31483624.0, | |
| "reward": 0.6400622457265854, | |
| "reward_std": 0.1867681361734867, | |
| "rewards/format_reward/mean": 0.97421875, | |
| "rewards/format_reward/std": 0.1339642383158207, | |
| "rewards/qatch_metrics/mean": 0.5806224085390568, | |
| "rewards/qatch_metrics/std": 0.38299966901540755, | |
| "rewards/tag_count_reward/mean": 0.9822265625, | |
| "rewards/tag_count_reward/std": 0.08076513186097145, | |
| "step": 200 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "clipped_completions_ratio": 0.00078125, | |
| "epoch": 0.3613926840017629, | |
| "grad_norm": 0.4955421684375313, | |
| "kl": 0.028253173828125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0173, | |
| "max_completion_length": 719.35, | |
| "max_terminated_completion_length": 546.1, | |
| "mean_completion_length": 294.48984375, | |
| "mean_terminated_completion_length": 291.5403030395508, | |
| "min_completion_length": 120.0, | |
| "min_terminated_completion_length": 120.0, | |
| "num_tokens": 32345643.0, | |
| "reward": 0.6172579132020474, | |
| "reward_std": 0.1661355197429657, | |
| "rewards/format_reward/mean": 0.95703125, | |
| "rewards/format_reward/std": 0.1844623327255249, | |
| "rewards/qatch_metrics/mean": 0.5562294371426105, | |
| "rewards/qatch_metrics/std": 0.3714133970439434, | |
| "rewards/tag_count_reward/mean": 0.9751953125, | |
| "rewards/tag_count_reward/std": 0.1012207405641675, | |
| "step": 205 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "clipped_completions_ratio": 0.00078125, | |
| "epoch": 0.37020713970912295, | |
| "grad_norm": 0.4467886577824779, | |
| "kl": 0.029364013671875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.013, | |
| "max_completion_length": 714.95, | |
| "max_terminated_completion_length": 533.9, | |
| "mean_completion_length": 305.26953125, | |
| "mean_terminated_completion_length": 302.318701171875, | |
| "min_completion_length": 134.4, | |
| "min_terminated_completion_length": 134.4, | |
| "num_tokens": 33173172.0, | |
| "reward": 0.71367447078228, | |
| "reward_std": 0.2129422415047884, | |
| "rewards/format_reward/mean": 0.95625, | |
| "rewards/format_reward/std": 0.1810019753873348, | |
| "rewards/qatch_metrics/mean": 0.6697640687227249, | |
| "rewards/qatch_metrics/std": 0.3772424139082432, | |
| "rewards/tag_count_reward/mean": 0.975, | |
| "rewards/tag_count_reward/std": 0.09872399028390647, | |
| "step": 210 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "clipped_completions_ratio": 0.00234375, | |
| "epoch": 0.37902159541648306, | |
| "grad_norm": 0.5441103528864044, | |
| "kl": 0.030072021484375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.031, | |
| "max_completion_length": 884.15, | |
| "max_terminated_completion_length": 532.45, | |
| "mean_completion_length": 313.69296875, | |
| "mean_terminated_completion_length": 304.84855499267576, | |
| "min_completion_length": 132.25, | |
| "min_terminated_completion_length": 132.25, | |
| "num_tokens": 34052699.0, | |
| "reward": 0.6677688866853714, | |
| "reward_std": 0.20724300742149354, | |
| "rewards/format_reward/mean": 0.96328125, | |
| "rewards/format_reward/std": 0.1826186627149582, | |
| "rewards/qatch_metrics/mean": 0.6148039117455483, | |
| "rewards/qatch_metrics/std": 0.40869270265102386, | |
| "rewards/tag_count_reward/mean": 0.9771484375, | |
| "rewards/tag_count_reward/std": 0.09587243013083935, | |
| "step": 215 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "clipped_completions_ratio": 0.0, | |
| "epoch": 0.3878360511238431, | |
| "grad_norm": 0.5522659991373123, | |
| "kl": 0.032330322265625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0151, | |
| "max_completion_length": 533.0, | |
| "max_terminated_completion_length": 533.0, | |
| "mean_completion_length": 288.96640625, | |
| "mean_terminated_completion_length": 288.96640625, | |
| "min_completion_length": 124.65, | |
| "min_terminated_completion_length": 124.65, | |
| "num_tokens": 34865520.0, | |
| "reward": 0.6856773257255554, | |
| "reward_std": 0.18784409649670125, | |
| "rewards/format_reward/mean": 0.97578125, | |
| "rewards/format_reward/std": 0.12870651334524155, | |
| "rewards/qatch_metrics/mean": 0.6343101695179939, | |
| "rewards/qatch_metrics/std": 0.3894802153110504, | |
| "rewards/tag_count_reward/mean": 0.9787109375, | |
| "rewards/tag_count_reward/std": 0.09142168313264847, | |
| "step": 220 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "clipped_completions_ratio": 0.0, | |
| "epoch": 0.3966505068312032, | |
| "grad_norm": 0.6080467814766749, | |
| "kl": 0.03060302734375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0094, | |
| "max_completion_length": 698.35, | |
| "max_terminated_completion_length": 698.35, | |
| "mean_completion_length": 268.003125, | |
| "mean_terminated_completion_length": 268.003125, | |
| "min_completion_length": 100.55, | |
| "min_terminated_completion_length": 100.55, | |
| "num_tokens": 35709076.0, | |
| "reward": 0.6644678235054016, | |
| "reward_std": 0.1630731988698244, | |
| "rewards/format_reward/mean": 0.98203125, | |
| "rewards/format_reward/std": 0.10826152041554452, | |
| "rewards/qatch_metrics/mean": 0.6083812549710274, | |
| "rewards/qatch_metrics/std": 0.3846415340900421, | |
| "rewards/tag_count_reward/mean": 0.9828125, | |
| "rewards/tag_count_reward/std": 0.0803611170500517, | |
| "step": 225 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "clipped_completions_ratio": 0.0, | |
| "epoch": 0.40546496253856323, | |
| "grad_norm": 0.49345893126772594, | |
| "kl": 0.032147216796875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0106, | |
| "max_completion_length": 615.4, | |
| "max_terminated_completion_length": 615.4, | |
| "mean_completion_length": 263.775, | |
| "mean_terminated_completion_length": 263.775, | |
| "min_completion_length": 117.05, | |
| "min_terminated_completion_length": 117.05, | |
| "num_tokens": 36508644.0, | |
| "reward": 0.6932666331529618, | |
| "reward_std": 0.19706026688218117, | |
| "rewards/format_reward/mean": 0.97734375, | |
| "rewards/format_reward/std": 0.1236697033047676, | |
| "rewards/qatch_metrics/mean": 0.6426643326878547, | |
| "rewards/qatch_metrics/std": 0.39514810144901275, | |
| "rewards/tag_count_reward/mean": 0.9853515625, | |
| "rewards/tag_count_reward/std": 0.08155724368989467, | |
| "step": 230 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "clipped_completions_ratio": 0.00078125, | |
| "epoch": 0.4142794182459233, | |
| "grad_norm": 0.5469748664725941, | |
| "kl": 0.03118896484375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0263, | |
| "max_completion_length": 680.25, | |
| "max_terminated_completion_length": 496.1, | |
| "mean_completion_length": 261.68984375, | |
| "mean_terminated_completion_length": 258.70228271484376, | |
| "min_completion_length": 111.65, | |
| "min_terminated_completion_length": 111.65, | |
| "num_tokens": 37294087.0, | |
| "reward": 0.6919339522719383, | |
| "reward_std": 0.16458683405071498, | |
| "rewards/format_reward/mean": 0.975, | |
| "rewards/format_reward/std": 0.13364771082997323, | |
| "rewards/qatch_metrics/mean": 0.6413492292165757, | |
| "rewards/qatch_metrics/std": 0.3512177594006062, | |
| "rewards/tag_count_reward/mean": 0.9857421875, | |
| "rewards/tag_count_reward/std": 0.07062838673591613, | |
| "step": 235 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "clipped_completions_ratio": 0.0, | |
| "epoch": 0.4230938739532834, | |
| "grad_norm": 0.5424181043213094, | |
| "kl": 0.032281494140625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0249, | |
| "max_completion_length": 497.95, | |
| "max_terminated_completion_length": 497.95, | |
| "mean_completion_length": 265.54453125, | |
| "mean_terminated_completion_length": 265.54453125, | |
| "min_completion_length": 115.25, | |
| "min_terminated_completion_length": 115.25, | |
| "num_tokens": 792761.0, | |
| "reward": 0.6116568207740783, | |
| "reward_std": 0.1871432088315487, | |
| "rewards/format_reward/mean": 0.98046875, | |
| "rewards/format_reward/std": 0.11176861301064492, | |
| "rewards/qatch_metrics/mean": 0.5462046906352043, | |
| "rewards/qatch_metrics/std": 0.39505585879087446, | |
| "rewards/tag_count_reward/mean": 0.98671875, | |
| "rewards/tag_count_reward/std": 0.07090304121375084, | |
| "step": 240 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "clipped_completions_ratio": 0.0, | |
| "epoch": 0.43190832966064346, | |
| "grad_norm": 0.5103775445408127, | |
| "kl": 0.03380126953125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0064, | |
| "max_completion_length": 495.3, | |
| "max_terminated_completion_length": 495.3, | |
| "mean_completion_length": 257.19296875, | |
| "mean_terminated_completion_length": 257.19296875, | |
| "min_completion_length": 105.9, | |
| "min_terminated_completion_length": 105.9, | |
| "num_tokens": 1582672.0, | |
| "reward": 0.6935325592756272, | |
| "reward_std": 0.15109073698986322, | |
| "rewards/format_reward/mean": 0.9890625, | |
| "rewards/format_reward/std": 0.07193891182541848, | |
| "rewards/qatch_metrics/mean": 0.6413802206516266, | |
| "rewards/qatch_metrics/std": 0.3630501888692379, | |
| "rewards/tag_count_reward/mean": 0.9890625, | |
| "rewards/tag_count_reward/std": 0.058686737157404426, | |
| "step": 245 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "clipped_completions_ratio": 0.00234375, | |
| "epoch": 0.4407227853680035, | |
| "grad_norm": 0.5482628632967557, | |
| "kl": 0.033172607421875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0406, | |
| "max_completion_length": 847.65, | |
| "max_terminated_completion_length": 495.4, | |
| "mean_completion_length": 254.41015625, | |
| "mean_terminated_completion_length": 245.46627502441407, | |
| "min_completion_length": 105.9, | |
| "min_terminated_completion_length": 105.9, | |
| "num_tokens": 2383645.0, | |
| "reward": 0.6978308916091919, | |
| "reward_std": 0.18419512659311293, | |
| "rewards/format_reward/mean": 0.99375, | |
| "rewards/format_reward/std": 0.04190210178494454, | |
| "rewards/qatch_metrics/mean": 0.6456213593482971, | |
| "rewards/qatch_metrics/std": 0.37477899715304375, | |
| "rewards/tag_count_reward/mean": 0.9935546875, | |
| "rewards/tag_count_reward/std": 0.0417668029665947, | |
| "step": 250 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "clipped_completions_ratio": 0.0, | |
| "epoch": 0.4495372410753636, | |
| "grad_norm": 0.5060506817042096, | |
| "kl": 0.03634033203125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0038, | |
| "max_completion_length": 568.35, | |
| "max_terminated_completion_length": 568.35, | |
| "mean_completion_length": 233.55625, | |
| "mean_terminated_completion_length": 233.55625, | |
| "min_completion_length": 98.05, | |
| "min_terminated_completion_length": 98.05, | |
| "num_tokens": 3177781.0, | |
| "reward": 0.6797079920768738, | |
| "reward_std": 0.16788024138659238, | |
| "rewards/format_reward/mean": 0.99140625, | |
| "rewards/format_reward/std": 0.05755521506071091, | |
| "rewards/qatch_metrics/mean": 0.6245875038206578, | |
| "rewards/qatch_metrics/std": 0.39114040434360503, | |
| "rewards/tag_count_reward/mean": 0.993359375, | |
| "rewards/tag_count_reward/std": 0.04090307988226414, | |
| "step": 255 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "clipped_completions_ratio": 0.0015625, | |
| "epoch": 0.4583516967827237, | |
| "grad_norm": 0.5565315128524225, | |
| "kl": 0.03184814453125, | |
| "learning_rate": 1e-06, | |
| "loss": -0.0016, | |
| "max_completion_length": 646.05, | |
| "max_terminated_completion_length": 467.4, | |
| "mean_completion_length": 248.06875, | |
| "mean_terminated_completion_length": 244.995263671875, | |
| "min_completion_length": 104.9, | |
| "min_terminated_completion_length": 104.9, | |
| "num_tokens": 3942269.0, | |
| "reward": 0.6090764939785004, | |
| "reward_std": 0.17434044806286692, | |
| "rewards/format_reward/mean": 0.9859375, | |
| "rewards/format_reward/std": 0.09320731684565545, | |
| "rewards/qatch_metrics/mean": 0.542192454636097, | |
| "rewards/qatch_metrics/std": 0.40765938013792036, | |
| "rewards/tag_count_reward/mean": 0.9923828125, | |
| "rewards/tag_count_reward/std": 0.045861832052469256, | |
| "step": 260 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "clipped_completions_ratio": 0.00234375, | |
| "epoch": 0.46716615249008375, | |
| "grad_norm": 0.5159648538503434, | |
| "kl": 0.031085205078125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0524, | |
| "max_completion_length": 1012.05, | |
| "max_terminated_completion_length": 476.0, | |
| "mean_completion_length": 255.10625, | |
| "mean_terminated_completion_length": 246.1957977294922, | |
| "min_completion_length": 108.15, | |
| "min_terminated_completion_length": 108.15, | |
| "num_tokens": 4732997.0, | |
| "reward": 0.7309203892946243, | |
| "reward_std": 0.16675787828862668, | |
| "rewards/format_reward/mean": 0.97734375, | |
| "rewards/format_reward/std": 0.12057281658053398, | |
| "rewards/qatch_metrics/mean": 0.6867330849170685, | |
| "rewards/qatch_metrics/std": 0.3602643422782421, | |
| "rewards/tag_count_reward/mean": 0.9892578125, | |
| "rewards/tag_count_reward/std": 0.06107194591313601, | |
| "step": 265 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "clipped_completions_ratio": 0.0, | |
| "epoch": 0.4759806081974438, | |
| "grad_norm": 0.5501400394102324, | |
| "kl": 0.0342529296875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0197, | |
| "max_completion_length": 533.7, | |
| "max_terminated_completion_length": 533.7, | |
| "mean_completion_length": 267.54375, | |
| "mean_terminated_completion_length": 267.54375, | |
| "min_completion_length": 105.2, | |
| "min_terminated_completion_length": 105.2, | |
| "num_tokens": 5542733.0, | |
| "reward": 0.6369156464934349, | |
| "reward_std": 0.19418321922421455, | |
| "rewards/format_reward/mean": 0.96484375, | |
| "rewards/format_reward/std": 0.1736892782151699, | |
| "rewards/qatch_metrics/mean": 0.5780234441161156, | |
| "rewards/qatch_metrics/std": 0.36038266196846963, | |
| "rewards/tag_count_reward/mean": 0.9822265625, | |
| "rewards/tag_count_reward/std": 0.08473732396960258, | |
| "step": 270 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "clipped_completions_ratio": 0.00078125, | |
| "epoch": 0.48479506390480387, | |
| "grad_norm": 0.5451493093840466, | |
| "kl": 0.030584716796875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0236, | |
| "max_completion_length": 680.6, | |
| "max_terminated_completion_length": 507.15, | |
| "mean_completion_length": 260.48515625, | |
| "mean_terminated_completion_length": 257.5203262329102, | |
| "min_completion_length": 106.3, | |
| "min_terminated_completion_length": 106.3, | |
| "num_tokens": 6370458.0, | |
| "reward": 0.717712578177452, | |
| "reward_std": 0.19914851561188698, | |
| "rewards/format_reward/mean": 0.98359375, | |
| "rewards/format_reward/std": 0.10886043012142181, | |
| "rewards/qatch_metrics/mean": 0.6704362109303474, | |
| "rewards/qatch_metrics/std": 0.3611669532954693, | |
| "rewards/tag_count_reward/mean": 0.9896484375, | |
| "rewards/tag_count_reward/std": 0.05439381040632725, | |
| "step": 275 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "clipped_completions_ratio": 0.0, | |
| "epoch": 0.4936095196121639, | |
| "grad_norm": 0.4560522242084336, | |
| "kl": 0.029486083984375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0116, | |
| "max_completion_length": 512.35, | |
| "max_terminated_completion_length": 512.35, | |
| "mean_completion_length": 243.9984375, | |
| "mean_terminated_completion_length": 243.9984375, | |
| "min_completion_length": 100.9, | |
| "min_terminated_completion_length": 100.9, | |
| "num_tokens": 7158936.0, | |
| "reward": 0.6506539478898048, | |
| "reward_std": 0.17697120299562813, | |
| "rewards/format_reward/mean": 0.9921875, | |
| "rewards/format_reward/std": 0.0625, | |
| "rewards/qatch_metrics/mean": 0.5901994869112969, | |
| "rewards/qatch_metrics/std": 0.3878361865878105, | |
| "rewards/tag_count_reward/mean": 0.9953125, | |
| "rewards/tag_count_reward/std": 0.02853791303932667, | |
| "step": 280 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "clipped_completions_ratio": 0.0015625, | |
| "epoch": 0.502423975319524, | |
| "grad_norm": 0.55666963865944, | |
| "kl": 0.031878662109375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0491, | |
| "max_completion_length": 686.95, | |
| "max_terminated_completion_length": 520.7, | |
| "mean_completion_length": 249.55546875, | |
| "mean_terminated_completion_length": 243.56701049804687, | |
| "min_completion_length": 104.1, | |
| "min_terminated_completion_length": 104.1, | |
| "num_tokens": 7971311.0, | |
| "reward": 0.6463295266032218, | |
| "reward_std": 0.1542746689170599, | |
| "rewards/format_reward/mean": 0.98828125, | |
| "rewards/format_reward/std": 0.0722928911447525, | |
| "rewards/qatch_metrics/mean": 0.5858127683401108, | |
| "rewards/qatch_metrics/std": 0.35884510800242425, | |
| "rewards/tag_count_reward/mean": 0.9912109375, | |
| "rewards/tag_count_reward/std": 0.04745456837117672, | |
| "step": 285 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "clipped_completions_ratio": 0.00390625, | |
| "epoch": 0.511238431026884, | |
| "grad_norm": 0.5772287574660636, | |
| "kl": 0.033306884765625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0213, | |
| "max_completion_length": 647.25, | |
| "max_terminated_completion_length": 468.65, | |
| "mean_completion_length": 243.05390625, | |
| "mean_terminated_completion_length": 227.93539428710938, | |
| "min_completion_length": 103.45, | |
| "min_terminated_completion_length": 103.45, | |
| "num_tokens": 8756964.0, | |
| "reward": 0.6426644682884216, | |
| "reward_std": 0.1361727599054575, | |
| "rewards/format_reward/mean": 0.97578125, | |
| "rewards/format_reward/std": 0.13138211965560914, | |
| "rewards/qatch_metrics/mean": 0.5831093832850456, | |
| "rewards/qatch_metrics/std": 0.381498122215271, | |
| "rewards/tag_count_reward/mean": 0.9888671875, | |
| "rewards/tag_count_reward/std": 0.06255398578941822, | |
| "step": 290 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "clipped_completions_ratio": 0.0, | |
| "epoch": 0.5200528867342442, | |
| "grad_norm": 0.5816908579360218, | |
| "kl": 0.03211669921875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.007, | |
| "max_completion_length": 531.75, | |
| "max_terminated_completion_length": 531.75, | |
| "mean_completion_length": 236.53359375, | |
| "mean_terminated_completion_length": 236.53359375, | |
| "min_completion_length": 101.0, | |
| "min_terminated_completion_length": 101.0, | |
| "num_tokens": 9522543.0, | |
| "reward": 0.6657721042633057, | |
| "reward_std": 0.17067355960607528, | |
| "rewards/format_reward/mean": 0.96953125, | |
| "rewards/format_reward/std": 0.14817975088953972, | |
| "rewards/qatch_metrics/mean": 0.6114552110433579, | |
| "rewards/qatch_metrics/std": 0.370364161580801, | |
| "rewards/tag_count_reward/mean": 0.981640625, | |
| "rewards/tag_count_reward/std": 0.08591084536164999, | |
| "step": 295 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "clipped_completions_ratio": 0.0, | |
| "epoch": 0.5288673424416043, | |
| "grad_norm": 0.519511253922303, | |
| "kl": 0.03341064453125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.016, | |
| "max_completion_length": 524.15, | |
| "max_terminated_completion_length": 524.15, | |
| "mean_completion_length": 255.33984375, | |
| "mean_terminated_completion_length": 255.33984375, | |
| "min_completion_length": 103.0, | |
| "min_terminated_completion_length": 103.0, | |
| "num_tokens": 10337346.0, | |
| "reward": 0.6941297054290771, | |
| "reward_std": 0.16702273711562157, | |
| "rewards/format_reward/mean": 0.9765625, | |
| "rewards/format_reward/std": 0.13208412900567054, | |
| "rewards/qatch_metrics/mean": 0.6437830820679664, | |
| "rewards/qatch_metrics/std": 0.34450062923133373, | |
| "rewards/tag_count_reward/mean": 0.98515625, | |
| "rewards/tag_count_reward/std": 0.07277002464979887, | |
| "step": 300 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "clipped_completions_ratio": 0.0015625, | |
| "epoch": 0.5376817981489643, | |
| "grad_norm": 0.5293306841918778, | |
| "kl": 0.032666015625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0315, | |
| "max_completion_length": 892.3, | |
| "max_terminated_completion_length": 528.0, | |
| "mean_completion_length": 264.1609375, | |
| "mean_terminated_completion_length": 258.1387908935547, | |
| "min_completion_length": 103.95, | |
| "min_terminated_completion_length": 103.95, | |
| "num_tokens": 11150320.0, | |
| "reward": 0.716396963596344, | |
| "reward_std": 0.15938269887119533, | |
| "rewards/format_reward/mean": 0.98046875, | |
| "rewards/format_reward/std": 0.12139622867107391, | |
| "rewards/qatch_metrics/mean": 0.6693250104784966, | |
| "rewards/qatch_metrics/std": 0.3478319551795721, | |
| "rewards/tag_count_reward/mean": 0.9884765625, | |
| "rewards/tag_count_reward/std": 0.06362733300775289, | |
| "step": 305 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "clipped_completions_ratio": 0.0015625, | |
| "epoch": 0.5464962538563244, | |
| "grad_norm": 0.5963776750067843, | |
| "kl": 0.034674072265625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0422, | |
| "max_completion_length": 669.5, | |
| "max_terminated_completion_length": 485.65, | |
| "mean_completion_length": 260.6171875, | |
| "mean_terminated_completion_length": 257.4505470275879, | |
| "min_completion_length": 103.8, | |
| "min_terminated_completion_length": 103.8, | |
| "num_tokens": 11938982.0, | |
| "reward": 0.6859385922551156, | |
| "reward_std": 0.1521838934160769, | |
| "rewards/format_reward/mean": 0.9796875, | |
| "rewards/format_reward/std": 0.11145043224096299, | |
| "rewards/qatch_metrics/mean": 0.6338018253445625, | |
| "rewards/qatch_metrics/std": 0.3684115245938301, | |
| "rewards/tag_count_reward/mean": 0.984765625, | |
| "rewards/tag_count_reward/std": 0.07420742474496364, | |
| "step": 310 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "clipped_completions_ratio": 0.0, | |
| "epoch": 0.5553107095636844, | |
| "grad_norm": 0.4748976849004984, | |
| "kl": 0.030694580078125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0114, | |
| "max_completion_length": 537.15, | |
| "max_terminated_completion_length": 537.15, | |
| "mean_completion_length": 273.32578125, | |
| "mean_terminated_completion_length": 273.32578125, | |
| "min_completion_length": 113.35, | |
| "min_terminated_completion_length": 113.35, | |
| "num_tokens": 12774551.0, | |
| "reward": 0.7473421692848206, | |
| "reward_std": 0.1348694651562255, | |
| "rewards/format_reward/mean": 0.9828125, | |
| "rewards/format_reward/std": 0.11074412688612938, | |
| "rewards/qatch_metrics/mean": 0.705294543504715, | |
| "rewards/qatch_metrics/std": 0.33758219704031944, | |
| "rewards/tag_count_reward/mean": 0.9912109375, | |
| "rewards/tag_count_reward/std": 0.05070688333362341, | |
| "step": 315 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "clipped_completions_ratio": 0.0, | |
| "epoch": 0.5641251652710445, | |
| "grad_norm": 0.49823406293567807, | |
| "kl": 0.033697509765625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0125, | |
| "max_completion_length": 527.75, | |
| "max_terminated_completion_length": 527.75, | |
| "mean_completion_length": 260.1609375, | |
| "mean_terminated_completion_length": 260.1609375, | |
| "min_completion_length": 104.05, | |
| "min_terminated_completion_length": 104.05, | |
| "num_tokens": 13567845.0, | |
| "reward": 0.7109958961606025, | |
| "reward_std": 0.1617593862116337, | |
| "rewards/format_reward/mean": 0.978125, | |
| "rewards/format_reward/std": 0.12988390475511552, | |
| "rewards/qatch_metrics/mean": 0.6630856856703758, | |
| "rewards/qatch_metrics/std": 0.3610327780246735, | |
| "rewards/tag_count_reward/mean": 0.9912109375, | |
| "rewards/tag_count_reward/std": 0.05200497191399336, | |
| "step": 320 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "clipped_completions_ratio": 0.00078125, | |
| "epoch": 0.5729396209784046, | |
| "grad_norm": 0.4605948208202922, | |
| "kl": 0.033721923828125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0166, | |
| "max_completion_length": 684.95, | |
| "max_terminated_completion_length": 502.15, | |
| "mean_completion_length": 247.090625, | |
| "mean_terminated_completion_length": 244.0944076538086, | |
| "min_completion_length": 96.65, | |
| "min_terminated_completion_length": 96.65, | |
| "num_tokens": 14325321.0, | |
| "reward": 0.7332586348056793, | |
| "reward_std": 0.17256649993360043, | |
| "rewards/format_reward/mean": 0.98203125, | |
| "rewards/format_reward/std": 0.10236549973487855, | |
| "rewards/qatch_metrics/mean": 0.6886682316660881, | |
| "rewards/qatch_metrics/std": 0.35823953002691267, | |
| "rewards/tag_count_reward/mean": 0.99375, | |
| "rewards/tag_count_reward/std": 0.040812858007848264, | |
| "step": 325 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "clipped_completions_ratio": 0.0015625, | |
| "epoch": 0.5817540766857646, | |
| "grad_norm": 0.5300020018181278, | |
| "kl": 0.03382568359375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0177, | |
| "max_completion_length": 867.25, | |
| "max_terminated_completion_length": 520.55, | |
| "mean_completion_length": 254.89765625, | |
| "mean_terminated_completion_length": 248.95018768310547, | |
| "min_completion_length": 102.75, | |
| "min_terminated_completion_length": 102.75, | |
| "num_tokens": 15125494.0, | |
| "reward": 0.7057395145297051, | |
| "reward_std": 0.1705713152885437, | |
| "rewards/format_reward/mean": 0.98984375, | |
| "rewards/format_reward/std": 0.07378681004047394, | |
| "rewards/qatch_metrics/mean": 0.6552817761898041, | |
| "rewards/qatch_metrics/std": 0.3287548400461674, | |
| "rewards/tag_count_reward/mean": 0.9953125, | |
| "rewards/tag_count_reward/std": 0.028935904055833815, | |
| "step": 330 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "clipped_completions_ratio": 0.0, | |
| "epoch": 0.5905685323931247, | |
| "grad_norm": 0.46021572550392736, | |
| "kl": 0.033184814453125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0192, | |
| "max_completion_length": 519.15, | |
| "max_terminated_completion_length": 519.15, | |
| "mean_completion_length": 252.3640625, | |
| "mean_terminated_completion_length": 252.3640625, | |
| "min_completion_length": 101.6, | |
| "min_terminated_completion_length": 101.6, | |
| "num_tokens": 15916088.0, | |
| "reward": 0.7386901170015335, | |
| "reward_std": 0.1651175945997238, | |
| "rewards/format_reward/mean": 0.98359375, | |
| "rewards/format_reward/std": 0.10288766324520111, | |
| "rewards/qatch_metrics/mean": 0.6949088662862778, | |
| "rewards/qatch_metrics/std": 0.36692087799310685, | |
| "rewards/tag_count_reward/mean": 0.9931640625, | |
| "rewards/tag_count_reward/std": 0.0444426404312253, | |
| "step": 335 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "clipped_completions_ratio": 0.0, | |
| "epoch": 0.5993829881004848, | |
| "grad_norm": 0.459925420016195, | |
| "kl": 0.032891845703125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0068, | |
| "max_completion_length": 555.45, | |
| "max_terminated_completion_length": 555.45, | |
| "mean_completion_length": 267.88046875, | |
| "mean_terminated_completion_length": 267.88046875, | |
| "min_completion_length": 103.95, | |
| "min_terminated_completion_length": 103.95, | |
| "num_tokens": 16722879.0, | |
| "reward": 0.7072908192873001, | |
| "reward_std": 0.15990890543907882, | |
| "rewards/format_reward/mean": 0.98984375, | |
| "rewards/format_reward/std": 0.06632362008094787, | |
| "rewards/qatch_metrics/mean": 0.657083860039711, | |
| "rewards/qatch_metrics/std": 0.3630808234214783, | |
| "rewards/tag_count_reward/mean": 0.995703125, | |
| "rewards/tag_count_reward/std": 0.02757673691958189, | |
| "step": 340 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "clipped_completions_ratio": 0.00078125, | |
| "epoch": 0.6081974438078449, | |
| "grad_norm": 0.48256766574422794, | |
| "kl": 0.034820556640625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0251, | |
| "max_completion_length": 696.05, | |
| "max_terminated_completion_length": 519.45, | |
| "mean_completion_length": 276.95625, | |
| "mean_terminated_completion_length": 273.95636291503905, | |
| "min_completion_length": 102.4, | |
| "min_terminated_completion_length": 102.4, | |
| "num_tokens": 17569799.0, | |
| "reward": 0.7302132397890091, | |
| "reward_std": 0.1766110870987177, | |
| "rewards/format_reward/mean": 0.98828125, | |
| "rewards/format_reward/std": 0.07348556146025657, | |
| "rewards/qatch_metrics/mean": 0.6843041747808456, | |
| "rewards/qatch_metrics/std": 0.36742570996284485, | |
| "rewards/tag_count_reward/mean": 0.99453125, | |
| "rewards/tag_count_reward/std": 0.032534679397940636, | |
| "step": 345 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "clipped_completions_ratio": 0.0, | |
| "epoch": 0.617011899515205, | |
| "grad_norm": 0.5527734452877487, | |
| "kl": 0.039410400390625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0106, | |
| "max_completion_length": 522.05, | |
| "max_terminated_completion_length": 522.05, | |
| "mean_completion_length": 276.26640625, | |
| "mean_terminated_completion_length": 276.26640625, | |
| "min_completion_length": 118.1, | |
| "min_terminated_completion_length": 118.1, | |
| "num_tokens": 18397500.0, | |
| "reward": 0.6712282940745353, | |
| "reward_std": 0.17349297013133763, | |
| "rewards/format_reward/mean": 0.98671875, | |
| "rewards/format_reward/std": 0.09068891182541847, | |
| "rewards/qatch_metrics/mean": 0.6150250181555748, | |
| "rewards/qatch_metrics/std": 0.3758671097457409, | |
| "rewards/tag_count_reward/mean": 0.995703125, | |
| "rewards/tag_count_reward/std": 0.028296593204140665, | |
| "step": 350 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "clipped_completions_ratio": 0.00078125, | |
| "epoch": 0.625826355222565, | |
| "grad_norm": 0.45652253231499157, | |
| "kl": 0.035089111328125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0153, | |
| "max_completion_length": 719.95, | |
| "max_terminated_completion_length": 541.15, | |
| "mean_completion_length": 278.35546875, | |
| "mean_terminated_completion_length": 275.36917266845705, | |
| "min_completion_length": 113.25, | |
| "min_terminated_completion_length": 113.25, | |
| "num_tokens": 19219091.0, | |
| "reward": 0.686581015586853, | |
| "reward_std": 0.174876070022583, | |
| "rewards/format_reward/mean": 0.9796875, | |
| "rewards/format_reward/std": 0.11921128332614898, | |
| "rewards/qatch_metrics/mean": 0.6339487046003341, | |
| "rewards/qatch_metrics/std": 0.3851124107837677, | |
| "rewards/tag_count_reward/mean": 0.9951171875, | |
| "rewards/tag_count_reward/std": 0.03250717576593161, | |
| "step": 355 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "clipped_completions_ratio": 0.0015625, | |
| "epoch": 0.6346408109299251, | |
| "grad_norm": 0.47122714595637016, | |
| "kl": 0.047320556640625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0291, | |
| "max_completion_length": 691.6, | |
| "max_terminated_completion_length": 520.35, | |
| "mean_completion_length": 281.8140625, | |
| "mean_terminated_completion_length": 275.9359375, | |
| "min_completion_length": 113.25, | |
| "min_terminated_completion_length": 113.25, | |
| "num_tokens": 20058517.0, | |
| "reward": 0.6932022422552109, | |
| "reward_std": 0.17457041498273612, | |
| "rewards/format_reward/mean": 0.96484375, | |
| "rewards/format_reward/std": 0.17012277469038964, | |
| "rewards/qatch_metrics/mean": 0.6437604293227196, | |
| "rewards/qatch_metrics/std": 0.36496525853872297, | |
| "rewards/tag_count_reward/mean": 0.9904296875, | |
| "rewards/tag_count_reward/std": 0.04925162773579359, | |
| "step": 360 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "clipped_completions_ratio": 0.0, | |
| "epoch": 0.6434552666372851, | |
| "grad_norm": 0.5557256010853785, | |
| "kl": 0.040093994140625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0053, | |
| "max_completion_length": 487.2, | |
| "max_terminated_completion_length": 487.2, | |
| "mean_completion_length": 255.43671875, | |
| "mean_terminated_completion_length": 255.43671875, | |
| "min_completion_length": 110.3, | |
| "min_terminated_completion_length": 110.3, | |
| "num_tokens": 20843476.0, | |
| "reward": 0.7185780197381973, | |
| "reward_std": 0.17451238669455052, | |
| "rewards/format_reward/mean": 0.975, | |
| "rewards/format_reward/std": 0.11770472824573516, | |
| "rewards/qatch_metrics/mean": 0.6723619893193244, | |
| "rewards/qatch_metrics/std": 0.3562425054609776, | |
| "rewards/tag_count_reward/mean": 0.99140625, | |
| "rewards/tag_count_reward/std": 0.04969721082597971, | |
| "step": 365 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "clipped_completions_ratio": 0.0, | |
| "epoch": 0.6522697223446452, | |
| "grad_norm": 0.5841098280523345, | |
| "kl": 0.042413330078125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0175, | |
| "max_completion_length": 517.1, | |
| "max_terminated_completion_length": 517.1, | |
| "mean_completion_length": 255.48515625, | |
| "mean_terminated_completion_length": 255.48515625, | |
| "min_completion_length": 106.9, | |
| "min_terminated_completion_length": 106.9, | |
| "num_tokens": 21676065.0, | |
| "reward": 0.7083508610725403, | |
| "reward_std": 0.14969376297667622, | |
| "rewards/format_reward/mean": 0.984375, | |
| "rewards/format_reward/std": 0.1019757218658924, | |
| "rewards/qatch_metrics/mean": 0.6591236971318721, | |
| "rewards/qatch_metrics/std": 0.3382732715457678, | |
| "rewards/tag_count_reward/mean": 0.9931640625, | |
| "rewards/tag_count_reward/std": 0.03620915710926056, | |
| "step": 370 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "clipped_completions_ratio": 0.0, | |
| "epoch": 0.6610841780520053, | |
| "grad_norm": 0.5154906795060099, | |
| "kl": 0.03631591796875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0101, | |
| "max_completion_length": 500.45, | |
| "max_terminated_completion_length": 500.45, | |
| "mean_completion_length": 254.51171875, | |
| "mean_terminated_completion_length": 254.51171875, | |
| "min_completion_length": 106.55, | |
| "min_terminated_completion_length": 106.55, | |
| "num_tokens": 22461200.0, | |
| "reward": 0.7427750110626221, | |
| "reward_std": 0.15802920872811227, | |
| "rewards/format_reward/mean": 0.98671875, | |
| "rewards/format_reward/std": 0.082591013610363, | |
| "rewards/qatch_metrics/mean": 0.699703136086464, | |
| "rewards/qatch_metrics/std": 0.3596019983291626, | |
| "rewards/tag_count_reward/mean": 0.987109375, | |
| "rewards/tag_count_reward/std": 0.06131851337850094, | |
| "step": 375 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 673.0, | |
| "completions/max_terminated_length": 673.0, | |
| "completions/mean_length": 257.328125, | |
| "completions/mean_terminated_length": 257.328125, | |
| "completions/min_length": 82.2, | |
| "completions/min_terminated_length": 82.2, | |
| "epoch": 0.6698986337593653, | |
| "grad_norm": 0.501113287540617, | |
| "kl": 0.050469970703125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0149, | |
| "num_tokens": 803508.0, | |
| "reward": 0.690896725654602, | |
| "reward_std": 0.16887915432453154, | |
| "rewards/format_reward/mean": 0.9828125, | |
| "rewards/format_reward/std": 0.12410171926021576, | |
| "rewards/qatch_metrics/mean": 0.6392328143119812, | |
| "rewards/qatch_metrics/std": 0.408307409286499, | |
| "rewards/tag_count_reward/mean": 0.9853515625, | |
| "rewards/tag_count_reward/std": 0.07323447465896607, | |
| "step": 380 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0015625, | |
| "completions/max_length": 1315.8, | |
| "completions/max_terminated_length": 607.0, | |
| "completions/mean_length": 253.521875, | |
| "completions/mean_terminated_length": 247.52328491210938, | |
| "completions/min_length": 80.6, | |
| "completions/min_terminated_length": 80.6, | |
| "epoch": 0.6787130894667255, | |
| "grad_norm": 0.6031075403709034, | |
| "kl": 0.0374267578125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0216, | |
| "num_tokens": 1582128.0, | |
| "reward": 0.6904418587684631, | |
| "reward_std": 0.16239723265171052, | |
| "rewards/format_reward/mean": 0.97734375, | |
| "rewards/format_reward/std": 0.14716047197580337, | |
| "rewards/qatch_metrics/mean": 0.6393755316734314, | |
| "rewards/qatch_metrics/std": 0.4064714789390564, | |
| "rewards/tag_count_reward/mean": 0.984765625, | |
| "rewards/tag_count_reward/std": 0.08733545765280723, | |
| "step": 385 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 539.0, | |
| "completions/max_terminated_length": 539.0, | |
| "completions/mean_length": 238.40859375, | |
| "completions/mean_terminated_length": 238.40859375, | |
| "completions/min_length": 75.8, | |
| "completions/min_terminated_length": 75.8, | |
| "epoch": 0.6875275451740855, | |
| "grad_norm": 0.5192326605762402, | |
| "kl": 0.052252197265625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0204, | |
| "num_tokens": 2378251.0, | |
| "reward": 0.7102061033248901, | |
| "reward_std": 0.1528529405593872, | |
| "rewards/format_reward/mean": 0.97890625, | |
| "rewards/format_reward/std": 0.14279676973819733, | |
| "rewards/qatch_metrics/mean": 0.66205313205719, | |
| "rewards/qatch_metrics/std": 0.394316303730011, | |
| "rewards/tag_count_reward/mean": 0.99140625, | |
| "rewards/tag_count_reward/std": 0.06560983434319496, | |
| "step": 390 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 652.0, | |
| "completions/max_terminated_length": 652.0, | |
| "completions/mean_length": 260.3125, | |
| "completions/mean_terminated_length": 260.3125, | |
| "completions/min_length": 75.4, | |
| "completions/min_terminated_length": 75.4, | |
| "epoch": 0.6963420008814456, | |
| "grad_norm": 0.5024199184925564, | |
| "kl": 0.035540771484375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0027, | |
| "num_tokens": 3184155.0, | |
| "reward": 0.6859581351280213, | |
| "reward_std": 0.1660704255104065, | |
| "rewards/format_reward/mean": 0.98125, | |
| "rewards/format_reward/std": 0.13385934233665467, | |
| "rewards/qatch_metrics/mean": 0.6333078145980835, | |
| "rewards/qatch_metrics/std": 0.4020949721336365, | |
| "rewards/tag_count_reward/mean": 0.9904296875, | |
| "rewards/tag_count_reward/std": 0.07544671446084976, | |
| "step": 395 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.00234375, | |
| "completions/max_length": 2044.0, | |
| "completions/max_terminated_length": 630.4, | |
| "completions/mean_length": 282.5609375, | |
| "completions/mean_terminated_length": 273.59022216796876, | |
| "completions/min_length": 90.8, | |
| "completions/min_terminated_length": 90.8, | |
| "epoch": 0.7051564565888057, | |
| "grad_norm": 0.467260325806434, | |
| "kl": 0.032830810546875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0523, | |
| "num_tokens": 4040073.0, | |
| "reward": 0.7137078642845154, | |
| "reward_std": 0.17286253571510315, | |
| "rewards/format_reward/mean": 0.9828125, | |
| "rewards/format_reward/std": 0.12251157611608506, | |
| "rewards/qatch_metrics/mean": 0.6659085869789123, | |
| "rewards/qatch_metrics/std": 0.3838121175765991, | |
| "rewards/tag_count_reward/mean": 0.9880859375, | |
| "rewards/tag_count_reward/std": 0.07518481239676475, | |
| "step": 400 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 783.4, | |
| "completions/max_terminated_length": 783.4, | |
| "completions/mean_length": 286.99765625, | |
| "completions/mean_terminated_length": 286.99765625, | |
| "completions/min_length": 102.6, | |
| "completions/min_terminated_length": 102.6, | |
| "epoch": 0.7139709122961657, | |
| "grad_norm": 0.5299690487520671, | |
| "kl": 0.0345947265625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0104, | |
| "num_tokens": 4871222.0, | |
| "reward": 0.7742552280426025, | |
| "reward_std": 0.14425914883613586, | |
| "rewards/format_reward/mean": 0.97578125, | |
| "rewards/format_reward/std": 0.15354832112789155, | |
| "rewards/qatch_metrics/mean": 0.7378646016120911, | |
| "rewards/qatch_metrics/std": 0.3577612638473511, | |
| "rewards/tag_count_reward/mean": 0.98984375, | |
| "rewards/tag_count_reward/std": 0.06915387809276581, | |
| "step": 405 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.00078125, | |
| "completions/max_length": 1449.8, | |
| "completions/max_terminated_length": 881.4, | |
| "completions/mean_length": 308.171875, | |
| "completions/mean_terminated_length": 305.2107177734375, | |
| "completions/min_length": 77.6, | |
| "completions/min_terminated_length": 77.6, | |
| "epoch": 0.7227853680035258, | |
| "grad_norm": 0.4493483899621249, | |
| "kl": 0.03697509765625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0386, | |
| "num_tokens": 5751266.0, | |
| "reward": 0.7124456286430358, | |
| "reward_std": 0.13775645643472673, | |
| "rewards/format_reward/mean": 0.96953125, | |
| "rewards/format_reward/std": 0.16755682677030564, | |
| "rewards/qatch_metrics/mean": 0.6660205960273743, | |
| "rewards/qatch_metrics/std": 0.3835669577121735, | |
| "rewards/tag_count_reward/mean": 0.9875, | |
| "rewards/tag_count_reward/std": 0.0721856091171503, | |
| "step": 410 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 628.2, | |
| "completions/max_terminated_length": 628.2, | |
| "completions/mean_length": 291.88671875, | |
| "completions/mean_terminated_length": 291.88671875, | |
| "completions/min_length": 98.8, | |
| "completions/min_terminated_length": 98.8, | |
| "epoch": 0.7315998237108858, | |
| "grad_norm": 0.5761979712500102, | |
| "kl": 0.0383544921875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0179, | |
| "num_tokens": 6591857.0, | |
| "reward": 0.7244773507118225, | |
| "reward_std": 0.13447282165288926, | |
| "rewards/format_reward/mean": 0.97578125, | |
| "rewards/format_reward/std": 0.1513870522379875, | |
| "rewards/qatch_metrics/mean": 0.6792909026145935, | |
| "rewards/qatch_metrics/std": 0.3745004594326019, | |
| "rewards/tag_count_reward/mean": 0.9900390625, | |
| "rewards/tag_count_reward/std": 0.07874297201633454, | |
| "step": 415 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 689.8, | |
| "completions/max_terminated_length": 689.8, | |
| "completions/mean_length": 282.25078125, | |
| "completions/mean_terminated_length": 282.25078125, | |
| "completions/min_length": 84.4, | |
| "completions/min_terminated_length": 84.4, | |
| "epoch": 0.7404142794182459, | |
| "grad_norm": 0.5244162668782878, | |
| "kl": 0.046649169921875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0109, | |
| "num_tokens": 7404210.0, | |
| "reward": 0.7365931272506714, | |
| "reward_std": 0.15788652896881103, | |
| "rewards/format_reward/mean": 0.9828125, | |
| "rewards/format_reward/std": 0.12439378350973129, | |
| "rewards/qatch_metrics/mean": 0.6925796866416931, | |
| "rewards/qatch_metrics/std": 0.3907664895057678, | |
| "rewards/tag_count_reward/mean": 0.9923828125, | |
| "rewards/tag_count_reward/std": 0.051664411649107934, | |
| "step": 420 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 539.6, | |
| "completions/max_terminated_length": 539.6, | |
| "completions/mean_length": 257.015625, | |
| "completions/mean_terminated_length": 257.015625, | |
| "completions/min_length": 71.2, | |
| "completions/min_terminated_length": 71.2, | |
| "epoch": 0.749228735125606, | |
| "grad_norm": 0.45715739561513447, | |
| "kl": 0.039117431640625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0164, | |
| "num_tokens": 8209798.0, | |
| "reward": 0.7224585175514221, | |
| "reward_std": 0.14680615216493606, | |
| "rewards/format_reward/mean": 0.97578125, | |
| "rewards/format_reward/std": 0.13695741891860963, | |
| "rewards/qatch_metrics/mean": 0.6767549514770508, | |
| "rewards/qatch_metrics/std": 0.3877357721328735, | |
| "rewards/tag_count_reward/mean": 0.9927734375, | |
| "rewards/tag_count_reward/std": 0.05077721327543259, | |
| "step": 425 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 665.8, | |
| "completions/max_terminated_length": 665.8, | |
| "completions/mean_length": 262.3390625, | |
| "completions/mean_terminated_length": 262.3390625, | |
| "completions/min_length": 90.6, | |
| "completions/min_terminated_length": 90.6, | |
| "epoch": 0.7580431908329661, | |
| "grad_norm": 0.4871390662888905, | |
| "kl": 0.040716552734375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.012, | |
| "num_tokens": 9005368.0, | |
| "reward": 0.7663362741470336, | |
| "reward_std": 0.16361640095710756, | |
| "rewards/format_reward/mean": 0.98125, | |
| "rewards/format_reward/std": 0.13345934748649596, | |
| "rewards/qatch_metrics/mean": 0.7278703331947327, | |
| "rewards/qatch_metrics/std": 0.35586323142051696, | |
| "rewards/tag_count_reward/mean": 0.9904296875, | |
| "rewards/tag_count_reward/std": 0.06695948392152787, | |
| "step": 430 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 643.4, | |
| "completions/max_terminated_length": 643.4, | |
| "completions/mean_length": 266.9640625, | |
| "completions/mean_terminated_length": 266.9640625, | |
| "completions/min_length": 91.0, | |
| "completions/min_terminated_length": 91.0, | |
| "epoch": 0.7668576465403262, | |
| "grad_norm": 0.5721590281422019, | |
| "kl": 0.03896484375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0368, | |
| "num_tokens": 9827754.0, | |
| "reward": 0.6843396306037903, | |
| "reward_std": 0.17963839173316956, | |
| "rewards/format_reward/mean": 0.97265625, | |
| "rewards/format_reward/std": 0.15778429061174393, | |
| "rewards/qatch_metrics/mean": 0.6327593684196472, | |
| "rewards/qatch_metrics/std": 0.4007817268371582, | |
| "rewards/tag_count_reward/mean": 0.9845703125, | |
| "rewards/tag_count_reward/std": 0.08982480615377426, | |
| "step": 435 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.00234375, | |
| "completions/max_length": 1994.0, | |
| "completions/max_terminated_length": 609.6, | |
| "completions/mean_length": 255.921875, | |
| "completions/mean_terminated_length": 246.93662109375, | |
| "completions/min_length": 86.0, | |
| "completions/min_terminated_length": 86.0, | |
| "epoch": 0.7756721022476862, | |
| "grad_norm": 0.4890398371699509, | |
| "kl": 0.036968994140625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0486, | |
| "num_tokens": 10625846.0, | |
| "reward": 0.6650787591934204, | |
| "reward_std": 0.15730705261230468, | |
| "rewards/format_reward/mean": 0.971875, | |
| "rewards/format_reward/std": 0.160001802444458, | |
| "rewards/qatch_metrics/mean": 0.6100765824317932, | |
| "rewards/qatch_metrics/std": 0.4057386636734009, | |
| "rewards/tag_count_reward/mean": 0.9865234375, | |
| "rewards/tag_count_reward/std": 0.07955404669046402, | |
| "step": 440 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.00078125, | |
| "completions/max_length": 1277.8, | |
| "completions/max_terminated_length": 572.0, | |
| "completions/mean_length": 232.64375, | |
| "completions/mean_terminated_length": 229.62958374023438, | |
| "completions/min_length": 79.2, | |
| "completions/min_terminated_length": 79.2, | |
| "epoch": 0.7844865579550463, | |
| "grad_norm": 0.4569982422088128, | |
| "kl": 0.041680908203125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0222, | |
| "num_tokens": 11412830.0, | |
| "reward": 0.7003113865852356, | |
| "reward_std": 0.17285217940807343, | |
| "rewards/format_reward/mean": 0.9859375, | |
| "rewards/format_reward/std": 0.10446578860282899, | |
| "rewards/qatch_metrics/mean": 0.649527621269226, | |
| "rewards/qatch_metrics/std": 0.40458944439888, | |
| "rewards/tag_count_reward/mean": 0.9923828125, | |
| "rewards/tag_count_reward/std": 0.05678167305886746, | |
| "step": 445 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 600.6, | |
| "completions/max_terminated_length": 600.6, | |
| "completions/mean_length": 217.125, | |
| "completions/mean_terminated_length": 217.125, | |
| "completions/min_length": 77.8, | |
| "completions/min_terminated_length": 77.8, | |
| "epoch": 0.7933010136624064, | |
| "grad_norm": 0.4388317764117489, | |
| "kl": 0.04573974609375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0071, | |
| "num_tokens": 12168542.0, | |
| "reward": 0.7169292807579041, | |
| "reward_std": 0.14911916553974153, | |
| "rewards/format_reward/mean": 0.98203125, | |
| "rewards/format_reward/std": 0.1286988839507103, | |
| "rewards/qatch_metrics/mean": 0.6693882942199707, | |
| "rewards/qatch_metrics/std": 0.38276457190513613, | |
| "rewards/tag_count_reward/mean": 0.994921875, | |
| "rewards/tag_count_reward/std": 0.04127059616148472, | |
| "step": 450 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 611.2, | |
| "completions/max_terminated_length": 611.2, | |
| "completions/mean_length": 230.1046875, | |
| "completions/mean_terminated_length": 230.1046875, | |
| "completions/min_length": 79.4, | |
| "completions/min_terminated_length": 79.4, | |
| "epoch": 0.8021154693697664, | |
| "grad_norm": 0.5673920118721377, | |
| "kl": 0.04423828125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0256, | |
| "num_tokens": 12950596.0, | |
| "reward": 0.7063471436500549, | |
| "reward_std": 0.14635758399963378, | |
| "rewards/format_reward/mean": 0.9796875, | |
| "rewards/format_reward/std": 0.13933248221874237, | |
| "rewards/qatch_metrics/mean": 0.6573523283004761, | |
| "rewards/qatch_metrics/std": 0.39122379422187803, | |
| "rewards/tag_count_reward/mean": 0.992578125, | |
| "rewards/tag_count_reward/std": 0.06119627803564072, | |
| "step": 455 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 632.8, | |
| "completions/max_terminated_length": 632.8, | |
| "completions/mean_length": 246.2421875, | |
| "completions/mean_terminated_length": 246.2421875, | |
| "completions/min_length": 74.0, | |
| "completions/min_terminated_length": 74.0, | |
| "epoch": 0.8109299250771265, | |
| "grad_norm": 0.4954363135168558, | |
| "kl": 0.042694091796875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0033, | |
| "num_tokens": 13739658.0, | |
| "reward": 0.6477766692638397, | |
| "reward_std": 0.18786489367485046, | |
| "rewards/format_reward/mean": 0.9875, | |
| "rewards/format_reward/std": 0.09900134056806564, | |
| "rewards/qatch_metrics/mean": 0.5874463558197022, | |
| "rewards/qatch_metrics/std": 0.40233501195907595, | |
| "rewards/tag_count_reward/mean": 0.9939453125, | |
| "rewards/tag_count_reward/std": 0.045028040558099745, | |
| "step": 460 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 572.2, | |
| "completions/max_terminated_length": 572.2, | |
| "completions/mean_length": 248.615625, | |
| "completions/mean_terminated_length": 248.615625, | |
| "completions/min_length": 80.8, | |
| "completions/min_terminated_length": 80.8, | |
| "epoch": 0.8197443807844865, | |
| "grad_norm": 0.5126492434340917, | |
| "kl": 0.03770751953125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0263, | |
| "num_tokens": 14504254.0, | |
| "reward": 0.7320139050483704, | |
| "reward_std": 0.16122200787067414, | |
| "rewards/format_reward/mean": 0.975, | |
| "rewards/format_reward/std": 0.15241584777832032, | |
| "rewards/qatch_metrics/mean": 0.6882033824920655, | |
| "rewards/qatch_metrics/std": 0.38843331933021547, | |
| "rewards/tag_count_reward/mean": 0.9908203125, | |
| "rewards/tag_count_reward/std": 0.06024260520935058, | |
| "step": 465 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0015625, | |
| "completions/max_length": 1370.4, | |
| "completions/max_terminated_length": 681.4, | |
| "completions/mean_length": 258.55390625, | |
| "completions/mean_terminated_length": 252.59222412109375, | |
| "completions/min_length": 77.6, | |
| "completions/min_terminated_length": 77.6, | |
| "epoch": 0.8285588364918466, | |
| "grad_norm": 0.5253311047415303, | |
| "kl": 0.037872314453125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0406, | |
| "num_tokens": 15275139.0, | |
| "reward": 0.7564595103263855, | |
| "reward_std": 0.15985482782125474, | |
| "rewards/format_reward/mean": 0.98203125, | |
| "rewards/format_reward/std": 0.1299367517232895, | |
| "rewards/qatch_metrics/mean": 0.7159289240837097, | |
| "rewards/qatch_metrics/std": 0.3664227664470673, | |
| "rewards/tag_count_reward/mean": 0.9943359375, | |
| "rewards/tag_count_reward/std": 0.0471202090382576, | |
| "step": 470 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 635.8, | |
| "completions/max_terminated_length": 635.8, | |
| "completions/mean_length": 251.44296875, | |
| "completions/mean_terminated_length": 251.44296875, | |
| "completions/min_length": 69.4, | |
| "completions/min_terminated_length": 69.4, | |
| "epoch": 0.8373732921992068, | |
| "grad_norm": 0.48947707805356844, | |
| "kl": 0.042919921875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0085, | |
| "num_tokens": 16030218.0, | |
| "reward": 0.7450597763061524, | |
| "reward_std": 0.16296629011631011, | |
| "rewards/format_reward/mean": 0.975, | |
| "rewards/format_reward/std": 0.15551186800003053, | |
| "rewards/qatch_metrics/mean": 0.7034825563430787, | |
| "rewards/qatch_metrics/std": 0.37498498558998106, | |
| "rewards/tag_count_reward/mean": 0.9919921875, | |
| "rewards/tag_count_reward/std": 0.058643939718604085, | |
| "step": 475 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 646.2, | |
| "completions/max_terminated_length": 646.2, | |
| "completions/mean_length": 259.84765625, | |
| "completions/mean_terminated_length": 259.84765625, | |
| "completions/min_length": 78.2, | |
| "completions/min_terminated_length": 78.2, | |
| "epoch": 0.8461877479065668, | |
| "grad_norm": 0.3729965015800276, | |
| "kl": 0.043310546875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0058, | |
| "num_tokens": 16845463.0, | |
| "reward": 0.727292287349701, | |
| "reward_std": 0.14453701674938202, | |
| "rewards/format_reward/mean": 0.9828125, | |
| "rewards/format_reward/std": 0.12775924652814866, | |
| "rewards/qatch_metrics/mean": 0.6817294478416442, | |
| "rewards/qatch_metrics/std": 0.38109866976737977, | |
| "rewards/tag_count_reward/mean": 0.9908203125, | |
| "rewards/tag_count_reward/std": 0.054720209538936616, | |
| "step": 480 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 589.8, | |
| "completions/max_terminated_length": 589.8, | |
| "completions/mean_length": 274.1375, | |
| "completions/mean_terminated_length": 274.1375, | |
| "completions/min_length": 98.0, | |
| "completions/min_terminated_length": 98.0, | |
| "epoch": 0.8550022036139269, | |
| "grad_norm": 0.5512010992625618, | |
| "kl": 0.03946533203125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0001, | |
| "num_tokens": 17711207.0, | |
| "reward": 0.7116207957267762, | |
| "reward_std": 0.17875194251537324, | |
| "rewards/format_reward/mean": 0.96875, | |
| "rewards/format_reward/std": 0.1718300312757492, | |
| "rewards/qatch_metrics/mean": 0.6652799725532532, | |
| "rewards/qatch_metrics/std": 0.40575913786888124, | |
| "rewards/tag_count_reward/mean": 0.98515625, | |
| "rewards/tag_count_reward/std": 0.08101150617003441, | |
| "step": 485 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 654.0, | |
| "completions/max_terminated_length": 654.0, | |
| "completions/mean_length": 299.58359375, | |
| "completions/mean_terminated_length": 299.58359375, | |
| "completions/min_length": 88.6, | |
| "completions/min_terminated_length": 88.6, | |
| "epoch": 0.8638166593212869, | |
| "grad_norm": 0.43603860960108626, | |
| "kl": 0.035888671875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0129, | |
| "num_tokens": 18561666.0, | |
| "reward": 0.7371966004371643, | |
| "reward_std": 0.15995949804782866, | |
| "rewards/format_reward/mean": 0.9703125, | |
| "rewards/format_reward/std": 0.16827207505702974, | |
| "rewards/qatch_metrics/mean": 0.6952198028564454, | |
| "rewards/qatch_metrics/std": 0.37691527009010317, | |
| "rewards/tag_count_reward/mean": 0.9845703125, | |
| "rewards/tag_count_reward/std": 0.08707116395235062, | |
| "step": 490 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.00078125, | |
| "completions/max_length": 1342.4, | |
| "completions/max_terminated_length": 651.4, | |
| "completions/mean_length": 314.77578125, | |
| "completions/mean_terminated_length": 311.8182067871094, | |
| "completions/min_length": 90.8, | |
| "completions/min_terminated_length": 90.8, | |
| "epoch": 0.872631115028647, | |
| "grad_norm": 0.45452848376355987, | |
| "kl": 0.03583984375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0219, | |
| "num_tokens": 19430867.0, | |
| "reward": 0.7528019547462463, | |
| "reward_std": 0.16146388351917268, | |
| "rewards/format_reward/mean": 0.96015625, | |
| "rewards/format_reward/std": 0.19488056004047394, | |
| "rewards/qatch_metrics/mean": 0.7149921894073487, | |
| "rewards/qatch_metrics/std": 0.37755597829818727, | |
| "rewards/tag_count_reward/mean": 0.980859375, | |
| "rewards/tag_count_reward/std": 0.10050071328878403, | |
| "step": 495 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 713.8, | |
| "completions/max_terminated_length": 713.8, | |
| "completions/mean_length": 331.384375, | |
| "completions/mean_terminated_length": 331.384375, | |
| "completions/min_length": 119.0, | |
| "completions/min_terminated_length": 119.0, | |
| "epoch": 0.881445570736007, | |
| "grad_norm": 0.4108711355809565, | |
| "kl": 0.033282470703125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0067, | |
| "num_tokens": 20330031.0, | |
| "reward": 0.7109659552574158, | |
| "reward_std": 0.17206443548202516, | |
| "rewards/format_reward/mean": 0.97421875, | |
| "rewards/format_reward/std": 0.1577295958995819, | |
| "rewards/qatch_metrics/mean": 0.663659393787384, | |
| "rewards/qatch_metrics/std": 0.3938014984130859, | |
| "rewards/tag_count_reward/mean": 0.988671875, | |
| "rewards/tag_count_reward/std": 0.07903932370245456, | |
| "step": 500 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1202.0, | |
| "completions/max_terminated_length": 1202.0, | |
| "completions/mean_length": 308.16484375, | |
| "completions/mean_terminated_length": 308.16484375, | |
| "completions/min_length": 83.0, | |
| "completions/min_terminated_length": 83.0, | |
| "epoch": 0.8902600264433671, | |
| "grad_norm": 0.4164787884296458, | |
| "kl": 0.033551025390625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0022, | |
| "num_tokens": 21204274.0, | |
| "reward": 0.7315962195396424, | |
| "reward_std": 0.16765011548995973, | |
| "rewards/format_reward/mean": 0.98125, | |
| "rewards/format_reward/std": 0.13386803418397902, | |
| "rewards/qatch_metrics/mean": 0.6868273377418518, | |
| "rewards/qatch_metrics/std": 0.38292229175567627, | |
| "rewards/tag_count_reward/mean": 0.993359375, | |
| "rewards/tag_count_reward/std": 0.0591853179037571, | |
| "step": 505 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 970.8, | |
| "completions/max_terminated_length": 970.8, | |
| "completions/mean_length": 317.1265625, | |
| "completions/mean_terminated_length": 317.1265625, | |
| "completions/min_length": 104.2, | |
| "completions/min_terminated_length": 104.2, | |
| "epoch": 0.8990744821507272, | |
| "grad_norm": 0.5587175243501159, | |
| "kl": 0.034619140625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0174, | |
| "num_tokens": 22100356.0, | |
| "reward": 0.6716719269752502, | |
| "reward_std": 0.18899759352207185, | |
| "rewards/format_reward/mean": 0.98359375, | |
| "rewards/format_reward/std": 0.12365061938762664, | |
| "rewards/qatch_metrics/mean": 0.6159260630607605, | |
| "rewards/qatch_metrics/std": 0.39540442228317263, | |
| "rewards/tag_count_reward/mean": 0.9955078125, | |
| "rewards/tag_count_reward/std": 0.04045262522995472, | |
| "step": 510 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 772.6, | |
| "completions/max_terminated_length": 772.6, | |
| "completions/mean_length": 295.43671875, | |
| "completions/mean_terminated_length": 295.43671875, | |
| "completions/min_length": 93.4, | |
| "completions/min_terminated_length": 93.4, | |
| "epoch": 0.9078889378580872, | |
| "grad_norm": 0.4722324099985001, | |
| "kl": 0.03868408203125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0174, | |
| "num_tokens": 22932707.0, | |
| "reward": 0.756611955165863, | |
| "reward_std": 0.1691014885902405, | |
| "rewards/format_reward/mean": 0.978125, | |
| "rewards/format_reward/std": 0.14329043328762053, | |
| "rewards/qatch_metrics/mean": 0.7167171835899353, | |
| "rewards/qatch_metrics/std": 0.3885104775428772, | |
| "rewards/tag_count_reward/mean": 0.991796875, | |
| "rewards/tag_count_reward/std": 0.06754893809556961, | |
| "step": 515 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 604.0, | |
| "completions/max_terminated_length": 604.0, | |
| "completions/mean_length": 283.27421875, | |
| "completions/mean_terminated_length": 283.27421875, | |
| "completions/min_length": 85.6, | |
| "completions/min_terminated_length": 85.6, | |
| "epoch": 0.9167033935654474, | |
| "grad_norm": 0.43733718031543917, | |
| "kl": 0.03892822265625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0147, | |
| "num_tokens": 23750770.0, | |
| "reward": 0.724392831325531, | |
| "reward_std": 0.15009717047214508, | |
| "rewards/format_reward/mean": 0.98984375, | |
| "rewards/format_reward/std": 0.09812660813331604, | |
| "rewards/qatch_metrics/mean": 0.6772153854370118, | |
| "rewards/qatch_metrics/std": 0.3725505113601685, | |
| "rewards/tag_count_reward/mean": 0.9955078125, | |
| "rewards/tag_count_reward/std": 0.04656890295445919, | |
| "step": 520 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 676.2, | |
| "completions/max_terminated_length": 676.2, | |
| "completions/mean_length": 274.715625, | |
| "completions/mean_terminated_length": 274.715625, | |
| "completions/min_length": 84.8, | |
| "completions/min_terminated_length": 84.8, | |
| "epoch": 0.9255178492728074, | |
| "grad_norm": 0.5164725190824285, | |
| "kl": 0.03759765625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0223, | |
| "num_tokens": 24580774.0, | |
| "reward": 0.7318256020545959, | |
| "reward_std": 0.14180095940828324, | |
| "rewards/format_reward/mean": 0.98984375, | |
| "rewards/format_reward/std": 0.08850486427545548, | |
| "rewards/qatch_metrics/mean": 0.685994279384613, | |
| "rewards/qatch_metrics/std": 0.388202303647995, | |
| "rewards/tag_count_reward/mean": 0.994921875, | |
| "rewards/tag_count_reward/std": 0.04503728672862053, | |
| "step": 525 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 641.2, | |
| "completions/max_terminated_length": 641.2, | |
| "completions/mean_length": 251.61015625, | |
| "completions/mean_terminated_length": 251.61015625, | |
| "completions/min_length": 78.2, | |
| "completions/min_terminated_length": 78.2, | |
| "epoch": 0.9343323049801675, | |
| "grad_norm": 0.569614505039085, | |
| "kl": 0.040081787109375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0277, | |
| "num_tokens": 25388227.0, | |
| "reward": 0.7570900201797486, | |
| "reward_std": 0.14922449886798858, | |
| "rewards/format_reward/mean": 0.98828125, | |
| "rewards/format_reward/std": 0.10359105616807937, | |
| "rewards/qatch_metrics/mean": 0.715946900844574, | |
| "rewards/qatch_metrics/std": 0.36956331729888914, | |
| "rewards/tag_count_reward/mean": 0.994140625, | |
| "rewards/tag_count_reward/std": 0.054824869334697726, | |
| "step": 530 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 723.2, | |
| "completions/max_terminated_length": 723.2, | |
| "completions/mean_length": 240.55078125, | |
| "completions/mean_terminated_length": 240.55078125, | |
| "completions/min_length": 78.0, | |
| "completions/min_terminated_length": 78.0, | |
| "epoch": 0.9431467606875276, | |
| "grad_norm": 0.49100420519700644, | |
| "kl": 0.04449462890625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0268, | |
| "num_tokens": 26175972.0, | |
| "reward": 0.7097227334976196, | |
| "reward_std": 0.14579529464244842, | |
| "rewards/format_reward/mean": 0.98359375, | |
| "rewards/format_reward/std": 0.1263234168291092, | |
| "rewards/qatch_metrics/mean": 0.6607836008071899, | |
| "rewards/qatch_metrics/std": 0.3822557330131531, | |
| "rewards/tag_count_reward/mean": 0.9939453125, | |
| "rewards/tag_count_reward/std": 0.045474790036678314, | |
| "step": 535 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 684.2, | |
| "completions/max_terminated_length": 684.2, | |
| "completions/mean_length": 248.60703125, | |
| "completions/mean_terminated_length": 248.60703125, | |
| "completions/min_length": 85.0, | |
| "completions/min_terminated_length": 85.0, | |
| "epoch": 0.9519612163948876, | |
| "grad_norm": 0.47774970067090894, | |
| "kl": 0.041180419921875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0049, | |
| "num_tokens": 26968429.0, | |
| "reward": 0.730446743965149, | |
| "reward_std": 0.15047829747200012, | |
| "rewards/format_reward/mean": 0.98203125, | |
| "rewards/format_reward/std": 0.1279981330037117, | |
| "rewards/qatch_metrics/mean": 0.6854750037193298, | |
| "rewards/qatch_metrics/std": 0.37677569389343263, | |
| "rewards/tag_count_reward/mean": 0.991796875, | |
| "rewards/tag_count_reward/std": 0.057506294548511507, | |
| "step": 540 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 777.8, | |
| "completions/max_terminated_length": 777.8, | |
| "completions/mean_length": 238.09453125, | |
| "completions/mean_terminated_length": 238.09453125, | |
| "completions/min_length": 79.2, | |
| "completions/min_terminated_length": 79.2, | |
| "epoch": 0.9607756721022477, | |
| "grad_norm": 0.524470238389979, | |
| "kl": 0.03973388671875, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0326, | |
| "num_tokens": 27752838.0, | |
| "reward": 0.7608234286308289, | |
| "reward_std": 0.1745920956134796, | |
| "rewards/format_reward/mean": 0.97890625, | |
| "rewards/format_reward/std": 0.13925887942314147, | |
| "rewards/qatch_metrics/mean": 0.7215799450874328, | |
| "rewards/qatch_metrics/std": 0.37039090394973756, | |
| "rewards/tag_count_reward/mean": 0.991796875, | |
| "rewards/tag_count_reward/std": 0.05863870121538639, | |
| "step": 545 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 642.0, | |
| "completions/max_terminated_length": 642.0, | |
| "completions/mean_length": 237.07265625, | |
| "completions/mean_terminated_length": 237.07265625, | |
| "completions/min_length": 77.0, | |
| "completions/min_terminated_length": 77.0, | |
| "epoch": 0.9695901278096077, | |
| "grad_norm": 0.46750955147071793, | |
| "kl": 0.0383056640625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0212, | |
| "num_tokens": 28539107.0, | |
| "reward": 0.7267768025398255, | |
| "reward_std": 0.12322149947285652, | |
| "rewards/format_reward/mean": 0.9828125, | |
| "rewards/format_reward/std": 0.11163707971572875, | |
| "rewards/qatch_metrics/mean": 0.6809966087341308, | |
| "rewards/qatch_metrics/std": 0.3821363866329193, | |
| "rewards/tag_count_reward/mean": 0.99296875, | |
| "rewards/tag_count_reward/std": 0.0495118897408247, | |
| "step": 550 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 617.6, | |
| "completions/max_terminated_length": 617.6, | |
| "completions/mean_length": 235.8828125, | |
| "completions/mean_terminated_length": 235.8828125, | |
| "completions/min_length": 72.0, | |
| "completions/min_terminated_length": 72.0, | |
| "epoch": 0.9784045835169678, | |
| "grad_norm": 0.4942028139716262, | |
| "kl": 0.039678955078125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0177, | |
| "num_tokens": 29326877.0, | |
| "reward": 0.727705979347229, | |
| "reward_std": 0.14997260570526122, | |
| "rewards/format_reward/mean": 0.98125, | |
| "rewards/format_reward/std": 0.1321229487657547, | |
| "rewards/qatch_metrics/mean": 0.682227611541748, | |
| "rewards/qatch_metrics/std": 0.37953501343727114, | |
| "rewards/tag_count_reward/mean": 0.99375, | |
| "rewards/tag_count_reward/std": 0.05912150144577026, | |
| "step": 555 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 594.6, | |
| "completions/max_terminated_length": 594.6, | |
| "completions/mean_length": 244.68984375, | |
| "completions/mean_terminated_length": 244.68984375, | |
| "completions/min_length": 83.4, | |
| "completions/min_terminated_length": 83.4, | |
| "epoch": 0.9872190392243279, | |
| "grad_norm": 0.5441854565755777, | |
| "kl": 0.039990234375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0097, | |
| "num_tokens": 30108320.0, | |
| "reward": 0.7335842967033386, | |
| "reward_std": 0.16434457302093505, | |
| "rewards/format_reward/mean": 0.9875, | |
| "rewards/format_reward/std": 0.11007042825222016, | |
| "rewards/qatch_metrics/mean": 0.6883505344390869, | |
| "rewards/qatch_metrics/std": 0.3857073485851288, | |
| "rewards/tag_count_reward/mean": 0.9947265625, | |
| "rewards/tag_count_reward/std": 0.04389902278780937, | |
| "step": 560 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.00078125, | |
| "completions/max_length": 1302.2, | |
| "completions/max_terminated_length": 623.4, | |
| "completions/mean_length": 263.50625, | |
| "completions/mean_terminated_length": 260.5096740722656, | |
| "completions/min_length": 74.0, | |
| "completions/min_terminated_length": 74.0, | |
| "epoch": 0.996033494931688, | |
| "grad_norm": 0.5329735167865763, | |
| "kl": 0.038818359375, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0176, | |
| "num_tokens": 30905272.0, | |
| "reward": 0.6684425473213196, | |
| "reward_std": 0.15172433108091354, | |
| "rewards/format_reward/mean": 0.9921875, | |
| "rewards/format_reward/std": 0.07656104415655136, | |
| "rewards/qatch_metrics/mean": 0.6110697865486145, | |
| "rewards/qatch_metrics/std": 0.40551244616508486, | |
| "rewards/tag_count_reward/mean": 0.9962890625, | |
| "rewards/tag_count_reward/std": 0.0363810945302248, | |
| "step": 565 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.001953125, | |
| "completions/max_length": 2299.5, | |
| "completions/max_terminated_length": 521.5, | |
| "completions/mean_length": 244.14453125, | |
| "completions/mean_terminated_length": 236.6223373413086, | |
| "completions/min_length": 69.5, | |
| "completions/min_terminated_length": 69.5, | |
| "epoch": 0.999559277214632, | |
| "kl": 0.03900146484375, | |
| "num_tokens": 31206578.0, | |
| "reward": 0.7890622317790985, | |
| "reward_std": 0.14110208302736282, | |
| "rewards/format_reward/mean": 0.982421875, | |
| "rewards/format_reward/std": 0.12580867484211922, | |
| "rewards/qatch_metrics/mean": 0.7542793154716492, | |
| "rewards/qatch_metrics/std": 0.3720303773880005, | |
| "rewards/tag_count_reward/mean": 0.99365234375, | |
| "rewards/tag_count_reward/std": 0.0528964027762413, | |
| "step": 567, | |
| "total_flos": 0.0, | |
| "train_loss": 0.0021981382561844284, | |
| "train_runtime": 12830.8133, | |
| "train_samples_per_second": 0.707, | |
| "train_steps_per_second": 0.044 | |
| } | |
| ], | |
| "logging_steps": 5, | |
| "max_steps": 567, | |
| "num_input_tokens_seen": 31206578, | |
| "num_train_epochs": 1, | |
| "save_steps": 5, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 0.0, | |
| "train_batch_size": 16, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |