diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,3371 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.999559277214632, + "eval_steps": 500, + "global_step": 567, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 477.0, + "completions/max_terminated_length": 477.0, + "completions/mean_length": 175.50390625, + "completions/mean_terminated_length": 175.50390625, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 0.0017628911414720142, + "grad_norm": 1.0880173896572545, + "kl": 0.0, + "learning_rate": 0.0, + "loss": -0.327, + "num_tokens": 129409.0, + "reward": 0.814777672290802, + "reward_std": 0.14736539125442505, + "rewards/format_reward/mean": 0.68359375, + "rewards/format_reward/std": 0.4659844934940338, + "rewards/qatch_metrics/mean": 0.8332747220993042, + "rewards/qatch_metrics/std": 0.3284282088279724, + "rewards/tag_count_reward/mean": 0.7626953125, + "rewards/tag_count_reward/std": 0.34948837757110596, + "step": 1 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 421.0, + "completions/max_terminated_length": 421.0, + "completions/mean_length": 177.318359375, + "completions/mean_terminated_length": 177.318359375, + "completions/min_length": 21.5, + "completions/min_terminated_length": 21.5, + "epoch": 0.00881445570736007, + "grad_norm": 0.9499188530188546, + "kl": 0.00019824504852294922, + "learning_rate": 7.017543859649122e-08, + "loss": -0.2902, + "num_tokens": 685703.0, + "reward": 0.762174516916275, + "reward_std": 0.15002675727009773, + "rewards/format_reward/mean": 0.7265625, + "rewards/format_reward/std": 0.4450720399618149, + "rewards/qatch_metrics/mean": 0.7644235193729401, + "rewards/qatch_metrics/std": 0.3610532283782959, + "rewards/tag_count_reward/mean": 0.795166015625, + "rewards/tag_count_reward/std": 0.33385463058948517, + "step": 5 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 438.8, + "completions/max_terminated_length": 438.8, + "completions/mean_length": 173.41171875, + "completions/mean_terminated_length": 173.41171875, + "completions/min_length": 21.8, + "completions/min_terminated_length": 21.8, + "epoch": 0.01762891141472014, + "grad_norm": 0.9346895582900878, + "kl": 0.00028295516967773436, + "learning_rate": 1.5789473684210525e-07, + "loss": -0.2591, + "num_tokens": 1398566.0, + "reward": 0.7710299372673035, + "reward_std": 0.1539353460073471, + "rewards/format_reward/mean": 0.71796875, + "rewards/format_reward/std": 0.4487275779247284, + "rewards/qatch_metrics/mean": 0.7762346506118775, + "rewards/qatch_metrics/std": 0.3281721532344818, + "rewards/tag_count_reward/mean": 0.788671875, + "rewards/tag_count_reward/std": 0.33627479076385497, + "step": 10 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 438.2, + "completions/max_terminated_length": 438.2, + "completions/mean_length": 183.1796875, + "completions/mean_terminated_length": 183.1796875, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 0.026443367122080213, + "grad_norm": 0.7943318239924386, + "kl": 0.00037631988525390627, + "learning_rate": 2.456140350877193e-07, + "loss": -0.2603, + "num_tokens": 2071996.0, + "reward": 0.7256837129592896, + "reward_std": 0.12991088777780532, + "rewards/format_reward/mean": 0.765625, + "rewards/format_reward/std": 0.4240167737007141, + "rewards/qatch_metrics/mean": 0.7151770830154419, + "rewards/qatch_metrics/std": 0.37596395611763, + "rewards/tag_count_reward/mean": 0.8244140625, + "rewards/tag_count_reward/std": 0.31790287494659425, + "step": 15 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 479.6, + "completions/max_terminated_length": 479.6, + "completions/mean_length": 201.30234375, + "completions/mean_terminated_length": 201.30234375, + "completions/min_length": 21.2, + "completions/min_terminated_length": 21.2, + "epoch": 0.03525782282944028, + "grad_norm": 0.4721344642723057, + "kl": 0.00091400146484375, + "learning_rate": 3.333333333333333e-07, + "loss": -0.1315, + "num_tokens": 2791247.0, + "reward": 0.8173989057540894, + "reward_std": 0.12794919013977052, + "rewards/format_reward/mean": 0.89765625, + "rewards/format_reward/std": 0.29814977645874025, + "rewards/qatch_metrics/mean": 0.8017192721366883, + "rewards/qatch_metrics/std": 0.331482595205307, + "rewards/tag_count_reward/mean": 0.9234375, + "rewards/tag_count_reward/std": 0.22307254374027252, + "step": 20 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 432.2, + "completions/max_terminated_length": 432.2, + "completions/mean_length": 221.4625, + "completions/mean_terminated_length": 221.4625, + "completions/min_length": 51.4, + "completions/min_terminated_length": 51.4, + "epoch": 0.044072278536800354, + "grad_norm": 0.29592079815815686, + "kl": 0.0016038894653320312, + "learning_rate": 4.2105263157894733e-07, + "loss": -0.0424, + "num_tokens": 3536975.0, + "reward": 0.7564297676086426, + "reward_std": 0.08200130835175515, + "rewards/format_reward/mean": 0.96953125, + "rewards/format_reward/std": 0.13422587364912034, + "rewards/qatch_metrics/mean": 0.7183640837669373, + "rewards/qatch_metrics/std": 0.3674669623374939, + "rewards/tag_count_reward/mean": 0.97734375, + "rewards/tag_count_reward/std": 0.09909781143069267, + "step": 25 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 445.6, + "completions/max_terminated_length": 445.6, + "completions/mean_length": 216.53984375, + "completions/mean_terminated_length": 216.53984375, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "epoch": 0.052886734244160426, + "grad_norm": 0.275794455786416, + "kl": 0.0034694671630859375, + "learning_rate": 5.087719298245614e-07, + "loss": 0.002, + "num_tokens": 4281330.0, + "reward": 0.7764788866043091, + "reward_std": 0.09769791960716248, + "rewards/format_reward/mean": 0.9953125, + "rewards/format_reward/std": 0.06028594672679901, + "rewards/qatch_metrics/mean": 0.7377692699432373, + "rewards/qatch_metrics/std": 0.3548368811607361, + "rewards/tag_count_reward/mean": 0.996875, + "rewards/tag_count_reward/std": 0.04124387204647064, + "step": 30 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 445.8, + "completions/max_terminated_length": 445.8, + "completions/mean_length": 220.11796875, + "completions/mean_terminated_length": 220.11796875, + "completions/min_length": 59.8, + "completions/min_terminated_length": 59.8, + "epoch": 0.06170118995152049, + "grad_norm": 0.2691159080285212, + "kl": 0.005501174926757812, + "learning_rate": 5.964912280701754e-07, + "loss": -0.0083, + "num_tokens": 5008025.0, + "reward": 0.8268720507621765, + "reward_std": 0.08243840038776398, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.0625, + "rewards/qatch_metrics/mean": 0.7969059944152832, + "rewards/qatch_metrics/std": 0.30500164330005647, + "rewards/tag_count_reward/mean": 0.9978515625, + "rewards/tag_count_reward/std": 0.03437500074505806, + "step": 35 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 487.2, + "completions/max_terminated_length": 487.2, + "completions/mean_length": 227.76015625, + "completions/mean_terminated_length": 227.76015625, + "completions/min_length": 83.4, + "completions/min_terminated_length": 83.4, + "epoch": 0.07051564565888056, + "grad_norm": 0.33908836616855625, + "kl": 0.002800750732421875, + "learning_rate": 6.842105263157895e-07, + "loss": 0.0002, + "num_tokens": 5774806.0, + "reward": 0.7647829532623291, + "reward_std": 0.09533883556723595, + "rewards/format_reward/mean": 0.9984375, + "rewards/format_reward/std": 0.025, + "rewards/qatch_metrics/mean": 0.7235268354415894, + "rewards/qatch_metrics/std": 0.35323665738105775, + "rewards/tag_count_reward/mean": 0.998828125, + "rewards/tag_count_reward/std": 0.01875, + "step": 40 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 476.2, + "completions/max_terminated_length": 476.2, + "completions/mean_length": 221.7984375, + "completions/mean_terminated_length": 221.7984375, + "completions/min_length": 83.4, + "completions/min_terminated_length": 83.4, + "epoch": 0.07933010136624064, + "grad_norm": 0.3262303740341099, + "kl": 0.00310516357421875, + "learning_rate": 7.719298245614034e-07, + "loss": 0.0104, + "num_tokens": 6557268.0, + "reward": 0.7565465092658996, + "reward_std": 0.09911727011203766, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/qatch_metrics/mean": 0.7135841250419617, + "rewards/qatch_metrics/std": 0.37862626910209657, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 45 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 512.8, + "completions/max_terminated_length": 512.8, + "completions/mean_length": 228.45546875, + "completions/mean_terminated_length": 228.45546875, + "completions/min_length": 76.0, + "completions/min_terminated_length": 76.0, + "epoch": 0.08814455707360071, + "grad_norm": 0.23276410584015308, + "kl": 0.00273895263671875, + "learning_rate": 8.596491228070175e-07, + "loss": -0.0018, + "num_tokens": 7327499.0, + "reward": 0.7988326072692871, + "reward_std": 0.06667622029781342, + "rewards/format_reward/mean": 0.9984375, + "rewards/format_reward/std": 0.025, + "rewards/qatch_metrics/mean": 0.7635622501373291, + "rewards/qatch_metrics/std": 0.369570130109787, + "rewards/tag_count_reward/mean": 0.99921875, + "rewards/tag_count_reward/std": 0.0125, + "step": 50 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 483.8, + "completions/max_terminated_length": 483.8, + "completions/mean_length": 220.52734375, + "completions/mean_terminated_length": 220.52734375, + "completions/min_length": 81.2, + "completions/min_terminated_length": 81.2, + "epoch": 0.09695901278096078, + "grad_norm": 0.28218074028465906, + "kl": 0.00196533203125, + "learning_rate": 9.473684210526315e-07, + "loss": -0.0021, + "num_tokens": 8077390.0, + "reward": 0.8159880757331848, + "reward_std": 0.10231453701853752, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/qatch_metrics/mean": 0.7835153818130494, + "rewards/qatch_metrics/std": 0.33782891631126405, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 55 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 481.2, + "completions/max_terminated_length": 481.2, + "completions/mean_length": 223.60703125, + "completions/mean_terminated_length": 223.60703125, + "completions/min_length": 75.6, + "completions/min_terminated_length": 75.6, + "epoch": 0.10577346848832085, + "grad_norm": 0.23258401790732933, + "kl": 0.00223388671875, + "learning_rate": 1e-06, + "loss": -0.0045, + "num_tokens": 8800407.0, + "reward": 0.74871985912323, + "reward_std": 0.09312780797481537, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/qatch_metrics/mean": 0.7043763160705566, + "rewards/qatch_metrics/std": 0.39227073788642886, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 60 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 487.8, + "completions/max_terminated_length": 487.8, + "completions/mean_length": 222.81015625, + "completions/mean_terminated_length": 222.81015625, + "completions/min_length": 77.4, + "completions/min_terminated_length": 77.4, + "epoch": 0.11458792419568092, + "grad_norm": 0.22445170455470606, + "kl": 0.002956390380859375, + "learning_rate": 1e-06, + "loss": 0.0057, + "num_tokens": 9557380.0, + "reward": 0.8077908515930176, + "reward_std": 0.09828853458166123, + "rewards/format_reward/mean": 0.9984375, + "rewards/format_reward/std": 0.025, + "rewards/qatch_metrics/mean": 0.774078369140625, + "rewards/qatch_metrics/std": 0.33206661343574523, + "rewards/tag_count_reward/mean": 0.999609375, + "rewards/tag_count_reward/std": 0.00625, + "step": 65 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 492.6, + "completions/max_terminated_length": 492.6, + "completions/mean_length": 231.83984375, + "completions/mean_terminated_length": 231.83984375, + "completions/min_length": 94.6, + "completions/min_terminated_length": 94.6, + "epoch": 0.12340237990304098, + "grad_norm": 0.22832903725685313, + "kl": 0.00381317138671875, + "learning_rate": 1e-06, + "loss": 0.0025, + "num_tokens": 10339127.0, + "reward": 0.7895300030708313, + "reward_std": 0.10415169298648834, + "rewards/format_reward/mean": 0.9984375, + "rewards/format_reward/std": 0.025, + "rewards/qatch_metrics/mean": 0.7526065230369567, + "rewards/qatch_metrics/std": 0.3542828977108002, + "rewards/tag_count_reward/mean": 0.9994140625, + "rewards/tag_count_reward/std": 0.009375, + "step": 70 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 521.8, + "completions/max_terminated_length": 521.8, + "completions/mean_length": 236.3125, + "completions/mean_terminated_length": 236.3125, + "completions/min_length": 80.4, + "completions/min_terminated_length": 80.4, + "epoch": 0.13221683561040107, + "grad_norm": 0.2597151805235052, + "kl": 0.00432281494140625, + "learning_rate": 1e-06, + "loss": 0.0083, + "num_tokens": 11147287.0, + "reward": 0.7333161950111389, + "reward_std": 0.08832715749740601, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/qatch_metrics/mean": 0.6862887978553772, + "rewards/qatch_metrics/std": 0.36336439847946167, + "rewards/tag_count_reward/mean": 0.9994140625, + "rewards/tag_count_reward/std": 0.0069767430424690245, + "step": 75 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 445.6, + "completions/max_terminated_length": 445.6, + "completions/mean_length": 216.43984375, + "completions/mean_terminated_length": 216.43984375, + "completions/min_length": 87.8, + "completions/min_terminated_length": 87.8, + "epoch": 0.14103129131776113, + "grad_norm": 0.2463929158667687, + "kl": 0.00528717041015625, + "learning_rate": 1e-06, + "loss": 0.0044, + "num_tokens": 11891066.0, + "reward": 0.8300724029541016, + "reward_std": 0.09615504890680313, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/qatch_metrics/mean": 0.8000851631164551, + "rewards/qatch_metrics/std": 0.3208737909793854, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 80 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 491.6, + "completions/max_terminated_length": 491.6, + "completions/mean_length": 225.32890625, + "completions/mean_terminated_length": 225.32890625, + "completions/min_length": 86.2, + "completions/min_terminated_length": 86.2, + "epoch": 0.1498457470251212, + "grad_norm": 0.22719354366888944, + "kl": 0.005328369140625, + "learning_rate": 1e-06, + "loss": 0.0129, + "num_tokens": 12668159.0, + "reward": 0.816937243938446, + "reward_std": 0.08283708170056343, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/qatch_metrics/mean": 0.7846320390701294, + "rewards/qatch_metrics/std": 0.32469419240951536, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 85 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 460.4, + "completions/max_terminated_length": 460.4, + "completions/mean_length": 217.92890625, + "completions/mean_terminated_length": 217.92890625, + "completions/min_length": 76.2, + "completions/min_terminated_length": 76.2, + "epoch": 0.15866020273248127, + "grad_norm": 0.2721517170479785, + "kl": 0.00579071044921875, + "learning_rate": 1e-06, + "loss": 0.0117, + "num_tokens": 13413588.0, + "reward": 0.7426301956176757, + "reward_std": 0.0905102699995041, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/qatch_metrics/mean": 0.6972119808197021, + "rewards/qatch_metrics/std": 0.37120566368103025, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 90 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 428.6, + "completions/max_terminated_length": 428.6, + "completions/mean_length": 204.6640625, + "completions/mean_terminated_length": 204.6640625, + "completions/min_length": 75.6, + "completions/min_terminated_length": 75.6, + "epoch": 0.16747465843984133, + "grad_norm": 0.2525985499058037, + "kl": 0.0056243896484375, + "learning_rate": 1e-06, + "loss": -0.0012, + "num_tokens": 14111606.0, + "reward": 0.7979554295539856, + "reward_std": 0.06609301418066024, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/qatch_metrics/mean": 0.7623119950294495, + "rewards/qatch_metrics/std": 0.34469759464263916, + "rewards/tag_count_reward/mean": 0.9998046875, + "rewards/tag_count_reward/std": 0.003125, + "step": 95 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 431.0, + "completions/max_terminated_length": 431.0, + "completions/mean_length": 212.34765625, + "completions/mean_terminated_length": 212.34765625, + "completions/min_length": 69.2, + "completions/min_terminated_length": 69.2, + "epoch": 0.17628911414720141, + "grad_norm": 0.30357672091416305, + "kl": 0.0057861328125, + "learning_rate": 1e-06, + "loss": 0.0083, + "num_tokens": 14876659.0, + "reward": 0.7724857568740845, + "reward_std": 0.09265935122966766, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/qatch_metrics/mean": 0.7323476672172546, + "rewards/qatch_metrics/std": 0.33567925691604616, + "rewards/tag_count_reward/mean": 0.9998046875, + "rewards/tag_count_reward/std": 0.003125, + "step": 100 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 463.4, + "completions/max_terminated_length": 463.4, + "completions/mean_length": 216.46875, + "completions/mean_terminated_length": 216.46875, + "completions/min_length": 80.4, + "completions/min_terminated_length": 80.4, + "epoch": 0.18510356985456147, + "grad_norm": 0.23780324977532238, + "kl": 0.0056549072265625, + "learning_rate": 1e-06, + "loss": -0.0087, + "num_tokens": 15600331.0, + "reward": 0.7508906722068787, + "reward_std": 0.0951332688331604, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/qatch_metrics/mean": 0.7069302201271057, + "rewards/qatch_metrics/std": 0.38108278512954713, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 105 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 442.4, + "completions/max_terminated_length": 442.4, + "completions/mean_length": 216.578125, + "completions/mean_terminated_length": 216.578125, + "completions/min_length": 80.2, + "completions/min_terminated_length": 80.2, + "epoch": 0.19391802556192156, + "grad_norm": 0.21716869090526136, + "kl": 0.0054229736328125, + "learning_rate": 1e-06, + "loss": -0.0045, + "num_tokens": 16326015.0, + "reward": 0.8402611017227173, + "reward_std": 0.05716411247849464, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/qatch_metrics/mean": 0.8120718836784363, + "rewards/qatch_metrics/std": 0.2929441839456558, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 110 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 428.2, + "completions/max_terminated_length": 428.2, + "completions/mean_length": 222.0265625, + "completions/mean_terminated_length": 222.0265625, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, + "epoch": 0.20273248126928162, + "grad_norm": 0.22835452896575356, + "kl": 0.0060882568359375, + "learning_rate": 1e-06, + "loss": -0.0017, + "num_tokens": 17091921.0, + "reward": 0.8265595078468323, + "reward_std": 0.07398260906338691, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/qatch_metrics/mean": 0.7959523558616638, + "rewards/qatch_metrics/std": 0.3277123510837555, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 115 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 459.8, + "completions/max_terminated_length": 459.8, + "completions/mean_length": 220.7453125, + "completions/mean_terminated_length": 220.7453125, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "epoch": 0.2115469369766417, + "grad_norm": 0.22726862373109216, + "kl": 0.006689453125, + "learning_rate": 1e-06, + "loss": 0.0043, + "num_tokens": 17877371.0, + "reward": 0.8397867679595947, + "reward_std": 0.09087342023849487, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/qatch_metrics/mean": 0.8115137934684753, + "rewards/qatch_metrics/std": 0.3017837733030319, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 120 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 491.4, + "completions/max_terminated_length": 491.4, + "completions/mean_length": 225.2140625, + "completions/mean_terminated_length": 225.2140625, + "completions/min_length": 75.4, + "completions/min_terminated_length": 75.4, + "epoch": 0.22036139268400176, + "grad_norm": 0.2004953082769917, + "kl": 0.00776519775390625, + "learning_rate": 1e-06, + "loss": -0.0056, + "num_tokens": 18623005.0, + "reward": 0.8202541828155517, + "reward_std": 0.07537120208144188, + "rewards/format_reward/mean": 0.99921875, + "rewards/format_reward/std": 0.0125, + "rewards/qatch_metrics/mean": 0.7886492252349854, + "rewards/qatch_metrics/std": 0.32776339948177335, + "rewards/tag_count_reward/mean": 0.999609375, + "rewards/tag_count_reward/std": 0.00625, + "step": 125 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 456.8, + "completions/max_terminated_length": 456.8, + "completions/mean_length": 223.48203125, + "completions/mean_terminated_length": 223.48203125, + "completions/min_length": 78.2, + "completions/min_terminated_length": 78.2, + "epoch": 0.22917584839136185, + "grad_norm": 0.2341532579835068, + "kl": 0.00804290771484375, + "learning_rate": 1e-06, + "loss": 0.0096, + "num_tokens": 19349606.0, + "reward": 0.8026262044906616, + "reward_std": 0.06839245334267616, + "rewards/format_reward/mean": 0.99921875, + "rewards/format_reward/std": 0.0125, + "rewards/qatch_metrics/mean": 0.7679218888282776, + "rewards/qatch_metrics/std": 0.3324147403240204, + "rewards/tag_count_reward/mean": 0.9994140625, + "rewards/tag_count_reward/std": 0.009375, + "step": 130 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 458.4, + "completions/max_terminated_length": 458.4, + "completions/mean_length": 216.72578125, + "completions/mean_terminated_length": 216.72578125, + "completions/min_length": 86.2, + "completions/min_terminated_length": 86.2, + "epoch": 0.2379903040987219, + "grad_norm": 0.23655650548465582, + "kl": 0.0078033447265625, + "learning_rate": 1e-06, + "loss": 0.0007, + "num_tokens": 20092311.0, + "reward": 0.8197526335716248, + "reward_std": 0.0839143767952919, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/qatch_metrics/mean": 0.7879442930221557, + "rewards/qatch_metrics/std": 0.3431123554706573, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 135 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 454.6, + "completions/max_terminated_length": 454.6, + "completions/mean_length": 204.48984375, + "completions/mean_terminated_length": 204.48984375, + "completions/min_length": 79.0, + "completions/min_terminated_length": 79.0, + "epoch": 0.24680475980608196, + "grad_norm": 0.2641797202959811, + "kl": 0.00862884521484375, + "learning_rate": 1e-06, + "loss": 0.0051, + "num_tokens": 20821962.0, + "reward": 0.8242111682891846, + "reward_std": 0.07407020255923272, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/qatch_metrics/mean": 0.7931895971298217, + "rewards/qatch_metrics/std": 0.3176054835319519, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 140 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 443.2, + "completions/max_terminated_length": 443.2, + "completions/mean_length": 203.590625, + "completions/mean_terminated_length": 203.590625, + "completions/min_length": 86.6, + "completions/min_terminated_length": 86.6, + "epoch": 0.255619215513442, + "grad_norm": 0.263066002535131, + "kl": 0.009637451171875, + "learning_rate": 1e-06, + "loss": 0.0065, + "num_tokens": 21526046.0, + "reward": 0.7875781059265137, + "reward_std": 0.09901705384254456, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/qatch_metrics/mean": 0.7501148462295533, + "rewards/qatch_metrics/std": 0.3672972857952118, + "rewards/tag_count_reward/mean": 0.999609375, + "rewards/tag_count_reward/std": 0.00625, + "step": 145 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 448.4, + "completions/max_terminated_length": 448.4, + "completions/mean_length": 208.90546875, + "completions/mean_terminated_length": 208.90546875, + "completions/min_length": 73.2, + "completions/min_terminated_length": 73.2, + "epoch": 0.26443367122080214, + "grad_norm": 0.2798500312218402, + "kl": 0.01026153564453125, + "learning_rate": 1e-06, + "loss": 0.0003, + "num_tokens": 22271333.0, + "reward": 0.818337082862854, + "reward_std": 0.07784928977489472, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/qatch_metrics/mean": 0.7862788915634156, + "rewards/qatch_metrics/std": 0.3341992735862732, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 150 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 494.2, + "completions/max_terminated_length": 494.2, + "completions/mean_length": 209.651953125, + "completions/mean_terminated_length": 209.651953125, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "epoch": 0.5464962538563244, + "grad_norm": 0.2122029879190087, + "kl": 0.010993194580078126, + "learning_rate": 1e-06, + "loss": 0.0126, + "num_tokens": 23726666.0, + "reward": 0.811666476726532, + "reward_std": 0.0841904804110527, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/qatch_metrics/mean": 0.7784311413764954, + "rewards/qatch_metrics/std": 0.32770459055900575, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 155 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 452.6, + "completions/max_terminated_length": 452.6, + "completions/mean_length": 217.9859375, + "completions/mean_terminated_length": 217.9859375, + "completions/min_length": 75.8, + "completions/min_terminated_length": 75.8, + "epoch": 0.5641251652710445, + "grad_norm": 0.15403477284537095, + "kl": 0.00980377197265625, + "learning_rate": 1e-06, + "loss": 0.0008, + "num_tokens": 25239750.0, + "reward": 0.7868865132331848, + "reward_std": 0.07244862839579583, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/qatch_metrics/mean": 0.7492782354354859, + "rewards/qatch_metrics/std": 0.3493395745754242, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 160 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 511.0, + "completions/max_terminated_length": 511.0, + "completions/mean_length": 208.38984375, + "completions/mean_terminated_length": 208.38984375, + "completions/min_length": 58.4, + "completions/min_terminated_length": 58.4, + "epoch": 0.5817540766857646, + "grad_norm": 0.18706575889421317, + "kl": 0.00914154052734375, + "learning_rate": 1e-06, + "loss": 0.0072, + "num_tokens": 26687596.0, + "reward": 0.828769075870514, + "reward_std": 0.07729479111731052, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/qatch_metrics/mean": 0.7985518336296081, + "rewards/qatch_metrics/std": 0.29670341312885284, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 165 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 505.0, + "completions/max_terminated_length": 505.0, + "completions/mean_length": 206.281640625, + "completions/mean_terminated_length": 206.281640625, + "completions/min_length": 79.0, + "completions/min_terminated_length": 79.0, + "epoch": 0.5993829881004848, + "grad_norm": 0.19776450858978561, + "kl": 0.01090240478515625, + "learning_rate": 1e-06, + "loss": 0.0105, + "num_tokens": 28175773.0, + "reward": 0.8511051416397095, + "reward_std": 0.07431531846523284, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/qatch_metrics/mean": 0.8248295664787293, + "rewards/qatch_metrics/std": 0.3192874014377594, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 170 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 462.2, + "completions/max_terminated_length": 462.2, + "completions/mean_length": 219.3890625, + "completions/mean_terminated_length": 219.3890625, + "completions/min_length": 70.8, + "completions/min_terminated_length": 70.8, + "epoch": 0.617011899515205, + "grad_norm": 0.15290022120008429, + "kl": 0.01065216064453125, + "learning_rate": 1e-06, + "loss": 0.0047, + "num_tokens": 29739969.0, + "reward": 0.8426113128662109, + "reward_std": 0.09004694148898125, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/qatch_metrics/mean": 0.814836847782135, + "rewards/qatch_metrics/std": 0.309688937664032, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 175 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 458.8, + "completions/max_terminated_length": 458.8, + "completions/mean_length": 211.655078125, + "completions/mean_terminated_length": 211.655078125, + "completions/min_length": 73.4, + "completions/min_terminated_length": 73.4, + "epoch": 0.6346408109299251, + "grad_norm": 0.17923424569681315, + "kl": 0.0114501953125, + "learning_rate": 1e-06, + "loss": 0.01, + "num_tokens": 31191502.0, + "reward": 0.8262084484100342, + "reward_std": 0.08637549504637718, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/qatch_metrics/mean": 0.7955393195152283, + "rewards/qatch_metrics/std": 0.3134327620267868, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 180 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 499.2, + "completions/max_terminated_length": 499.2, + "completions/mean_length": 215.95546875, + "completions/mean_terminated_length": 215.95546875, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, + "epoch": 0.6522697223446452, + "grad_norm": 0.1321015357675111, + "kl": 0.012237548828125, + "learning_rate": 1e-06, + "loss": 0.0021, + "num_tokens": 32694108.0, + "reward": 0.7994898676872253, + "reward_std": 0.08254800513386726, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/qatch_metrics/mean": 0.764105749130249, + "rewards/qatch_metrics/std": 0.3532308578491211, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 185 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 442.2, + "completions/max_terminated_length": 442.2, + "completions/mean_length": 209.90078125, + "completions/mean_terminated_length": 209.90078125, + "completions/min_length": 76.6, + "completions/min_terminated_length": 76.6, + "epoch": 0.6698986337593653, + "grad_norm": 0.22256806005967145, + "kl": 0.01057586669921875, + "learning_rate": 1e-06, + "loss": 0.0013, + "num_tokens": 34144670.0, + "reward": 0.7911163926124573, + "reward_std": 0.06518566869199276, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/qatch_metrics/mean": 0.7542545795440674, + "rewards/qatch_metrics/std": 0.35398219227790834, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 190 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 476.2, + "completions/max_terminated_length": 476.2, + "completions/mean_length": 208.534765625, + "completions/mean_terminated_length": 208.534765625, + "completions/min_length": 77.8, + "completions/min_terminated_length": 77.8, + "epoch": 0.6875275451740855, + "grad_norm": 0.17237028945675698, + "kl": 0.0087860107421875, + "learning_rate": 1e-06, + "loss": 0.0069, + "num_tokens": 35620023.0, + "reward": 0.8418472170829773, + "reward_std": 0.08243692219257355, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/qatch_metrics/mean": 0.8139379024505615, + "rewards/qatch_metrics/std": 0.336453515291214, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 195 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 514.6, + "completions/max_terminated_length": 514.6, + "completions/mean_length": 217.3328125, + "completions/mean_terminated_length": 217.3328125, + "completions/min_length": 90.4, + "completions/min_terminated_length": 90.4, + "epoch": 0.7051564565888057, + "grad_norm": 0.19274445010407998, + "kl": 0.009130859375, + "learning_rate": 1e-06, + "loss": 0.0053, + "num_tokens": 37166635.0, + "reward": 0.8295193314552307, + "reward_std": 0.06927115023136139, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/qatch_metrics/mean": 0.7994345307350159, + "rewards/qatch_metrics/std": 0.3011426508426666, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 200 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 499.8, + "completions/max_terminated_length": 499.8, + "completions/mean_length": 212.651171875, + "completions/mean_terminated_length": 212.651171875, + "completions/min_length": 68.6, + "completions/min_terminated_length": 68.6, + "epoch": 0.7227853680035258, + "grad_norm": 0.13990900967805797, + "kl": 0.0087432861328125, + "learning_rate": 1e-06, + "loss": -0.0027, + "num_tokens": 38617966.0, + "reward": 0.8151894211769104, + "reward_std": 0.07495353966951371, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/qatch_metrics/mean": 0.7825757980346679, + "rewards/qatch_metrics/std": 0.33874245882034304, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 205 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 560.4, + "completions/max_terminated_length": 560.4, + "completions/mean_length": 223.7015625, + "completions/mean_terminated_length": 223.7015625, + "completions/min_length": 74.6, + "completions/min_terminated_length": 74.6, + "epoch": 0.7404142794182459, + "grad_norm": 0.20163985914598806, + "kl": 0.00806884765625, + "learning_rate": 1e-06, + "loss": 0.0054, + "num_tokens": 40092050.0, + "reward": 0.8460610270500183, + "reward_std": 0.05867695920169354, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/qatch_metrics/mean": 0.8188953161239624, + "rewards/qatch_metrics/std": 0.3239317536354065, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 210 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 486.6, + "completions/max_terminated_length": 486.6, + "completions/mean_length": 215.6828125, + "completions/mean_terminated_length": 215.6828125, + "completions/min_length": 82.2, + "completions/min_terminated_length": 82.2, + "epoch": 0.7580431908329661, + "grad_norm": 0.17564998217230318, + "kl": 0.009525299072265625, + "learning_rate": 1e-06, + "loss": -0.0034, + "num_tokens": 41565542.0, + "reward": 0.799136507511139, + "reward_std": 0.06419738680124283, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/qatch_metrics/mean": 0.7636899828910828, + "rewards/qatch_metrics/std": 0.3342160403728485, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 215 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 503.4, + "completions/max_terminated_length": 503.4, + "completions/mean_length": 233.409765625, + "completions/mean_terminated_length": 233.409765625, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "epoch": 0.7756721022476862, + "grad_norm": 0.19283324501226842, + "kl": 0.009130096435546875, + "learning_rate": 1e-06, + "loss": 0.0109, + "num_tokens": 43081919.0, + "reward": 0.7851791024208069, + "reward_std": 0.07570808604359627, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/qatch_metrics/mean": 0.7472695469856262, + "rewards/qatch_metrics/std": 0.36822828054428103, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 220 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 448.4, + "completions/max_terminated_length": 448.4, + "completions/mean_length": 224.728125, + "completions/mean_terminated_length": 224.728125, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "epoch": 0.7933010136624064, + "grad_norm": 0.17754847688569442, + "kl": 0.009470367431640625, + "learning_rate": 1e-06, + "loss": -0.002, + "num_tokens": 44606439.0, + "reward": 0.8152384400367737, + "reward_std": 0.09764492362737656, + "rewards/format_reward/mean": 0.999609375, + "rewards/format_reward/std": 0.00883883461356163, + "rewards/qatch_metrics/mean": 0.7826851725578308, + "rewards/qatch_metrics/std": 0.3263732075691223, + "rewards/tag_count_reward/mean": 0.99990234375, + "rewards/tag_count_reward/std": 0.0022097086533904076, + "step": 225 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 503.2, + "completions/max_terminated_length": 503.2, + "completions/mean_length": 218.58203125, + "completions/mean_terminated_length": 218.58203125, + "completions/min_length": 69.6, + "completions/min_terminated_length": 69.6, + "epoch": 0.8109299250771265, + "grad_norm": 0.19017267970498908, + "kl": 0.009508514404296875, + "learning_rate": 1e-06, + "loss": -0.0009, + "num_tokens": 46095257.0, + "reward": 0.8068280577659607, + "reward_std": 0.0781441181898117, + "rewards/format_reward/mean": 0.999609375, + "rewards/format_reward/std": 0.00883883461356163, + "rewards/qatch_metrics/mean": 0.7728020906448364, + "rewards/qatch_metrics/std": 0.3386655867099762, + "rewards/tag_count_reward/mean": 0.99970703125, + "rewards/tag_count_reward/std": 0.006629125773906707, + "step": 230 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 456.6, + "completions/max_terminated_length": 456.6, + "completions/mean_length": 204.84375, + "completions/mean_terminated_length": 204.84375, + "completions/min_length": 72.6, + "completions/min_terminated_length": 72.6, + "epoch": 0.8285588364918466, + "grad_norm": 0.1678878918468119, + "kl": 0.009729766845703125, + "learning_rate": 1e-06, + "loss": 0.0041, + "num_tokens": 47519433.0, + "reward": 0.8672606706619262, + "reward_std": 0.0644603468477726, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/qatch_metrics/mean": 0.8438360691070557, + "rewards/qatch_metrics/std": 0.2717843741178513, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 235 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 526.2, + "completions/max_terminated_length": 526.2, + "completions/mean_length": 214.90625, + "completions/mean_terminated_length": 214.90625, + "completions/min_length": 66.8, + "completions/min_terminated_length": 66.8, + "epoch": 0.8461877479065668, + "grad_norm": 0.18169011669761398, + "kl": 0.01288604736328125, + "learning_rate": 1e-06, + "loss": 0.0035, + "num_tokens": 48943993.0, + "reward": 0.8558493018150329, + "reward_std": 0.07027828097343444, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/qatch_metrics/mean": 0.8304109454154969, + "rewards/qatch_metrics/std": 0.301141357421875, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 240 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 472.6, + "completions/max_terminated_length": 472.6, + "completions/mean_length": 212.559765625, + "completions/mean_terminated_length": 212.559765625, + "completions/min_length": 76.8, + "completions/min_terminated_length": 76.8, + "epoch": 0.8638166593212869, + "grad_norm": 0.2046340854229955, + "kl": 0.01494140625, + "learning_rate": 1e-06, + "loss": 0.006, + "num_tokens": 50416114.0, + "reward": 0.831060528755188, + "reward_std": 0.07754805404692888, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/qatch_metrics/mean": 0.8012476563453674, + "rewards/qatch_metrics/std": 0.3293557226657867, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 245 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 503.2, + "completions/max_terminated_length": 503.2, + "completions/mean_length": 222.1375, + "completions/mean_terminated_length": 222.1375, + "completions/min_length": 83.2, + "completions/min_terminated_length": 83.2, + "epoch": 0.881445570736007, + "grad_norm": 0.15161264539796646, + "kl": 0.0138031005859375, + "learning_rate": 1e-06, + "loss": 0.0018, + "num_tokens": 51932274.0, + "reward": 0.8422249555587769, + "reward_std": 0.06234893724322319, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/qatch_metrics/mean": 0.8143823027610779, + "rewards/qatch_metrics/std": 0.2993943512439728, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 250 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 463.6, + "completions/max_terminated_length": 463.6, + "completions/mean_length": 231.19609375, + "completions/mean_terminated_length": 231.19609375, + "completions/min_length": 77.4, + "completions/min_terminated_length": 77.4, + "epoch": 0.8990744821507272, + "grad_norm": 0.20035266636054513, + "kl": 0.011871337890625, + "learning_rate": 1e-06, + "loss": -0.0003, + "num_tokens": 53450248.0, + "reward": 0.8096501588821411, + "reward_std": 0.06698438860476016, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/qatch_metrics/mean": 0.7760589838027954, + "rewards/qatch_metrics/std": 0.3199191153049469, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 255 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 471.2, + "completions/max_terminated_length": 471.2, + "completions/mean_length": 237.80234375, + "completions/mean_terminated_length": 237.80234375, + "completions/min_length": 82.2, + "completions/min_terminated_length": 82.2, + "epoch": 0.9167033935654474, + "grad_norm": 0.0856229450795828, + "kl": 0.011614227294921875, + "learning_rate": 1e-06, + "loss": -0.0017, + "num_tokens": 54970542.0, + "reward": 0.8725608706474304, + "reward_std": 0.051827043667435645, + "rewards/format_reward/mean": 0.999609375, + "rewards/format_reward/std": 0.00883883461356163, + "rewards/qatch_metrics/mean": 0.8501232981681823, + "rewards/qatch_metrics/std": 0.26386110931634904, + "rewards/tag_count_reward/mean": 0.99990234375, + "rewards/tag_count_reward/std": 0.0022097086533904076, + "step": 260 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 476.6, + "completions/max_terminated_length": 476.6, + "completions/mean_length": 231.53671875, + "completions/mean_terminated_length": 231.53671875, + "completions/min_length": 79.0, + "completions/min_terminated_length": 79.0, + "epoch": 0.9343323049801675, + "grad_norm": 0.17178453068271043, + "kl": 0.010117340087890624, + "learning_rate": 1e-06, + "loss": 0.0063, + "num_tokens": 56485356.0, + "reward": 0.8532873392105103, + "reward_std": 0.07009301483631133, + "rewards/format_reward/mean": 0.999609375, + "rewards/format_reward/std": 0.00883883461356163, + "rewards/qatch_metrics/mean": 0.8274485826492309, + "rewards/qatch_metrics/std": 0.31240676045417787, + "rewards/tag_count_reward/mean": 0.99990234375, + "rewards/tag_count_reward/std": 0.0022097086533904076, + "step": 265 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 459.6, + "completions/max_terminated_length": 459.6, + "completions/mean_length": 220.95234375, + "completions/mean_terminated_length": 220.95234375, + "completions/min_length": 68.6, + "completions/min_terminated_length": 68.6, + "epoch": 0.9519612163948876, + "grad_norm": 0.15364550208264494, + "kl": 0.00984039306640625, + "learning_rate": 1e-06, + "loss": -0.0031, + "num_tokens": 57953010.0, + "reward": 0.868242597579956, + "reward_std": 0.06916632130742073, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/qatch_metrics/mean": 0.8449912905693054, + "rewards/qatch_metrics/std": 0.2899660974740982, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 270 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 423.2, + "completions/max_terminated_length": 423.2, + "completions/mean_length": 225.621875, + "completions/mean_terminated_length": 225.621875, + "completions/min_length": 88.8, + "completions/min_terminated_length": 88.8, + "epoch": 0.48479506390480387, + "grad_norm": 0.17697767584196022, + "kl": 0.00970916748046875, + "learning_rate": 1e-06, + "loss": 0.0059, + "num_tokens": 58736110.0, + "reward": 0.8460039258003235, + "reward_std": 0.055821475386619565, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/qatch_metrics/mean": 0.8188281297683716, + "rewards/qatch_metrics/std": 0.30660555958747865, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 275 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 504.0, + "completions/max_terminated_length": 504.0, + "completions/mean_length": 223.92734375, + "completions/mean_terminated_length": 223.92734375, + "completions/min_length": 77.8, + "completions/min_terminated_length": 77.8, + "epoch": 0.4936095196121639, + "grad_norm": 0.2692630701899735, + "kl": 0.0131378173828125, + "learning_rate": 1e-06, + "loss": 0.0021, + "num_tokens": 59498897.0, + "reward": 0.7988754034042358, + "reward_std": 0.08376505076885224, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/qatch_metrics/mean": 0.7633828163146973, + "rewards/qatch_metrics/std": 0.3335907101631165, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 280 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 434.8, + "completions/max_terminated_length": 434.8, + "completions/mean_length": 223.48125, + "completions/mean_terminated_length": 223.48125, + "completions/min_length": 83.2, + "completions/min_terminated_length": 83.2, + "epoch": 0.502423975319524, + "grad_norm": 0.2666009697829767, + "kl": 0.0107269287109375, + "learning_rate": 1e-06, + "loss": 0.0007, + "num_tokens": 60277897.0, + "reward": 0.7720089554786682, + "reward_std": 0.0594131164252758, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/qatch_metrics/mean": 0.7317867279052734, + "rewards/qatch_metrics/std": 0.33845625519752504, + "rewards/tag_count_reward/mean": 0.9998046875, + "rewards/tag_count_reward/std": 0.003125, + "step": 285 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 481.0, + "completions/max_terminated_length": 481.0, + "completions/mean_length": 219.62421875, + "completions/mean_terminated_length": 219.62421875, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "epoch": 0.511238431026884, + "grad_norm": 0.16876063412105669, + "kl": 0.01141357421875, + "learning_rate": 1e-06, + "loss": 0.0033, + "num_tokens": 61033560.0, + "reward": 0.7902166962623596, + "reward_std": 0.0687429528683424, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/qatch_metrics/mean": 0.7531961083412171, + "rewards/qatch_metrics/std": 0.37054654359817507, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 290 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 471.2, + "completions/max_terminated_length": 471.2, + "completions/mean_length": 226.275, + "completions/mean_terminated_length": 226.275, + "completions/min_length": 80.6, + "completions/min_terminated_length": 80.6, + "epoch": 0.5200528867342442, + "grad_norm": 0.26818466602074054, + "kl": 0.0130706787109375, + "learning_rate": 1e-06, + "loss": 0.0004, + "num_tokens": 61786008.0, + "reward": 0.7699209451675415, + "reward_std": 0.07550354823470115, + "rewards/format_reward/mean": 0.99921875, + "rewards/format_reward/std": 0.0125, + "rewards/qatch_metrics/mean": 0.7294221520423889, + "rewards/qatch_metrics/std": 0.3492735385894775, + "rewards/tag_count_reward/mean": 0.9998046875, + "rewards/tag_count_reward/std": 0.003125, + "step": 295 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 484.6, + "completions/max_terminated_length": 484.6, + "completions/mean_length": 247.2328125, + "completions/mean_terminated_length": 247.2328125, + "completions/min_length": 95.8, + "completions/min_terminated_length": 95.8, + "epoch": 0.5288673424416043, + "grad_norm": 0.16485515882678206, + "kl": 0.0113006591796875, + "learning_rate": 1e-06, + "loss": 0.0007, + "num_tokens": 62590434.0, + "reward": 0.8454334974288941, + "reward_std": 0.0570029616355896, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/qatch_metrics/mean": 0.8181570172309875, + "rewards/qatch_metrics/std": 0.2992805689573288, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 300 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 480.6, + "completions/max_terminated_length": 480.6, + "completions/mean_length": 235.16015625, + "completions/mean_terminated_length": 235.16015625, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "epoch": 0.5376817981489643, + "grad_norm": 0.27561378534620606, + "kl": 0.01141510009765625, + "learning_rate": 1e-06, + "loss": 0.0097, + "num_tokens": 63366287.0, + "reward": 0.8380108118057251, + "reward_std": 0.07530387155711651, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/qatch_metrics/mean": 0.8094244718551635, + "rewards/qatch_metrics/std": 0.30977231860160825, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 305 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 458.6, + "completions/max_terminated_length": 458.6, + "completions/mean_length": 215.646875, + "completions/mean_terminated_length": 215.646875, + "completions/min_length": 79.4, + "completions/min_terminated_length": 79.4, + "epoch": 0.5464962538563244, + "grad_norm": 0.2018916915779266, + "kl": 0.013714599609375, + "learning_rate": 1e-06, + "loss": -0.0045, + "num_tokens": 64097387.0, + "reward": 0.8135073184967041, + "reward_std": 0.05950811579823494, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/qatch_metrics/mean": 0.7806198120117187, + "rewards/qatch_metrics/std": 0.33523867428302767, + "rewards/tag_count_reward/mean": 0.999609375, + "rewards/tag_count_reward/std": 0.00625, + "step": 310 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 447.4, + "completions/max_terminated_length": 447.4, + "completions/mean_length": 223.68515625, + "completions/mean_terminated_length": 223.68515625, + "completions/min_length": 92.4, + "completions/min_terminated_length": 92.4, + "epoch": 0.5553107095636844, + "grad_norm": 0.1836962735356692, + "kl": 0.0138214111328125, + "learning_rate": 1e-06, + "loss": -0.0024, + "num_tokens": 64869416.0, + "reward": 0.8333834052085877, + "reward_std": 0.07006162852048874, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/qatch_metrics/mean": 0.8039804816246032, + "rewards/qatch_metrics/std": 0.3219245493412018, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 315 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 504.8, + "completions/max_terminated_length": 504.8, + "completions/mean_length": 221.45390625, + "completions/mean_terminated_length": 221.45390625, + "completions/min_length": 80.0, + "completions/min_terminated_length": 80.0, + "epoch": 0.5641251652710445, + "grad_norm": 0.23250178423343035, + "kl": 0.01497802734375, + "learning_rate": 1e-06, + "loss": -0.0014, + "num_tokens": 65613165.0, + "reward": 0.8320096850395202, + "reward_std": 0.053499556705355646, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/qatch_metrics/mean": 0.8023643255233764, + "rewards/qatch_metrics/std": 0.3343039393424988, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 320 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 466.6, + "completions/max_terminated_length": 466.6, + "completions/mean_length": 220.3984375, + "completions/mean_terminated_length": 220.3984375, + "completions/min_length": 72.6, + "completions/min_terminated_length": 72.6, + "epoch": 0.5729396209784046, + "grad_norm": 0.09740281424559781, + "kl": 0.0155609130859375, + "learning_rate": 1e-06, + "loss": -0.0012, + "num_tokens": 66336475.0, + "reward": 0.8796087980270386, + "reward_std": 0.05236431676894426, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/qatch_metrics/mean": 0.8583632946014405, + "rewards/qatch_metrics/std": 0.2817832052707672, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 325 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 459.4, + "completions/max_terminated_length": 459.4, + "completions/mean_length": 225.27890625, + "completions/mean_terminated_length": 225.27890625, + "completions/min_length": 78.6, + "completions/min_terminated_length": 78.6, + "epoch": 0.5817540766857646, + "grad_norm": 0.08354955287926201, + "kl": 0.01513671875, + "learning_rate": 1e-06, + "loss": 0.0095, + "num_tokens": 67098736.0, + "reward": 0.8658102512359619, + "reward_std": 0.07466748803853988, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/qatch_metrics/mean": 0.8421296954154969, + "rewards/qatch_metrics/std": 0.2614422976970673, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 330 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 447.2, + "completions/max_terminated_length": 447.2, + "completions/mean_length": 220.01875, + "completions/mean_terminated_length": 220.01875, + "completions/min_length": 81.8, + "completions/min_terminated_length": 81.8, + "epoch": 0.5905685323931247, + "grad_norm": 0.20574209747901576, + "kl": 0.015081787109375, + "learning_rate": 1e-06, + "loss": 0.011, + "num_tokens": 67847928.0, + "reward": 0.865822184085846, + "reward_std": 0.046268445625901225, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/qatch_metrics/mean": 0.8421437621116639, + "rewards/qatch_metrics/std": 0.29589260220527647, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 335 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 432.0, + "completions/max_terminated_length": 432.0, + "completions/mean_length": 213.5953125, + "completions/mean_terminated_length": 213.5953125, + "completions/min_length": 76.0, + "completions/min_terminated_length": 76.0, + "epoch": 0.5993829881004848, + "grad_norm": 0.2039975034177896, + "kl": 0.0161651611328125, + "learning_rate": 1e-06, + "loss": 0.0066, + "num_tokens": 68585234.0, + "reward": 0.8343551635742188, + "reward_std": 0.0688902921974659, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/qatch_metrics/mean": 0.8051237106323242, + "rewards/qatch_metrics/std": 0.30847290754318235, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 340 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 435.0, + "completions/max_terminated_length": 435.0, + "completions/mean_length": 203.69453125, + "completions/mean_terminated_length": 203.69453125, + "completions/min_length": 76.0, + "completions/min_terminated_length": 76.0, + "epoch": 0.6081974438078449, + "grad_norm": 0.26848084439203446, + "kl": 0.014788818359375, + "learning_rate": 1e-06, + "loss": 0.0023, + "num_tokens": 69338379.0, + "reward": 0.8848124146461487, + "reward_std": 0.06373886093497276, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/qatch_metrics/mean": 0.8644851684570313, + "rewards/qatch_metrics/std": 0.26705425381660464, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 345 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 446.8, + "completions/max_terminated_length": 446.8, + "completions/mean_length": 221.1046875, + "completions/mean_terminated_length": 221.1046875, + "completions/min_length": 79.4, + "completions/min_terminated_length": 79.4, + "epoch": 0.617011899515205, + "grad_norm": 0.2363792510293019, + "kl": 0.019024658203125, + "learning_rate": 1e-06, + "loss": -0.002, + "num_tokens": 70095473.0, + "reward": 0.8130708336830139, + "reward_std": 0.08477363213896752, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/qatch_metrics/mean": 0.7800833344459533, + "rewards/qatch_metrics/std": 0.3211198329925537, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 350 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 458.4, + "completions/max_terminated_length": 458.4, + "completions/mean_length": 234.00078125, + "completions/mean_terminated_length": 234.00078125, + "completions/min_length": 91.4, + "completions/min_terminated_length": 91.4, + "epoch": 0.625826355222565, + "grad_norm": 0.1856420640121193, + "kl": 0.019122314453125, + "learning_rate": 1e-06, + "loss": -0.0036, + "num_tokens": 70860290.0, + "reward": 0.8471660256385803, + "reward_std": 0.0506692998111248, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/qatch_metrics/mean": 0.8201953172683716, + "rewards/qatch_metrics/std": 0.30663308799266814, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 355 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 452.8, + "completions/max_terminated_length": 452.8, + "completions/mean_length": 241.72421875, + "completions/mean_terminated_length": 241.72421875, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "epoch": 0.6346408109299251, + "grad_norm": 0.22939974521057024, + "kl": 0.01826171875, + "learning_rate": 1e-06, + "loss": 0.0127, + "num_tokens": 71648401.0, + "reward": 0.8702264785766601, + "reward_std": 0.0592925101518631, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/qatch_metrics/mean": 0.8473252534866333, + "rewards/qatch_metrics/std": 0.28537269234657286, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 360 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 434.6, + "completions/max_terminated_length": 434.6, + "completions/mean_length": 215.853125, + "completions/mean_terminated_length": 215.853125, + "completions/min_length": 75.0, + "completions/min_terminated_length": 75.0, + "epoch": 0.6434552666372851, + "grad_norm": 0.19883621919511643, + "kl": 0.0163330078125, + "learning_rate": 1e-06, + "loss": 0.0073, + "num_tokens": 72382693.0, + "reward": 0.8091506719589233, + "reward_std": 0.0635421834886074, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/qatch_metrics/mean": 0.7754713773727417, + "rewards/qatch_metrics/std": 0.3179103255271912, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 365 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 496.8, + "completions/max_terminated_length": 496.8, + "completions/mean_length": 213.1640625, + "completions/mean_terminated_length": 213.1640625, + "completions/min_length": 76.2, + "completions/min_terminated_length": 76.2, + "epoch": 0.6522697223446452, + "grad_norm": 0.1916457590662772, + "kl": 0.0175506591796875, + "learning_rate": 1e-06, + "loss": -0.0029, + "num_tokens": 73161111.0, + "reward": 0.8094798445701599, + "reward_std": 0.04875086285173893, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/qatch_metrics/mean": 0.7758586168289184, + "rewards/qatch_metrics/std": 0.32606661319732666, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 370 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 480.4, + "completions/max_terminated_length": 480.4, + "completions/mean_length": 222.75, + "completions/mean_terminated_length": 222.75, + "completions/min_length": 72.6, + "completions/min_terminated_length": 72.6, + "epoch": 0.6610841780520053, + "grad_norm": 0.15787517122504152, + "kl": 0.0181884765625, + "learning_rate": 1e-06, + "loss": 0.0018, + "num_tokens": 73905591.0, + "reward": 0.89048171043396, + "reward_std": 0.04932568361982703, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/qatch_metrics/mean": 0.8711549639701843, + "rewards/qatch_metrics/std": 0.2736783862113953, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 375 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 561.4, + "completions/max_terminated_length": 561.4, + "completions/mean_length": 234.4390625, + "completions/mean_terminated_length": 234.4390625, + "completions/min_length": 75.2, + "completions/min_terminated_length": 75.2, + "epoch": 0.6698986337593653, + "grad_norm": 0.2653930596733297, + "kl": 0.0174713134765625, + "learning_rate": 1e-06, + "loss": 0.003, + "num_tokens": 74679801.0, + "reward": 0.8243065714836121, + "reward_std": 0.06958894729614258, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/qatch_metrics/mean": 0.7933018207550049, + "rewards/qatch_metrics/std": 0.3086866676807404, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 380 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 511.8, + "completions/max_terminated_length": 511.8, + "completions/mean_length": 243.73359375, + "completions/mean_terminated_length": 243.73359375, + "completions/min_length": 81.6, + "completions/min_terminated_length": 81.6, + "epoch": 0.6787130894667255, + "grad_norm": 0.20233916054675122, + "kl": 0.014093017578125, + "learning_rate": 1e-06, + "loss": 0.0048, + "num_tokens": 75445892.0, + "reward": 0.8653998494148254, + "reward_std": 0.07132081612944603, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/qatch_metrics/mean": 0.8416468739509583, + "rewards/qatch_metrics/std": 0.3147186517715454, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 385 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 447.4, + "completions/max_terminated_length": 447.4, + "completions/mean_length": 228.43671875, + "completions/mean_terminated_length": 228.43671875, + "completions/min_length": 79.8, + "completions/min_terminated_length": 79.8, + "epoch": 0.6875275451740855, + "grad_norm": 0.29996778931865303, + "kl": 0.0146087646484375, + "learning_rate": 1e-06, + "loss": -0.0016, + "num_tokens": 76229251.0, + "reward": 0.8502862334251404, + "reward_std": 0.07314281612634659, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/qatch_metrics/mean": 0.8238661646842956, + "rewards/qatch_metrics/std": 0.3113024443387985, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 390 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 487.0, + "completions/max_terminated_length": 487.0, + "completions/mean_length": 249.8734375, + "completions/mean_terminated_length": 249.8734375, + "completions/min_length": 84.4, + "completions/min_terminated_length": 84.4, + "epoch": 0.6963420008814456, + "grad_norm": 0.2150032953896288, + "kl": 0.017156982421875, + "learning_rate": 1e-06, + "loss": -0.0029, + "num_tokens": 77021793.0, + "reward": 0.8494030237197876, + "reward_std": 0.05776047557592392, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/qatch_metrics/mean": 0.8228270888328553, + "rewards/qatch_metrics/std": 0.3020846724510193, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 395 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 471.4, + "completions/max_terminated_length": 471.4, + "completions/mean_length": 247.9828125, + "completions/mean_terminated_length": 247.9828125, + "completions/min_length": 84.4, + "completions/min_terminated_length": 84.4, + "epoch": 0.7051564565888057, + "grad_norm": 0.2754041387856829, + "kl": 0.0148590087890625, + "learning_rate": 1e-06, + "loss": 0.0065, + "num_tokens": 77833451.0, + "reward": 0.8363431453704834, + "reward_std": 0.06054745838046074, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/qatch_metrics/mean": 0.807462501525879, + "rewards/qatch_metrics/std": 0.29668720066547394, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 400 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 442.6, + "completions/max_terminated_length": 442.6, + "completions/mean_length": 225.4296875, + "completions/mean_terminated_length": 225.4296875, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "epoch": 0.7139709122961657, + "grad_norm": 0.22420011771594078, + "kl": 0.017706298828125, + "learning_rate": 1e-06, + "loss": 0.0019, + "num_tokens": 78585793.0, + "reward": 0.8382049560546875, + "reward_std": 0.05150428526103497, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/qatch_metrics/mean": 0.8096528768539428, + "rewards/qatch_metrics/std": 0.2925006330013275, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 405 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 618.4, + "completions/max_terminated_length": 618.4, + "completions/mean_length": 219.3421875, + "completions/mean_terminated_length": 219.3421875, + "completions/min_length": 84.6, + "completions/min_terminated_length": 84.6, + "epoch": 0.7227853680035258, + "grad_norm": 0.0986589707089894, + "kl": 0.0170196533203125, + "learning_rate": 1e-06, + "loss": 0.0022, + "num_tokens": 79352135.0, + "reward": 0.8465274453163147, + "reward_std": 0.05231629386544227, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/qatch_metrics/mean": 0.8194440126419067, + "rewards/qatch_metrics/std": 0.3004340440034866, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 410 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 470.6, + "completions/max_terminated_length": 470.6, + "completions/mean_length": 213.9921875, + "completions/mean_terminated_length": 213.9921875, + "completions/min_length": 83.6, + "completions/min_terminated_length": 83.6, + "epoch": 0.7315998237108858, + "grad_norm": 0.17969166348358623, + "kl": 0.01600341796875, + "learning_rate": 1e-06, + "loss": 0.0196, + "num_tokens": 80093021.0, + "reward": 0.7899853944778442, + "reward_std": 0.06183199286460876, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/qatch_metrics/mean": 0.7529239773750305, + "rewards/qatch_metrics/std": 0.32831716537475586, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 415 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 458.4, + "completions/max_terminated_length": 458.4, + "completions/mean_length": 208.25390625, + "completions/mean_terminated_length": 208.25390625, + "completions/min_length": 72.6, + "completions/min_terminated_length": 72.6, + "epoch": 0.7404142794182459, + "grad_norm": 0.12360613268228073, + "kl": 0.0170166015625, + "learning_rate": 1e-06, + "loss": -0.0011, + "num_tokens": 80810658.0, + "reward": 0.8781363725662231, + "reward_std": 0.04314489997923374, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/qatch_metrics/mean": 0.8566309928894043, + "rewards/qatch_metrics/std": 0.2832080274820328, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 420 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 447.2, + "completions/max_terminated_length": 447.2, + "completions/mean_length": 203.98046875, + "completions/mean_terminated_length": 203.98046875, + "completions/min_length": 79.2, + "completions/min_terminated_length": 79.2, + "epoch": 0.749228735125606, + "grad_norm": 0.210810313322166, + "kl": 0.0164581298828125, + "learning_rate": 1e-06, + "loss": 0.003, + "num_tokens": 81548361.0, + "reward": 0.8270991563796997, + "reward_std": 0.06941422820091248, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/qatch_metrics/mean": 0.7965872406959533, + "rewards/qatch_metrics/std": 0.33117216229438784, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 425 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 434.6, + "completions/max_terminated_length": 434.6, + "completions/mean_length": 220.75703125, + "completions/mean_terminated_length": 220.75703125, + "completions/min_length": 80.4, + "completions/min_terminated_length": 80.4, + "epoch": 0.7580431908329661, + "grad_norm": 0.21910688267881026, + "kl": 0.016754150390625, + "learning_rate": 1e-06, + "loss": -0.0012, + "num_tokens": 82290706.0, + "reward": 0.8464880228042603, + "reward_std": 0.04884184449911118, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/qatch_metrics/mean": 0.8193976640701294, + "rewards/qatch_metrics/std": 0.28375020921230315, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 430 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 451.8, + "completions/max_terminated_length": 451.8, + "completions/mean_length": 223.1234375, + "completions/mean_terminated_length": 223.1234375, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "epoch": 0.7668576465403262, + "grad_norm": 0.26253720274856984, + "kl": 0.0178009033203125, + "learning_rate": 1e-06, + "loss": -0.0023, + "num_tokens": 83056976.0, + "reward": 0.8096219301223755, + "reward_std": 0.07494284212589264, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/qatch_metrics/mean": 0.7760257959365845, + "rewards/qatch_metrics/std": 0.3492628037929535, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 435 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 501.2, + "completions/max_terminated_length": 501.2, + "completions/mean_length": 216.840625, + "completions/mean_terminated_length": 216.840625, + "completions/min_length": 88.8, + "completions/min_terminated_length": 88.8, + "epoch": 0.7756721022476862, + "grad_norm": 0.27647079947407377, + "kl": 0.0181732177734375, + "learning_rate": 1e-06, + "loss": 0.0012, + "num_tokens": 83805044.0, + "reward": 0.7776495218276978, + "reward_std": 0.056884029135108, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/qatch_metrics/mean": 0.7384112119674683, + "rewards/qatch_metrics/std": 0.3683965981006622, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 440 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 455.0, + "completions/max_terminated_length": 455.0, + "completions/mean_length": 216.46015625, + "completions/mean_terminated_length": 216.46015625, + "completions/min_length": 78.2, + "completions/min_terminated_length": 78.2, + "epoch": 0.7844865579550463, + "grad_norm": 0.20996305667402082, + "kl": 0.0163116455078125, + "learning_rate": 1e-06, + "loss": 0.0063, + "num_tokens": 84571313.0, + "reward": 0.8477118849754334, + "reward_std": 0.06959039457142353, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/qatch_metrics/mean": 0.8208375215530396, + "rewards/qatch_metrics/std": 0.30095059871673585, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 445 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 443.4, + "completions/max_terminated_length": 443.4, + "completions/mean_length": 211.00234375, + "completions/mean_terminated_length": 211.00234375, + "completions/min_length": 86.2, + "completions/min_terminated_length": 86.2, + "epoch": 0.7933010136624064, + "grad_norm": 0.15662206787116065, + "kl": 0.0160797119140625, + "learning_rate": 1e-06, + "loss": 0.0004, + "num_tokens": 85319188.0, + "reward": 0.8328658938407898, + "reward_std": 0.05801869332790375, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/qatch_metrics/mean": 0.8033716320991516, + "rewards/qatch_metrics/std": 0.3037038058042526, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 450 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 475.2, + "completions/max_terminated_length": 475.2, + "completions/mean_length": 222.7078125, + "completions/mean_terminated_length": 222.7078125, + "completions/min_length": 80.8, + "completions/min_terminated_length": 80.8, + "epoch": 0.8021154693697664, + "grad_norm": 0.19919629119501958, + "kl": 0.01639404296875, + "learning_rate": 1e-06, + "loss": 0.001, + "num_tokens": 86091774.0, + "reward": 0.8358211517333984, + "reward_std": 0.0607087716460228, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/qatch_metrics/mean": 0.8068713665008544, + "rewards/qatch_metrics/std": 0.30334635376930236, + "rewards/tag_count_reward/mean": 0.999609375, + "rewards/tag_count_reward/std": 0.00625, + "step": 455 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 418.4, + "completions/max_terminated_length": 418.4, + "completions/mean_length": 221.0015625, + "completions/mean_terminated_length": 221.0015625, + "completions/min_length": 80.4, + "completions/min_terminated_length": 80.4, + "epoch": 0.8109299250771265, + "grad_norm": 0.1419366062228353, + "kl": 0.01617431640625, + "learning_rate": 1e-06, + "loss": 0.0031, + "num_tokens": 86848528.0, + "reward": 0.8028954148292542, + "reward_std": 0.06934207193553447, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/qatch_metrics/mean": 0.7681122660636902, + "rewards/qatch_metrics/std": 0.3390295565128326, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 460 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 426.6, + "completions/max_terminated_length": 426.6, + "completions/mean_length": 210.6140625, + "completions/mean_terminated_length": 210.6140625, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "epoch": 0.8197443807844865, + "grad_norm": 0.16116384181364513, + "kl": 0.0162078857421875, + "learning_rate": 1e-06, + "loss": 0.013, + "num_tokens": 87564482.0, + "reward": 0.8424649000167846, + "reward_std": 0.040234316140413284, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/qatch_metrics/mean": 0.8146645903587342, + "rewards/qatch_metrics/std": 0.2840981811285019, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 465 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 447.6, + "completions/max_terminated_length": 447.6, + "completions/mean_length": 195.89140625, + "completions/mean_terminated_length": 195.89140625, + "completions/min_length": 80.8, + "completions/min_terminated_length": 80.8, + "epoch": 0.8285588364918466, + "grad_norm": 0.21075371504226795, + "kl": 0.0193115234375, + "learning_rate": 1e-06, + "loss": 0.001, + "num_tokens": 88255159.0, + "reward": 0.8565711379051208, + "reward_std": 0.06344871073961258, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/qatch_metrics/mean": 0.8312601566314697, + "rewards/qatch_metrics/std": 0.3075568675994873, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 470 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 445.4, + "completions/max_terminated_length": 445.4, + "completions/mean_length": 201.01484375, + "completions/mean_terminated_length": 201.01484375, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "epoch": 0.8373732921992068, + "grad_norm": 0.27204162033836665, + "kl": 0.019976806640625, + "learning_rate": 1e-06, + "loss": -0.0009, + "num_tokens": 88945690.0, + "reward": 0.8785177230834961, + "reward_std": 0.06470721438527108, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/qatch_metrics/mean": 0.8570796966552734, + "rewards/qatch_metrics/std": 0.2825317859649658, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 475 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 497.4, + "completions/max_terminated_length": 497.4, + "completions/mean_length": 221.90859375, + "completions/mean_terminated_length": 221.90859375, + "completions/min_length": 81.2, + "completions/min_terminated_length": 81.2, + "epoch": 0.8461877479065668, + "grad_norm": 0.19323853705899263, + "kl": 0.0183746337890625, + "learning_rate": 1e-06, + "loss": 0.0041, + "num_tokens": 89712373.0, + "reward": 0.8555493712425232, + "reward_std": 0.06230065375566483, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/qatch_metrics/mean": 0.8300580739974975, + "rewards/qatch_metrics/std": 0.28706649839878084, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 480 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 498.8, + "completions/max_terminated_length": 498.8, + "completions/mean_length": 228.7890625, + "completions/mean_terminated_length": 228.7890625, + "completions/min_length": 88.4, + "completions/min_terminated_length": 88.4, + "epoch": 0.8550022036139269, + "grad_norm": 0.24770714763886528, + "kl": 0.0176513671875, + "learning_rate": 1e-06, + "loss": 0.004, + "num_tokens": 90520071.0, + "reward": 0.8527018785476684, + "reward_std": 0.062195781618356705, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/qatch_metrics/mean": 0.8267080783843994, + "rewards/qatch_metrics/std": 0.2996180385351181, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 485 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 480.6, + "completions/max_terminated_length": 480.6, + "completions/mean_length": 227.66875, + "completions/mean_terminated_length": 227.66875, + "completions/min_length": 83.2, + "completions/min_terminated_length": 83.2, + "epoch": 0.8638166593212869, + "grad_norm": 0.16162980170931898, + "kl": 0.0188812255859375, + "learning_rate": 1e-06, + "loss": 0.0006, + "num_tokens": 91278479.0, + "reward": 0.8309607028961181, + "reward_std": 0.0656251635402441, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/qatch_metrics/mean": 0.8011302351951599, + "rewards/qatch_metrics/std": 0.31802850365638735, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 490 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 502.4, + "completions/max_terminated_length": 502.4, + "completions/mean_length": 225.3171875, + "completions/mean_terminated_length": 225.3171875, + "completions/min_length": 80.4, + "completions/min_terminated_length": 80.4, + "epoch": 0.872631115028647, + "grad_norm": 0.1886973597841831, + "kl": 0.01859130859375, + "learning_rate": 1e-06, + "loss": 0.0052, + "num_tokens": 92033173.0, + "reward": 0.8441248655319213, + "reward_std": 0.043570340052247046, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/qatch_metrics/mean": 0.8166174769401551, + "rewards/qatch_metrics/std": 0.30278873145580293, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 495 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 443.6, + "completions/max_terminated_length": 443.6, + "completions/mean_length": 234.475, + "completions/mean_terminated_length": 234.475, + "completions/min_length": 99.8, + "completions/min_terminated_length": 99.8, + "epoch": 0.881445570736007, + "grad_norm": 0.24444756963754977, + "kl": 0.01798095703125, + "learning_rate": 1e-06, + "loss": 0.007, + "num_tokens": 92808293.0, + "reward": 0.8517020106315613, + "reward_std": 0.06295906975865365, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/qatch_metrics/mean": 0.825531804561615, + "rewards/qatch_metrics/std": 0.3100520223379135, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 500 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 440.0, + "completions/max_terminated_length": 440.0, + "completions/mean_length": 215.0328125, + "completions/mean_terminated_length": 215.0328125, + "completions/min_length": 84.2, + "completions/min_terminated_length": 84.2, + "epoch": 0.8902600264433671, + "grad_norm": 0.21103775626066984, + "kl": 0.0171600341796875, + "learning_rate": 1e-06, + "loss": 0.0051, + "num_tokens": 93563327.0, + "reward": 0.8682243466377259, + "reward_std": 0.04365142099559307, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/qatch_metrics/mean": 0.8449697852134704, + "rewards/qatch_metrics/std": 0.2696381151676178, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 505 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 463.4, + "completions/max_terminated_length": 463.4, + "completions/mean_length": 218.59453125, + "completions/mean_terminated_length": 218.59453125, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "epoch": 0.8990744821507272, + "grad_norm": 0.20107359914643413, + "kl": 0.016455078125, + "learning_rate": 1e-06, + "loss": 0.0086, + "num_tokens": 94333288.0, + "reward": 0.8064153909683227, + "reward_std": 0.06192653328180313, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/qatch_metrics/mean": 0.772253406047821, + "rewards/qatch_metrics/std": 0.3227865040302277, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 510 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 443.0, + "completions/max_terminated_length": 443.0, + "completions/mean_length": 206.5703125, + "completions/mean_terminated_length": 206.5703125, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "epoch": 0.9078889378580872, + "grad_norm": 0.10741725097461949, + "kl": 0.0163330078125, + "learning_rate": 1e-06, + "loss": 0.0056, + "num_tokens": 95051890.0, + "reward": 0.8839513182640075, + "reward_std": 0.04564618114382028, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/qatch_metrics/mean": 0.8634721517562867, + "rewards/qatch_metrics/std": 0.24794530421495437, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 515 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 420.2, + "completions/max_terminated_length": 420.2, + "completions/mean_length": 193.584375, + "completions/mean_terminated_length": 193.584375, + "completions/min_length": 74.8, + "completions/min_terminated_length": 74.8, + "epoch": 0.9167033935654474, + "grad_norm": 0.3417922303720187, + "kl": 0.0196563720703125, + "learning_rate": 1e-06, + "loss": 0.0033, + "num_tokens": 95755150.0, + "reward": 0.8428452134132385, + "reward_std": 0.05727057494223118, + "rewards/format_reward/mean": 0.99921875, + "rewards/format_reward/std": 0.0125, + "rewards/qatch_metrics/mean": 0.8152039051055908, + "rewards/qatch_metrics/std": 0.31376497745513915, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 520 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 404.8, + "completions/max_terminated_length": 404.8, + "completions/mean_length": 208.72890625, + "completions/mean_terminated_length": 208.72890625, + "completions/min_length": 72.2, + "completions/min_terminated_length": 72.2, + "epoch": 0.9255178492728074, + "grad_norm": 0.17161657062686406, + "kl": 0.0185943603515625, + "learning_rate": 1e-06, + "loss": -0.0023, + "num_tokens": 96514835.0, + "reward": 0.8597602009773254, + "reward_std": 0.044371549785137174, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/qatch_metrics/mean": 0.8350119948387146, + "rewards/qatch_metrics/std": 0.295586758852005, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 525 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 426.8, + "completions/max_terminated_length": 426.8, + "completions/mean_length": 212.95859375, + "completions/mean_terminated_length": 212.95859375, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "epoch": 0.9343323049801675, + "grad_norm": 0.22162383692372334, + "kl": 0.0186981201171875, + "learning_rate": 1e-06, + "loss": -0.002, + "num_tokens": 97270782.0, + "reward": 0.8363440155982971, + "reward_std": 0.06691965609788894, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/qatch_metrics/mean": 0.8074635624885559, + "rewards/qatch_metrics/std": 0.3064163327217102, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 530 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 454.6, + "completions/max_terminated_length": 454.6, + "completions/mean_length": 233.40234375, + "completions/mean_terminated_length": 233.40234375, + "completions/min_length": 76.4, + "completions/min_terminated_length": 76.4, + "epoch": 0.9431467606875276, + "grad_norm": 0.1434511776519399, + "kl": 0.019879150390625, + "learning_rate": 1e-06, + "loss": 0.0023, + "num_tokens": 98016705.0, + "reward": 0.8363542199134827, + "reward_std": 0.05200971700251102, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/qatch_metrics/mean": 0.8074755430221557, + "rewards/qatch_metrics/std": 0.2885085940361023, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 535 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 444.6, + "completions/max_terminated_length": 444.6, + "completions/mean_length": 235.70625, + "completions/mean_terminated_length": 235.70625, + "completions/min_length": 80.6, + "completions/min_terminated_length": 80.6, + "epoch": 0.9519612163948876, + "grad_norm": 0.09221258199209693, + "kl": 0.018701171875, + "learning_rate": 1e-06, + "loss": 0.0038, + "num_tokens": 98787193.0, + "reward": 0.8677037119865417, + "reward_std": 0.057669999450445174, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/qatch_metrics/mean": 0.8443572998046875, + "rewards/qatch_metrics/std": 0.288933590054512, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 540 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 455.2, + "completions/max_terminated_length": 455.2, + "completions/mean_length": 222.11875, + "completions/mean_terminated_length": 222.11875, + "completions/min_length": 74.6, + "completions/min_terminated_length": 74.6, + "epoch": 0.9607756721022477, + "grad_norm": 0.1352237905149159, + "kl": 0.018145751953125, + "learning_rate": 1e-06, + "loss": -0.0031, + "num_tokens": 99532081.0, + "reward": 0.8805891752243042, + "reward_std": 0.05483146589249373, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/qatch_metrics/mean": 0.8595166802406311, + "rewards/qatch_metrics/std": 0.25585181415081026, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 545 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 472.6, + "completions/max_terminated_length": 472.6, + "completions/mean_length": 218.659375, + "completions/mean_terminated_length": 218.659375, + "completions/min_length": 86.2, + "completions/min_terminated_length": 86.2, + "epoch": 0.9695901278096077, + "grad_norm": 0.16904630982662794, + "kl": 0.01783447265625, + "learning_rate": 1e-06, + "loss": 0.0016, + "num_tokens": 100246573.0, + "reward": 0.8569401383399964, + "reward_std": 0.07272802218794823, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/qatch_metrics/mean": 0.8316942691802979, + "rewards/qatch_metrics/std": 0.3041912466287613, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 550 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 459.8, + "completions/max_terminated_length": 459.8, + "completions/mean_length": 221.78984375, + "completions/mean_terminated_length": 221.78984375, + "completions/min_length": 77.8, + "completions/min_terminated_length": 77.8, + "epoch": 0.9784045835169678, + "grad_norm": 0.31854687165087076, + "kl": 0.0183258056640625, + "learning_rate": 1e-06, + "loss": -0.0058, + "num_tokens": 100996640.0, + "reward": 0.8102917551994324, + "reward_std": 0.07570969834923744, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/qatch_metrics/mean": 0.7768138289451599, + "rewards/qatch_metrics/std": 0.34436498284339906, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 555 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 490.6, + "completions/max_terminated_length": 490.6, + "completions/mean_length": 230.28828125, + "completions/mean_terminated_length": 230.28828125, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "epoch": 0.9872190392243279, + "grad_norm": 0.16545798735816303, + "kl": 0.01719970703125, + "learning_rate": 1e-06, + "loss": 0.0054, + "num_tokens": 101777473.0, + "reward": 0.854366683959961, + "reward_std": 0.050544672086834906, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/qatch_metrics/mean": 0.8286666750907898, + "rewards/qatch_metrics/std": 0.3027670204639435, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 560 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 460.0, + "completions/max_terminated_length": 460.0, + "completions/mean_length": 232.278125, + "completions/mean_terminated_length": 232.278125, + "completions/min_length": 79.2, + "completions/min_terminated_length": 79.2, + "epoch": 0.996033494931688, + "grad_norm": 0.2064718967348405, + "kl": 0.020306396484375, + "learning_rate": 1e-06, + "loss": -0.0052, + "num_tokens": 102547669.0, + "reward": 0.7918175339698792, + "reward_std": 0.05684706475585699, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/qatch_metrics/mean": 0.755079448223114, + "rewards/qatch_metrics/std": 0.3250477254390717, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 565 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 468.5, + "completions/max_terminated_length": 468.5, + "completions/mean_length": 214.265625, + "completions/mean_terminated_length": 214.265625, + "completions/min_length": 66.0, + "completions/min_terminated_length": 66.0, + "epoch": 0.999559277214632, + "kl": 0.01806640625, + "num_tokens": 102823629.0, + "reward": 0.8797399699687958, + "reward_std": 0.056224397383630276, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/qatch_metrics/mean": 0.858517587184906, + "rewards/qatch_metrics/std": 0.26497258245944977, + "rewards/tag_count_reward/mean": 1.0, + "rewards/tag_count_reward/std": 0.0, + "step": 567, + "total_flos": 0.0, + "train_loss": -1.6490349831877564e-05, + "train_runtime": 5804.9117, + "train_samples_per_second": 1.564, + "train_steps_per_second": 0.098 + } + ], + "logging_steps": 5, + "max_steps": 567, + "num_input_tokens_seen": 102823629, + "num_train_epochs": 1, + "save_steps": 5, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +}