{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.999559277214632, "eval_steps": 500, "global_step": 567, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 477.0, "completions/max_terminated_length": 477.0, "completions/mean_length": 175.50390625, "completions/mean_terminated_length": 175.50390625, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.0017628911414720142, "grad_norm": 1.0880173896572545, "kl": 0.0, "learning_rate": 0.0, "loss": -0.327, "num_tokens": 129409.0, "reward": 0.814777672290802, "reward_std": 0.14736539125442505, "rewards/format_reward/mean": 0.68359375, "rewards/format_reward/std": 0.4659844934940338, "rewards/qatch_metrics/mean": 0.8332747220993042, "rewards/qatch_metrics/std": 0.3284282088279724, "rewards/tag_count_reward/mean": 0.7626953125, "rewards/tag_count_reward/std": 0.34948837757110596, "step": 1 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 421.0, "completions/max_terminated_length": 421.0, "completions/mean_length": 177.318359375, "completions/mean_terminated_length": 177.318359375, "completions/min_length": 21.5, "completions/min_terminated_length": 21.5, "epoch": 0.00881445570736007, "grad_norm": 0.9499188530188546, "kl": 0.00019824504852294922, "learning_rate": 7.017543859649122e-08, "loss": -0.2902, "num_tokens": 685703.0, "reward": 0.762174516916275, "reward_std": 0.15002675727009773, "rewards/format_reward/mean": 0.7265625, "rewards/format_reward/std": 0.4450720399618149, "rewards/qatch_metrics/mean": 0.7644235193729401, "rewards/qatch_metrics/std": 0.3610532283782959, "rewards/tag_count_reward/mean": 0.795166015625, "rewards/tag_count_reward/std": 0.33385463058948517, "step": 5 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 438.8, "completions/max_terminated_length": 438.8, "completions/mean_length": 173.41171875, "completions/mean_terminated_length": 173.41171875, "completions/min_length": 21.8, "completions/min_terminated_length": 21.8, "epoch": 0.01762891141472014, "grad_norm": 0.9346895582900878, "kl": 0.00028295516967773436, "learning_rate": 1.5789473684210525e-07, "loss": -0.2591, "num_tokens": 1398566.0, "reward": 0.7710299372673035, "reward_std": 0.1539353460073471, "rewards/format_reward/mean": 0.71796875, "rewards/format_reward/std": 0.4487275779247284, "rewards/qatch_metrics/mean": 0.7762346506118775, "rewards/qatch_metrics/std": 0.3281721532344818, "rewards/tag_count_reward/mean": 0.788671875, "rewards/tag_count_reward/std": 0.33627479076385497, "step": 10 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 438.2, "completions/max_terminated_length": 438.2, "completions/mean_length": 183.1796875, "completions/mean_terminated_length": 183.1796875, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.026443367122080213, "grad_norm": 0.7943318239924386, "kl": 0.00037631988525390627, "learning_rate": 2.456140350877193e-07, "loss": -0.2603, "num_tokens": 2071996.0, "reward": 0.7256837129592896, "reward_std": 0.12991088777780532, "rewards/format_reward/mean": 0.765625, "rewards/format_reward/std": 0.4240167737007141, "rewards/qatch_metrics/mean": 0.7151770830154419, "rewards/qatch_metrics/std": 0.37596395611763, "rewards/tag_count_reward/mean": 0.8244140625, "rewards/tag_count_reward/std": 0.31790287494659425, "step": 15 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 479.6, "completions/max_terminated_length": 479.6, "completions/mean_length": 201.30234375, "completions/mean_terminated_length": 201.30234375, "completions/min_length": 21.2, "completions/min_terminated_length": 21.2, "epoch": 0.03525782282944028, "grad_norm": 0.4721344642723057, "kl": 0.00091400146484375, "learning_rate": 3.333333333333333e-07, "loss": -0.1315, "num_tokens": 2791247.0, "reward": 0.8173989057540894, "reward_std": 0.12794919013977052, "rewards/format_reward/mean": 0.89765625, "rewards/format_reward/std": 0.29814977645874025, "rewards/qatch_metrics/mean": 0.8017192721366883, "rewards/qatch_metrics/std": 0.331482595205307, "rewards/tag_count_reward/mean": 0.9234375, "rewards/tag_count_reward/std": 0.22307254374027252, "step": 20 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 432.2, "completions/max_terminated_length": 432.2, "completions/mean_length": 221.4625, "completions/mean_terminated_length": 221.4625, "completions/min_length": 51.4, "completions/min_terminated_length": 51.4, "epoch": 0.044072278536800354, "grad_norm": 0.29592079815815686, "kl": 0.0016038894653320312, "learning_rate": 4.2105263157894733e-07, "loss": -0.0424, "num_tokens": 3536975.0, "reward": 0.7564297676086426, "reward_std": 0.08200130835175515, "rewards/format_reward/mean": 0.96953125, "rewards/format_reward/std": 0.13422587364912034, "rewards/qatch_metrics/mean": 0.7183640837669373, "rewards/qatch_metrics/std": 0.3674669623374939, "rewards/tag_count_reward/mean": 0.97734375, "rewards/tag_count_reward/std": 0.09909781143069267, "step": 25 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 445.6, "completions/max_terminated_length": 445.6, "completions/mean_length": 216.53984375, "completions/mean_terminated_length": 216.53984375, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "epoch": 0.052886734244160426, "grad_norm": 0.275794455786416, "kl": 0.0034694671630859375, "learning_rate": 5.087719298245614e-07, "loss": 0.002, "num_tokens": 4281330.0, "reward": 0.7764788866043091, "reward_std": 0.09769791960716248, "rewards/format_reward/mean": 0.9953125, "rewards/format_reward/std": 0.06028594672679901, "rewards/qatch_metrics/mean": 0.7377692699432373, "rewards/qatch_metrics/std": 0.3548368811607361, "rewards/tag_count_reward/mean": 0.996875, "rewards/tag_count_reward/std": 0.04124387204647064, "step": 30 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 445.8, "completions/max_terminated_length": 445.8, "completions/mean_length": 220.11796875, "completions/mean_terminated_length": 220.11796875, "completions/min_length": 59.8, "completions/min_terminated_length": 59.8, "epoch": 0.06170118995152049, "grad_norm": 0.2691159080285212, "kl": 0.005501174926757812, "learning_rate": 5.964912280701754e-07, "loss": -0.0083, "num_tokens": 5008025.0, "reward": 0.8268720507621765, "reward_std": 0.08243840038776398, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/qatch_metrics/mean": 0.7969059944152832, "rewards/qatch_metrics/std": 0.30500164330005647, "rewards/tag_count_reward/mean": 0.9978515625, "rewards/tag_count_reward/std": 0.03437500074505806, "step": 35 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 487.2, "completions/max_terminated_length": 487.2, "completions/mean_length": 227.76015625, "completions/mean_terminated_length": 227.76015625, "completions/min_length": 83.4, "completions/min_terminated_length": 83.4, "epoch": 0.07051564565888056, "grad_norm": 0.33908836616855625, "kl": 0.002800750732421875, "learning_rate": 6.842105263157895e-07, "loss": 0.0002, "num_tokens": 5774806.0, "reward": 0.7647829532623291, "reward_std": 0.09533883556723595, "rewards/format_reward/mean": 0.9984375, "rewards/format_reward/std": 0.025, "rewards/qatch_metrics/mean": 0.7235268354415894, "rewards/qatch_metrics/std": 0.35323665738105775, "rewards/tag_count_reward/mean": 0.998828125, "rewards/tag_count_reward/std": 0.01875, "step": 40 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 476.2, "completions/max_terminated_length": 476.2, "completions/mean_length": 221.7984375, "completions/mean_terminated_length": 221.7984375, "completions/min_length": 83.4, "completions/min_terminated_length": 83.4, "epoch": 0.07933010136624064, "grad_norm": 0.3262303740341099, "kl": 0.00310516357421875, "learning_rate": 7.719298245614034e-07, "loss": 0.0104, "num_tokens": 6557268.0, "reward": 0.7565465092658996, "reward_std": 0.09911727011203766, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/qatch_metrics/mean": 0.7135841250419617, "rewards/qatch_metrics/std": 0.37862626910209657, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 45 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 512.8, "completions/max_terminated_length": 512.8, "completions/mean_length": 228.45546875, "completions/mean_terminated_length": 228.45546875, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "epoch": 0.08814455707360071, "grad_norm": 0.23276410584015308, "kl": 0.00273895263671875, "learning_rate": 8.596491228070175e-07, "loss": -0.0018, "num_tokens": 7327499.0, "reward": 0.7988326072692871, "reward_std": 0.06667622029781342, "rewards/format_reward/mean": 0.9984375, "rewards/format_reward/std": 0.025, "rewards/qatch_metrics/mean": 0.7635622501373291, "rewards/qatch_metrics/std": 0.369570130109787, "rewards/tag_count_reward/mean": 0.99921875, "rewards/tag_count_reward/std": 0.0125, "step": 50 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 483.8, "completions/max_terminated_length": 483.8, "completions/mean_length": 220.52734375, "completions/mean_terminated_length": 220.52734375, "completions/min_length": 81.2, "completions/min_terminated_length": 81.2, "epoch": 0.09695901278096078, "grad_norm": 0.28218074028465906, "kl": 0.00196533203125, "learning_rate": 9.473684210526315e-07, "loss": -0.0021, "num_tokens": 8077390.0, "reward": 0.8159880757331848, "reward_std": 0.10231453701853752, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/qatch_metrics/mean": 0.7835153818130494, "rewards/qatch_metrics/std": 0.33782891631126405, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 55 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 481.2, "completions/max_terminated_length": 481.2, "completions/mean_length": 223.60703125, "completions/mean_terminated_length": 223.60703125, "completions/min_length": 75.6, "completions/min_terminated_length": 75.6, "epoch": 0.10577346848832085, "grad_norm": 0.23258401790732933, "kl": 0.00223388671875, "learning_rate": 1e-06, "loss": -0.0045, "num_tokens": 8800407.0, "reward": 0.74871985912323, "reward_std": 0.09312780797481537, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/qatch_metrics/mean": 0.7043763160705566, "rewards/qatch_metrics/std": 0.39227073788642886, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 60 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 487.8, "completions/max_terminated_length": 487.8, "completions/mean_length": 222.81015625, "completions/mean_terminated_length": 222.81015625, "completions/min_length": 77.4, "completions/min_terminated_length": 77.4, "epoch": 0.11458792419568092, "grad_norm": 0.22445170455470606, "kl": 0.002956390380859375, "learning_rate": 1e-06, "loss": 0.0057, "num_tokens": 9557380.0, "reward": 0.8077908515930176, "reward_std": 0.09828853458166123, "rewards/format_reward/mean": 0.9984375, "rewards/format_reward/std": 0.025, "rewards/qatch_metrics/mean": 0.774078369140625, "rewards/qatch_metrics/std": 0.33206661343574523, "rewards/tag_count_reward/mean": 0.999609375, "rewards/tag_count_reward/std": 0.00625, "step": 65 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 492.6, "completions/max_terminated_length": 492.6, "completions/mean_length": 231.83984375, "completions/mean_terminated_length": 231.83984375, "completions/min_length": 94.6, "completions/min_terminated_length": 94.6, "epoch": 0.12340237990304098, "grad_norm": 0.22832903725685313, "kl": 0.00381317138671875, "learning_rate": 1e-06, "loss": 0.0025, "num_tokens": 10339127.0, "reward": 0.7895300030708313, "reward_std": 0.10415169298648834, "rewards/format_reward/mean": 0.9984375, "rewards/format_reward/std": 0.025, "rewards/qatch_metrics/mean": 0.7526065230369567, "rewards/qatch_metrics/std": 0.3542828977108002, "rewards/tag_count_reward/mean": 0.9994140625, "rewards/tag_count_reward/std": 0.009375, "step": 70 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 521.8, "completions/max_terminated_length": 521.8, "completions/mean_length": 236.3125, "completions/mean_terminated_length": 236.3125, "completions/min_length": 80.4, "completions/min_terminated_length": 80.4, "epoch": 0.13221683561040107, "grad_norm": 0.2597151805235052, "kl": 0.00432281494140625, "learning_rate": 1e-06, "loss": 0.0083, "num_tokens": 11147287.0, "reward": 0.7333161950111389, "reward_std": 0.08832715749740601, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/qatch_metrics/mean": 0.6862887978553772, "rewards/qatch_metrics/std": 0.36336439847946167, "rewards/tag_count_reward/mean": 0.9994140625, "rewards/tag_count_reward/std": 0.0069767430424690245, "step": 75 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 445.6, "completions/max_terminated_length": 445.6, "completions/mean_length": 216.43984375, "completions/mean_terminated_length": 216.43984375, "completions/min_length": 87.8, "completions/min_terminated_length": 87.8, "epoch": 0.14103129131776113, "grad_norm": 0.2463929158667687, "kl": 0.00528717041015625, "learning_rate": 1e-06, "loss": 0.0044, "num_tokens": 11891066.0, "reward": 0.8300724029541016, "reward_std": 0.09615504890680313, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/qatch_metrics/mean": 0.8000851631164551, "rewards/qatch_metrics/std": 0.3208737909793854, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 80 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 491.6, "completions/max_terminated_length": 491.6, "completions/mean_length": 225.32890625, "completions/mean_terminated_length": 225.32890625, "completions/min_length": 86.2, "completions/min_terminated_length": 86.2, "epoch": 0.1498457470251212, "grad_norm": 0.22719354366888944, "kl": 0.005328369140625, "learning_rate": 1e-06, "loss": 0.0129, "num_tokens": 12668159.0, "reward": 0.816937243938446, "reward_std": 0.08283708170056343, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/qatch_metrics/mean": 0.7846320390701294, "rewards/qatch_metrics/std": 0.32469419240951536, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 85 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 460.4, "completions/max_terminated_length": 460.4, "completions/mean_length": 217.92890625, "completions/mean_terminated_length": 217.92890625, "completions/min_length": 76.2, "completions/min_terminated_length": 76.2, "epoch": 0.15866020273248127, "grad_norm": 0.2721517170479785, "kl": 0.00579071044921875, "learning_rate": 1e-06, "loss": 0.0117, "num_tokens": 13413588.0, "reward": 0.7426301956176757, "reward_std": 0.0905102699995041, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/qatch_metrics/mean": 0.6972119808197021, "rewards/qatch_metrics/std": 0.37120566368103025, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 90 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 428.6, "completions/max_terminated_length": 428.6, "completions/mean_length": 204.6640625, "completions/mean_terminated_length": 204.6640625, "completions/min_length": 75.6, "completions/min_terminated_length": 75.6, "epoch": 0.16747465843984133, "grad_norm": 0.2525985499058037, "kl": 0.0056243896484375, "learning_rate": 1e-06, "loss": -0.0012, "num_tokens": 14111606.0, "reward": 0.7979554295539856, "reward_std": 0.06609301418066024, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/qatch_metrics/mean": 0.7623119950294495, "rewards/qatch_metrics/std": 0.34469759464263916, "rewards/tag_count_reward/mean": 0.9998046875, "rewards/tag_count_reward/std": 0.003125, "step": 95 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 431.0, "completions/max_terminated_length": 431.0, "completions/mean_length": 212.34765625, "completions/mean_terminated_length": 212.34765625, "completions/min_length": 69.2, "completions/min_terminated_length": 69.2, "epoch": 0.17628911414720141, "grad_norm": 0.30357672091416305, "kl": 0.0057861328125, "learning_rate": 1e-06, "loss": 0.0083, "num_tokens": 14876659.0, "reward": 0.7724857568740845, "reward_std": 0.09265935122966766, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/qatch_metrics/mean": 0.7323476672172546, "rewards/qatch_metrics/std": 0.33567925691604616, "rewards/tag_count_reward/mean": 0.9998046875, "rewards/tag_count_reward/std": 0.003125, "step": 100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 463.4, "completions/max_terminated_length": 463.4, "completions/mean_length": 216.46875, "completions/mean_terminated_length": 216.46875, "completions/min_length": 80.4, "completions/min_terminated_length": 80.4, "epoch": 0.18510356985456147, "grad_norm": 0.23780324977532238, "kl": 0.0056549072265625, "learning_rate": 1e-06, "loss": -0.0087, "num_tokens": 15600331.0, "reward": 0.7508906722068787, "reward_std": 0.0951332688331604, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/qatch_metrics/mean": 0.7069302201271057, "rewards/qatch_metrics/std": 0.38108278512954713, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 442.4, "completions/max_terminated_length": 442.4, "completions/mean_length": 216.578125, "completions/mean_terminated_length": 216.578125, "completions/min_length": 80.2, "completions/min_terminated_length": 80.2, "epoch": 0.19391802556192156, "grad_norm": 0.21716869090526136, "kl": 0.0054229736328125, "learning_rate": 1e-06, "loss": -0.0045, "num_tokens": 16326015.0, "reward": 0.8402611017227173, "reward_std": 0.05716411247849464, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/qatch_metrics/mean": 0.8120718836784363, "rewards/qatch_metrics/std": 0.2929441839456558, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 428.2, "completions/max_terminated_length": 428.2, "completions/mean_length": 222.0265625, "completions/mean_terminated_length": 222.0265625, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "epoch": 0.20273248126928162, "grad_norm": 0.22835452896575356, "kl": 0.0060882568359375, "learning_rate": 1e-06, "loss": -0.0017, "num_tokens": 17091921.0, "reward": 0.8265595078468323, "reward_std": 0.07398260906338691, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/qatch_metrics/mean": 0.7959523558616638, "rewards/qatch_metrics/std": 0.3277123510837555, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 459.8, "completions/max_terminated_length": 459.8, "completions/mean_length": 220.7453125, "completions/mean_terminated_length": 220.7453125, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "epoch": 0.2115469369766417, "grad_norm": 0.22726862373109216, "kl": 0.006689453125, "learning_rate": 1e-06, "loss": 0.0043, "num_tokens": 17877371.0, "reward": 0.8397867679595947, "reward_std": 0.09087342023849487, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/qatch_metrics/mean": 0.8115137934684753, "rewards/qatch_metrics/std": 0.3017837733030319, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 491.4, "completions/max_terminated_length": 491.4, "completions/mean_length": 225.2140625, "completions/mean_terminated_length": 225.2140625, "completions/min_length": 75.4, "completions/min_terminated_length": 75.4, "epoch": 0.22036139268400176, "grad_norm": 0.2004953082769917, "kl": 0.00776519775390625, "learning_rate": 1e-06, "loss": -0.0056, "num_tokens": 18623005.0, "reward": 0.8202541828155517, "reward_std": 0.07537120208144188, "rewards/format_reward/mean": 0.99921875, "rewards/format_reward/std": 0.0125, "rewards/qatch_metrics/mean": 0.7886492252349854, "rewards/qatch_metrics/std": 0.32776339948177335, "rewards/tag_count_reward/mean": 0.999609375, "rewards/tag_count_reward/std": 0.00625, "step": 125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 456.8, "completions/max_terminated_length": 456.8, "completions/mean_length": 223.48203125, "completions/mean_terminated_length": 223.48203125, "completions/min_length": 78.2, "completions/min_terminated_length": 78.2, "epoch": 0.22917584839136185, "grad_norm": 0.2341532579835068, "kl": 0.00804290771484375, "learning_rate": 1e-06, "loss": 0.0096, "num_tokens": 19349606.0, "reward": 0.8026262044906616, "reward_std": 0.06839245334267616, "rewards/format_reward/mean": 0.99921875, "rewards/format_reward/std": 0.0125, "rewards/qatch_metrics/mean": 0.7679218888282776, "rewards/qatch_metrics/std": 0.3324147403240204, "rewards/tag_count_reward/mean": 0.9994140625, "rewards/tag_count_reward/std": 0.009375, "step": 130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 458.4, "completions/max_terminated_length": 458.4, "completions/mean_length": 216.72578125, "completions/mean_terminated_length": 216.72578125, "completions/min_length": 86.2, "completions/min_terminated_length": 86.2, "epoch": 0.2379903040987219, "grad_norm": 0.23655650548465582, "kl": 0.0078033447265625, "learning_rate": 1e-06, "loss": 0.0007, "num_tokens": 20092311.0, "reward": 0.8197526335716248, "reward_std": 0.0839143767952919, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/qatch_metrics/mean": 0.7879442930221557, "rewards/qatch_metrics/std": 0.3431123554706573, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 454.6, "completions/max_terminated_length": 454.6, "completions/mean_length": 204.48984375, "completions/mean_terminated_length": 204.48984375, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "epoch": 0.24680475980608196, "grad_norm": 0.2641797202959811, "kl": 0.00862884521484375, "learning_rate": 1e-06, "loss": 0.0051, "num_tokens": 20821962.0, "reward": 0.8242111682891846, "reward_std": 0.07407020255923272, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/qatch_metrics/mean": 0.7931895971298217, "rewards/qatch_metrics/std": 0.3176054835319519, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 443.2, "completions/max_terminated_length": 443.2, "completions/mean_length": 203.590625, "completions/mean_terminated_length": 203.590625, "completions/min_length": 86.6, "completions/min_terminated_length": 86.6, "epoch": 0.255619215513442, "grad_norm": 0.263066002535131, "kl": 0.009637451171875, "learning_rate": 1e-06, "loss": 0.0065, "num_tokens": 21526046.0, "reward": 0.7875781059265137, "reward_std": 0.09901705384254456, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/qatch_metrics/mean": 0.7501148462295533, "rewards/qatch_metrics/std": 0.3672972857952118, "rewards/tag_count_reward/mean": 0.999609375, "rewards/tag_count_reward/std": 0.00625, "step": 145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 448.4, "completions/max_terminated_length": 448.4, "completions/mean_length": 208.90546875, "completions/mean_terminated_length": 208.90546875, "completions/min_length": 73.2, "completions/min_terminated_length": 73.2, "epoch": 0.26443367122080214, "grad_norm": 0.2798500312218402, "kl": 0.01026153564453125, "learning_rate": 1e-06, "loss": 0.0003, "num_tokens": 22271333.0, "reward": 0.818337082862854, "reward_std": 0.07784928977489472, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/qatch_metrics/mean": 0.7862788915634156, "rewards/qatch_metrics/std": 0.3341992735862732, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 494.2, "completions/max_terminated_length": 494.2, "completions/mean_length": 209.651953125, "completions/mean_terminated_length": 209.651953125, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "epoch": 0.5464962538563244, "grad_norm": 0.2122029879190087, "kl": 0.010993194580078126, "learning_rate": 1e-06, "loss": 0.0126, "num_tokens": 23726666.0, "reward": 0.811666476726532, "reward_std": 0.0841904804110527, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/qatch_metrics/mean": 0.7784311413764954, "rewards/qatch_metrics/std": 0.32770459055900575, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 452.6, "completions/max_terminated_length": 452.6, "completions/mean_length": 217.9859375, "completions/mean_terminated_length": 217.9859375, "completions/min_length": 75.8, "completions/min_terminated_length": 75.8, "epoch": 0.5641251652710445, "grad_norm": 0.15403477284537095, "kl": 0.00980377197265625, "learning_rate": 1e-06, "loss": 0.0008, "num_tokens": 25239750.0, "reward": 0.7868865132331848, "reward_std": 0.07244862839579583, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/qatch_metrics/mean": 0.7492782354354859, "rewards/qatch_metrics/std": 0.3493395745754242, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 511.0, "completions/max_terminated_length": 511.0, "completions/mean_length": 208.38984375, "completions/mean_terminated_length": 208.38984375, "completions/min_length": 58.4, "completions/min_terminated_length": 58.4, "epoch": 0.5817540766857646, "grad_norm": 0.18706575889421317, "kl": 0.00914154052734375, "learning_rate": 1e-06, "loss": 0.0072, "num_tokens": 26687596.0, "reward": 0.828769075870514, "reward_std": 0.07729479111731052, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/qatch_metrics/mean": 0.7985518336296081, "rewards/qatch_metrics/std": 0.29670341312885284, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 505.0, "completions/max_terminated_length": 505.0, "completions/mean_length": 206.281640625, "completions/mean_terminated_length": 206.281640625, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "epoch": 0.5993829881004848, "grad_norm": 0.19776450858978561, "kl": 0.01090240478515625, "learning_rate": 1e-06, "loss": 0.0105, "num_tokens": 28175773.0, "reward": 0.8511051416397095, "reward_std": 0.07431531846523284, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/qatch_metrics/mean": 0.8248295664787293, "rewards/qatch_metrics/std": 0.3192874014377594, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 462.2, "completions/max_terminated_length": 462.2, "completions/mean_length": 219.3890625, "completions/mean_terminated_length": 219.3890625, "completions/min_length": 70.8, "completions/min_terminated_length": 70.8, "epoch": 0.617011899515205, "grad_norm": 0.15290022120008429, "kl": 0.01065216064453125, "learning_rate": 1e-06, "loss": 0.0047, "num_tokens": 29739969.0, "reward": 0.8426113128662109, "reward_std": 0.09004694148898125, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/qatch_metrics/mean": 0.814836847782135, "rewards/qatch_metrics/std": 0.309688937664032, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 458.8, "completions/max_terminated_length": 458.8, "completions/mean_length": 211.655078125, "completions/mean_terminated_length": 211.655078125, "completions/min_length": 73.4, "completions/min_terminated_length": 73.4, "epoch": 0.6346408109299251, "grad_norm": 0.17923424569681315, "kl": 0.0114501953125, "learning_rate": 1e-06, "loss": 0.01, "num_tokens": 31191502.0, "reward": 0.8262084484100342, "reward_std": 0.08637549504637718, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/qatch_metrics/mean": 0.7955393195152283, "rewards/qatch_metrics/std": 0.3134327620267868, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 499.2, "completions/max_terminated_length": 499.2, "completions/mean_length": 215.95546875, "completions/mean_terminated_length": 215.95546875, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "epoch": 0.6522697223446452, "grad_norm": 0.1321015357675111, "kl": 0.012237548828125, "learning_rate": 1e-06, "loss": 0.0021, "num_tokens": 32694108.0, "reward": 0.7994898676872253, "reward_std": 0.08254800513386726, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/qatch_metrics/mean": 0.764105749130249, "rewards/qatch_metrics/std": 0.3532308578491211, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 442.2, "completions/max_terminated_length": 442.2, "completions/mean_length": 209.90078125, "completions/mean_terminated_length": 209.90078125, "completions/min_length": 76.6, "completions/min_terminated_length": 76.6, "epoch": 0.6698986337593653, "grad_norm": 0.22256806005967145, "kl": 0.01057586669921875, "learning_rate": 1e-06, "loss": 0.0013, "num_tokens": 34144670.0, "reward": 0.7911163926124573, "reward_std": 0.06518566869199276, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/qatch_metrics/mean": 0.7542545795440674, "rewards/qatch_metrics/std": 0.35398219227790834, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 476.2, "completions/max_terminated_length": 476.2, "completions/mean_length": 208.534765625, "completions/mean_terminated_length": 208.534765625, "completions/min_length": 77.8, "completions/min_terminated_length": 77.8, "epoch": 0.6875275451740855, "grad_norm": 0.17237028945675698, "kl": 0.0087860107421875, "learning_rate": 1e-06, "loss": 0.0069, "num_tokens": 35620023.0, "reward": 0.8418472170829773, "reward_std": 0.08243692219257355, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/qatch_metrics/mean": 0.8139379024505615, "rewards/qatch_metrics/std": 0.336453515291214, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 514.6, "completions/max_terminated_length": 514.6, "completions/mean_length": 217.3328125, "completions/mean_terminated_length": 217.3328125, "completions/min_length": 90.4, "completions/min_terminated_length": 90.4, "epoch": 0.7051564565888057, "grad_norm": 0.19274445010407998, "kl": 0.009130859375, "learning_rate": 1e-06, "loss": 0.0053, "num_tokens": 37166635.0, "reward": 0.8295193314552307, "reward_std": 0.06927115023136139, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/qatch_metrics/mean": 0.7994345307350159, "rewards/qatch_metrics/std": 0.3011426508426666, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 499.8, "completions/max_terminated_length": 499.8, "completions/mean_length": 212.651171875, "completions/mean_terminated_length": 212.651171875, "completions/min_length": 68.6, "completions/min_terminated_length": 68.6, "epoch": 0.7227853680035258, "grad_norm": 0.13990900967805797, "kl": 0.0087432861328125, "learning_rate": 1e-06, "loss": -0.0027, "num_tokens": 38617966.0, "reward": 0.8151894211769104, "reward_std": 0.07495353966951371, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/qatch_metrics/mean": 0.7825757980346679, "rewards/qatch_metrics/std": 0.33874245882034304, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 560.4, "completions/max_terminated_length": 560.4, "completions/mean_length": 223.7015625, "completions/mean_terminated_length": 223.7015625, "completions/min_length": 74.6, "completions/min_terminated_length": 74.6, "epoch": 0.7404142794182459, "grad_norm": 0.20163985914598806, "kl": 0.00806884765625, "learning_rate": 1e-06, "loss": 0.0054, "num_tokens": 40092050.0, "reward": 0.8460610270500183, "reward_std": 0.05867695920169354, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/qatch_metrics/mean": 0.8188953161239624, "rewards/qatch_metrics/std": 0.3239317536354065, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 486.6, "completions/max_terminated_length": 486.6, "completions/mean_length": 215.6828125, "completions/mean_terminated_length": 215.6828125, "completions/min_length": 82.2, "completions/min_terminated_length": 82.2, "epoch": 0.7580431908329661, "grad_norm": 0.17564998217230318, "kl": 0.009525299072265625, "learning_rate": 1e-06, "loss": -0.0034, "num_tokens": 41565542.0, "reward": 0.799136507511139, "reward_std": 0.06419738680124283, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/qatch_metrics/mean": 0.7636899828910828, "rewards/qatch_metrics/std": 0.3342160403728485, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 503.4, "completions/max_terminated_length": 503.4, "completions/mean_length": 233.409765625, "completions/mean_terminated_length": 233.409765625, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 0.7756721022476862, "grad_norm": 0.19283324501226842, "kl": 0.009130096435546875, "learning_rate": 1e-06, "loss": 0.0109, "num_tokens": 43081919.0, "reward": 0.7851791024208069, "reward_std": 0.07570808604359627, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/qatch_metrics/mean": 0.7472695469856262, "rewards/qatch_metrics/std": 0.36822828054428103, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 448.4, "completions/max_terminated_length": 448.4, "completions/mean_length": 224.728125, "completions/mean_terminated_length": 224.728125, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "epoch": 0.7933010136624064, "grad_norm": 0.17754847688569442, "kl": 0.009470367431640625, "learning_rate": 1e-06, "loss": -0.002, "num_tokens": 44606439.0, "reward": 0.8152384400367737, "reward_std": 0.09764492362737656, "rewards/format_reward/mean": 0.999609375, "rewards/format_reward/std": 0.00883883461356163, "rewards/qatch_metrics/mean": 0.7826851725578308, "rewards/qatch_metrics/std": 0.3263732075691223, "rewards/tag_count_reward/mean": 0.99990234375, "rewards/tag_count_reward/std": 0.0022097086533904076, "step": 225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 503.2, "completions/max_terminated_length": 503.2, "completions/mean_length": 218.58203125, "completions/mean_terminated_length": 218.58203125, "completions/min_length": 69.6, "completions/min_terminated_length": 69.6, "epoch": 0.8109299250771265, "grad_norm": 0.19017267970498908, "kl": 0.009508514404296875, "learning_rate": 1e-06, "loss": -0.0009, "num_tokens": 46095257.0, "reward": 0.8068280577659607, "reward_std": 0.0781441181898117, "rewards/format_reward/mean": 0.999609375, "rewards/format_reward/std": 0.00883883461356163, "rewards/qatch_metrics/mean": 0.7728020906448364, "rewards/qatch_metrics/std": 0.3386655867099762, "rewards/tag_count_reward/mean": 0.99970703125, "rewards/tag_count_reward/std": 0.006629125773906707, "step": 230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 456.6, "completions/max_terminated_length": 456.6, "completions/mean_length": 204.84375, "completions/mean_terminated_length": 204.84375, "completions/min_length": 72.6, "completions/min_terminated_length": 72.6, "epoch": 0.8285588364918466, "grad_norm": 0.1678878918468119, "kl": 0.009729766845703125, "learning_rate": 1e-06, "loss": 0.0041, "num_tokens": 47519433.0, "reward": 0.8672606706619262, "reward_std": 0.0644603468477726, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/qatch_metrics/mean": 0.8438360691070557, "rewards/qatch_metrics/std": 0.2717843741178513, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 526.2, "completions/max_terminated_length": 526.2, "completions/mean_length": 214.90625, "completions/mean_terminated_length": 214.90625, "completions/min_length": 66.8, "completions/min_terminated_length": 66.8, "epoch": 0.8461877479065668, "grad_norm": 0.18169011669761398, "kl": 0.01288604736328125, "learning_rate": 1e-06, "loss": 0.0035, "num_tokens": 48943993.0, "reward": 0.8558493018150329, "reward_std": 0.07027828097343444, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/qatch_metrics/mean": 0.8304109454154969, "rewards/qatch_metrics/std": 0.301141357421875, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 472.6, "completions/max_terminated_length": 472.6, "completions/mean_length": 212.559765625, "completions/mean_terminated_length": 212.559765625, "completions/min_length": 76.8, "completions/min_terminated_length": 76.8, "epoch": 0.8638166593212869, "grad_norm": 0.2046340854229955, "kl": 0.01494140625, "learning_rate": 1e-06, "loss": 0.006, "num_tokens": 50416114.0, "reward": 0.831060528755188, "reward_std": 0.07754805404692888, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/qatch_metrics/mean": 0.8012476563453674, "rewards/qatch_metrics/std": 0.3293557226657867, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 503.2, "completions/max_terminated_length": 503.2, "completions/mean_length": 222.1375, "completions/mean_terminated_length": 222.1375, "completions/min_length": 83.2, "completions/min_terminated_length": 83.2, "epoch": 0.881445570736007, "grad_norm": 0.15161264539796646, "kl": 0.0138031005859375, "learning_rate": 1e-06, "loss": 0.0018, "num_tokens": 51932274.0, "reward": 0.8422249555587769, "reward_std": 0.06234893724322319, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/qatch_metrics/mean": 0.8143823027610779, "rewards/qatch_metrics/std": 0.2993943512439728, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 463.6, "completions/max_terminated_length": 463.6, "completions/mean_length": 231.19609375, "completions/mean_terminated_length": 231.19609375, "completions/min_length": 77.4, "completions/min_terminated_length": 77.4, "epoch": 0.8990744821507272, "grad_norm": 0.20035266636054513, "kl": 0.011871337890625, "learning_rate": 1e-06, "loss": -0.0003, "num_tokens": 53450248.0, "reward": 0.8096501588821411, "reward_std": 0.06698438860476016, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/qatch_metrics/mean": 0.7760589838027954, "rewards/qatch_metrics/std": 0.3199191153049469, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 471.2, "completions/max_terminated_length": 471.2, "completions/mean_length": 237.80234375, "completions/mean_terminated_length": 237.80234375, "completions/min_length": 82.2, "completions/min_terminated_length": 82.2, "epoch": 0.9167033935654474, "grad_norm": 0.0856229450795828, "kl": 0.011614227294921875, "learning_rate": 1e-06, "loss": -0.0017, "num_tokens": 54970542.0, "reward": 0.8725608706474304, "reward_std": 0.051827043667435645, "rewards/format_reward/mean": 0.999609375, "rewards/format_reward/std": 0.00883883461356163, "rewards/qatch_metrics/mean": 0.8501232981681823, "rewards/qatch_metrics/std": 0.26386110931634904, "rewards/tag_count_reward/mean": 0.99990234375, "rewards/tag_count_reward/std": 0.0022097086533904076, "step": 260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 476.6, "completions/max_terminated_length": 476.6, "completions/mean_length": 231.53671875, "completions/mean_terminated_length": 231.53671875, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "epoch": 0.9343323049801675, "grad_norm": 0.17178453068271043, "kl": 0.010117340087890624, "learning_rate": 1e-06, "loss": 0.0063, "num_tokens": 56485356.0, "reward": 0.8532873392105103, "reward_std": 0.07009301483631133, "rewards/format_reward/mean": 0.999609375, "rewards/format_reward/std": 0.00883883461356163, "rewards/qatch_metrics/mean": 0.8274485826492309, "rewards/qatch_metrics/std": 0.31240676045417787, "rewards/tag_count_reward/mean": 0.99990234375, "rewards/tag_count_reward/std": 0.0022097086533904076, "step": 265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 459.6, "completions/max_terminated_length": 459.6, "completions/mean_length": 220.95234375, "completions/mean_terminated_length": 220.95234375, "completions/min_length": 68.6, "completions/min_terminated_length": 68.6, "epoch": 0.9519612163948876, "grad_norm": 0.15364550208264494, "kl": 0.00984039306640625, "learning_rate": 1e-06, "loss": -0.0031, "num_tokens": 57953010.0, "reward": 0.868242597579956, "reward_std": 0.06916632130742073, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/qatch_metrics/mean": 0.8449912905693054, "rewards/qatch_metrics/std": 0.2899660974740982, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 423.2, "completions/max_terminated_length": 423.2, "completions/mean_length": 225.621875, "completions/mean_terminated_length": 225.621875, "completions/min_length": 88.8, "completions/min_terminated_length": 88.8, "epoch": 0.48479506390480387, "grad_norm": 0.17697767584196022, "kl": 0.00970916748046875, "learning_rate": 1e-06, "loss": 0.0059, "num_tokens": 58736110.0, "reward": 0.8460039258003235, "reward_std": 0.055821475386619565, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/qatch_metrics/mean": 0.8188281297683716, "rewards/qatch_metrics/std": 0.30660555958747865, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 504.0, "completions/max_terminated_length": 504.0, "completions/mean_length": 223.92734375, "completions/mean_terminated_length": 223.92734375, "completions/min_length": 77.8, "completions/min_terminated_length": 77.8, "epoch": 0.4936095196121639, "grad_norm": 0.2692630701899735, "kl": 0.0131378173828125, "learning_rate": 1e-06, "loss": 0.0021, "num_tokens": 59498897.0, "reward": 0.7988754034042358, "reward_std": 0.08376505076885224, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/qatch_metrics/mean": 0.7633828163146973, "rewards/qatch_metrics/std": 0.3335907101631165, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 434.8, "completions/max_terminated_length": 434.8, "completions/mean_length": 223.48125, "completions/mean_terminated_length": 223.48125, "completions/min_length": 83.2, "completions/min_terminated_length": 83.2, "epoch": 0.502423975319524, "grad_norm": 0.2666009697829767, "kl": 0.0107269287109375, "learning_rate": 1e-06, "loss": 0.0007, "num_tokens": 60277897.0, "reward": 0.7720089554786682, "reward_std": 0.0594131164252758, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/qatch_metrics/mean": 0.7317867279052734, "rewards/qatch_metrics/std": 0.33845625519752504, "rewards/tag_count_reward/mean": 0.9998046875, "rewards/tag_count_reward/std": 0.003125, "step": 285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 481.0, "completions/max_terminated_length": 481.0, "completions/mean_length": 219.62421875, "completions/mean_terminated_length": 219.62421875, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 0.511238431026884, "grad_norm": 0.16876063412105669, "kl": 0.01141357421875, "learning_rate": 1e-06, "loss": 0.0033, "num_tokens": 61033560.0, "reward": 0.7902166962623596, "reward_std": 0.0687429528683424, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/qatch_metrics/mean": 0.7531961083412171, "rewards/qatch_metrics/std": 0.37054654359817507, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 471.2, "completions/max_terminated_length": 471.2, "completions/mean_length": 226.275, "completions/mean_terminated_length": 226.275, "completions/min_length": 80.6, "completions/min_terminated_length": 80.6, "epoch": 0.5200528867342442, "grad_norm": 0.26818466602074054, "kl": 0.0130706787109375, "learning_rate": 1e-06, "loss": 0.0004, "num_tokens": 61786008.0, "reward": 0.7699209451675415, "reward_std": 0.07550354823470115, "rewards/format_reward/mean": 0.99921875, "rewards/format_reward/std": 0.0125, "rewards/qatch_metrics/mean": 0.7294221520423889, "rewards/qatch_metrics/std": 0.3492735385894775, "rewards/tag_count_reward/mean": 0.9998046875, "rewards/tag_count_reward/std": 0.003125, "step": 295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 484.6, "completions/max_terminated_length": 484.6, "completions/mean_length": 247.2328125, "completions/mean_terminated_length": 247.2328125, "completions/min_length": 95.8, "completions/min_terminated_length": 95.8, "epoch": 0.5288673424416043, "grad_norm": 0.16485515882678206, "kl": 0.0113006591796875, "learning_rate": 1e-06, "loss": 0.0007, "num_tokens": 62590434.0, "reward": 0.8454334974288941, "reward_std": 0.0570029616355896, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/qatch_metrics/mean": 0.8181570172309875, "rewards/qatch_metrics/std": 0.2992805689573288, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 480.6, "completions/max_terminated_length": 480.6, "completions/mean_length": 235.16015625, "completions/mean_terminated_length": 235.16015625, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "epoch": 0.5376817981489643, "grad_norm": 0.27561378534620606, "kl": 0.01141510009765625, "learning_rate": 1e-06, "loss": 0.0097, "num_tokens": 63366287.0, "reward": 0.8380108118057251, "reward_std": 0.07530387155711651, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/qatch_metrics/mean": 0.8094244718551635, "rewards/qatch_metrics/std": 0.30977231860160825, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 458.6, "completions/max_terminated_length": 458.6, "completions/mean_length": 215.646875, "completions/mean_terminated_length": 215.646875, "completions/min_length": 79.4, "completions/min_terminated_length": 79.4, "epoch": 0.5464962538563244, "grad_norm": 0.2018916915779266, "kl": 0.013714599609375, "learning_rate": 1e-06, "loss": -0.0045, "num_tokens": 64097387.0, "reward": 0.8135073184967041, "reward_std": 0.05950811579823494, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/qatch_metrics/mean": 0.7806198120117187, "rewards/qatch_metrics/std": 0.33523867428302767, "rewards/tag_count_reward/mean": 0.999609375, "rewards/tag_count_reward/std": 0.00625, "step": 310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 447.4, "completions/max_terminated_length": 447.4, "completions/mean_length": 223.68515625, "completions/mean_terminated_length": 223.68515625, "completions/min_length": 92.4, "completions/min_terminated_length": 92.4, "epoch": 0.5553107095636844, "grad_norm": 0.1836962735356692, "kl": 0.0138214111328125, "learning_rate": 1e-06, "loss": -0.0024, "num_tokens": 64869416.0, "reward": 0.8333834052085877, "reward_std": 0.07006162852048874, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/qatch_metrics/mean": 0.8039804816246032, "rewards/qatch_metrics/std": 0.3219245493412018, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 504.8, "completions/max_terminated_length": 504.8, "completions/mean_length": 221.45390625, "completions/mean_terminated_length": 221.45390625, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "epoch": 0.5641251652710445, "grad_norm": 0.23250178423343035, "kl": 0.01497802734375, "learning_rate": 1e-06, "loss": -0.0014, "num_tokens": 65613165.0, "reward": 0.8320096850395202, "reward_std": 0.053499556705355646, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/qatch_metrics/mean": 0.8023643255233764, "rewards/qatch_metrics/std": 0.3343039393424988, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 466.6, "completions/max_terminated_length": 466.6, "completions/mean_length": 220.3984375, "completions/mean_terminated_length": 220.3984375, "completions/min_length": 72.6, "completions/min_terminated_length": 72.6, "epoch": 0.5729396209784046, "grad_norm": 0.09740281424559781, "kl": 0.0155609130859375, "learning_rate": 1e-06, "loss": -0.0012, "num_tokens": 66336475.0, "reward": 0.8796087980270386, "reward_std": 0.05236431676894426, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/qatch_metrics/mean": 0.8583632946014405, "rewards/qatch_metrics/std": 0.2817832052707672, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 459.4, "completions/max_terminated_length": 459.4, "completions/mean_length": 225.27890625, "completions/mean_terminated_length": 225.27890625, "completions/min_length": 78.6, "completions/min_terminated_length": 78.6, "epoch": 0.5817540766857646, "grad_norm": 0.08354955287926201, "kl": 0.01513671875, "learning_rate": 1e-06, "loss": 0.0095, "num_tokens": 67098736.0, "reward": 0.8658102512359619, "reward_std": 0.07466748803853988, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/qatch_metrics/mean": 0.8421296954154969, "rewards/qatch_metrics/std": 0.2614422976970673, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 447.2, "completions/max_terminated_length": 447.2, "completions/mean_length": 220.01875, "completions/mean_terminated_length": 220.01875, "completions/min_length": 81.8, "completions/min_terminated_length": 81.8, "epoch": 0.5905685323931247, "grad_norm": 0.20574209747901576, "kl": 0.015081787109375, "learning_rate": 1e-06, "loss": 0.011, "num_tokens": 67847928.0, "reward": 0.865822184085846, "reward_std": 0.046268445625901225, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/qatch_metrics/mean": 0.8421437621116639, "rewards/qatch_metrics/std": 0.29589260220527647, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 432.0, "completions/max_terminated_length": 432.0, "completions/mean_length": 213.5953125, "completions/mean_terminated_length": 213.5953125, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "epoch": 0.5993829881004848, "grad_norm": 0.2039975034177896, "kl": 0.0161651611328125, "learning_rate": 1e-06, "loss": 0.0066, "num_tokens": 68585234.0, "reward": 0.8343551635742188, "reward_std": 0.0688902921974659, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/qatch_metrics/mean": 0.8051237106323242, "rewards/qatch_metrics/std": 0.30847290754318235, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 435.0, "completions/max_terminated_length": 435.0, "completions/mean_length": 203.69453125, "completions/mean_terminated_length": 203.69453125, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "epoch": 0.6081974438078449, "grad_norm": 0.26848084439203446, "kl": 0.014788818359375, "learning_rate": 1e-06, "loss": 0.0023, "num_tokens": 69338379.0, "reward": 0.8848124146461487, "reward_std": 0.06373886093497276, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/qatch_metrics/mean": 0.8644851684570313, "rewards/qatch_metrics/std": 0.26705425381660464, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 446.8, "completions/max_terminated_length": 446.8, "completions/mean_length": 221.1046875, "completions/mean_terminated_length": 221.1046875, "completions/min_length": 79.4, "completions/min_terminated_length": 79.4, "epoch": 0.617011899515205, "grad_norm": 0.2363792510293019, "kl": 0.019024658203125, "learning_rate": 1e-06, "loss": -0.002, "num_tokens": 70095473.0, "reward": 0.8130708336830139, "reward_std": 0.08477363213896752, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/qatch_metrics/mean": 0.7800833344459533, "rewards/qatch_metrics/std": 0.3211198329925537, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 458.4, "completions/max_terminated_length": 458.4, "completions/mean_length": 234.00078125, "completions/mean_terminated_length": 234.00078125, "completions/min_length": 91.4, "completions/min_terminated_length": 91.4, "epoch": 0.625826355222565, "grad_norm": 0.1856420640121193, "kl": 0.019122314453125, "learning_rate": 1e-06, "loss": -0.0036, "num_tokens": 70860290.0, "reward": 0.8471660256385803, "reward_std": 0.0506692998111248, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/qatch_metrics/mean": 0.8201953172683716, "rewards/qatch_metrics/std": 0.30663308799266814, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 452.8, "completions/max_terminated_length": 452.8, "completions/mean_length": 241.72421875, "completions/mean_terminated_length": 241.72421875, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "epoch": 0.6346408109299251, "grad_norm": 0.22939974521057024, "kl": 0.01826171875, "learning_rate": 1e-06, "loss": 0.0127, "num_tokens": 71648401.0, "reward": 0.8702264785766601, "reward_std": 0.0592925101518631, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/qatch_metrics/mean": 0.8473252534866333, "rewards/qatch_metrics/std": 0.28537269234657286, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 434.6, "completions/max_terminated_length": 434.6, "completions/mean_length": 215.853125, "completions/mean_terminated_length": 215.853125, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "epoch": 0.6434552666372851, "grad_norm": 0.19883621919511643, "kl": 0.0163330078125, "learning_rate": 1e-06, "loss": 0.0073, "num_tokens": 72382693.0, "reward": 0.8091506719589233, "reward_std": 0.0635421834886074, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/qatch_metrics/mean": 0.7754713773727417, "rewards/qatch_metrics/std": 0.3179103255271912, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 496.8, "completions/max_terminated_length": 496.8, "completions/mean_length": 213.1640625, "completions/mean_terminated_length": 213.1640625, "completions/min_length": 76.2, "completions/min_terminated_length": 76.2, "epoch": 0.6522697223446452, "grad_norm": 0.1916457590662772, "kl": 0.0175506591796875, "learning_rate": 1e-06, "loss": -0.0029, "num_tokens": 73161111.0, "reward": 0.8094798445701599, "reward_std": 0.04875086285173893, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/qatch_metrics/mean": 0.7758586168289184, "rewards/qatch_metrics/std": 0.32606661319732666, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 480.4, "completions/max_terminated_length": 480.4, "completions/mean_length": 222.75, "completions/mean_terminated_length": 222.75, "completions/min_length": 72.6, "completions/min_terminated_length": 72.6, "epoch": 0.6610841780520053, "grad_norm": 0.15787517122504152, "kl": 0.0181884765625, "learning_rate": 1e-06, "loss": 0.0018, "num_tokens": 73905591.0, "reward": 0.89048171043396, "reward_std": 0.04932568361982703, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/qatch_metrics/mean": 0.8711549639701843, "rewards/qatch_metrics/std": 0.2736783862113953, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 561.4, "completions/max_terminated_length": 561.4, "completions/mean_length": 234.4390625, "completions/mean_terminated_length": 234.4390625, "completions/min_length": 75.2, "completions/min_terminated_length": 75.2, "epoch": 0.6698986337593653, "grad_norm": 0.2653930596733297, "kl": 0.0174713134765625, "learning_rate": 1e-06, "loss": 0.003, "num_tokens": 74679801.0, "reward": 0.8243065714836121, "reward_std": 0.06958894729614258, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/qatch_metrics/mean": 0.7933018207550049, "rewards/qatch_metrics/std": 0.3086866676807404, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 511.8, "completions/max_terminated_length": 511.8, "completions/mean_length": 243.73359375, "completions/mean_terminated_length": 243.73359375, "completions/min_length": 81.6, "completions/min_terminated_length": 81.6, "epoch": 0.6787130894667255, "grad_norm": 0.20233916054675122, "kl": 0.014093017578125, "learning_rate": 1e-06, "loss": 0.0048, "num_tokens": 75445892.0, "reward": 0.8653998494148254, "reward_std": 0.07132081612944603, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/qatch_metrics/mean": 0.8416468739509583, "rewards/qatch_metrics/std": 0.3147186517715454, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 447.4, "completions/max_terminated_length": 447.4, "completions/mean_length": 228.43671875, "completions/mean_terminated_length": 228.43671875, "completions/min_length": 79.8, "completions/min_terminated_length": 79.8, "epoch": 0.6875275451740855, "grad_norm": 0.29996778931865303, "kl": 0.0146087646484375, "learning_rate": 1e-06, "loss": -0.0016, "num_tokens": 76229251.0, "reward": 0.8502862334251404, "reward_std": 0.07314281612634659, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/qatch_metrics/mean": 0.8238661646842956, "rewards/qatch_metrics/std": 0.3113024443387985, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 390 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 487.0, "completions/max_terminated_length": 487.0, "completions/mean_length": 249.8734375, "completions/mean_terminated_length": 249.8734375, "completions/min_length": 84.4, "completions/min_terminated_length": 84.4, "epoch": 0.6963420008814456, "grad_norm": 0.2150032953896288, "kl": 0.017156982421875, "learning_rate": 1e-06, "loss": -0.0029, "num_tokens": 77021793.0, "reward": 0.8494030237197876, "reward_std": 0.05776047557592392, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/qatch_metrics/mean": 0.8228270888328553, "rewards/qatch_metrics/std": 0.3020846724510193, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 471.4, "completions/max_terminated_length": 471.4, "completions/mean_length": 247.9828125, "completions/mean_terminated_length": 247.9828125, "completions/min_length": 84.4, "completions/min_terminated_length": 84.4, "epoch": 0.7051564565888057, "grad_norm": 0.2754041387856829, "kl": 0.0148590087890625, "learning_rate": 1e-06, "loss": 0.0065, "num_tokens": 77833451.0, "reward": 0.8363431453704834, "reward_std": 0.06054745838046074, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/qatch_metrics/mean": 0.807462501525879, "rewards/qatch_metrics/std": 0.29668720066547394, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 442.6, "completions/max_terminated_length": 442.6, "completions/mean_length": 225.4296875, "completions/mean_terminated_length": 225.4296875, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "epoch": 0.7139709122961657, "grad_norm": 0.22420011771594078, "kl": 0.017706298828125, "learning_rate": 1e-06, "loss": 0.0019, "num_tokens": 78585793.0, "reward": 0.8382049560546875, "reward_std": 0.05150428526103497, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/qatch_metrics/mean": 0.8096528768539428, "rewards/qatch_metrics/std": 0.2925006330013275, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 618.4, "completions/max_terminated_length": 618.4, "completions/mean_length": 219.3421875, "completions/mean_terminated_length": 219.3421875, "completions/min_length": 84.6, "completions/min_terminated_length": 84.6, "epoch": 0.7227853680035258, "grad_norm": 0.0986589707089894, "kl": 0.0170196533203125, "learning_rate": 1e-06, "loss": 0.0022, "num_tokens": 79352135.0, "reward": 0.8465274453163147, "reward_std": 0.05231629386544227, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/qatch_metrics/mean": 0.8194440126419067, "rewards/qatch_metrics/std": 0.3004340440034866, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 410 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 470.6, "completions/max_terminated_length": 470.6, "completions/mean_length": 213.9921875, "completions/mean_terminated_length": 213.9921875, "completions/min_length": 83.6, "completions/min_terminated_length": 83.6, "epoch": 0.7315998237108858, "grad_norm": 0.17969166348358623, "kl": 0.01600341796875, "learning_rate": 1e-06, "loss": 0.0196, "num_tokens": 80093021.0, "reward": 0.7899853944778442, "reward_std": 0.06183199286460876, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/qatch_metrics/mean": 0.7529239773750305, "rewards/qatch_metrics/std": 0.32831716537475586, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 458.4, "completions/max_terminated_length": 458.4, "completions/mean_length": 208.25390625, "completions/mean_terminated_length": 208.25390625, "completions/min_length": 72.6, "completions/min_terminated_length": 72.6, "epoch": 0.7404142794182459, "grad_norm": 0.12360613268228073, "kl": 0.0170166015625, "learning_rate": 1e-06, "loss": -0.0011, "num_tokens": 80810658.0, "reward": 0.8781363725662231, "reward_std": 0.04314489997923374, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/qatch_metrics/mean": 0.8566309928894043, "rewards/qatch_metrics/std": 0.2832080274820328, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 447.2, "completions/max_terminated_length": 447.2, "completions/mean_length": 203.98046875, "completions/mean_terminated_length": 203.98046875, "completions/min_length": 79.2, "completions/min_terminated_length": 79.2, "epoch": 0.749228735125606, "grad_norm": 0.210810313322166, "kl": 0.0164581298828125, "learning_rate": 1e-06, "loss": 0.003, "num_tokens": 81548361.0, "reward": 0.8270991563796997, "reward_std": 0.06941422820091248, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/qatch_metrics/mean": 0.7965872406959533, "rewards/qatch_metrics/std": 0.33117216229438784, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 434.6, "completions/max_terminated_length": 434.6, "completions/mean_length": 220.75703125, "completions/mean_terminated_length": 220.75703125, "completions/min_length": 80.4, "completions/min_terminated_length": 80.4, "epoch": 0.7580431908329661, "grad_norm": 0.21910688267881026, "kl": 0.016754150390625, "learning_rate": 1e-06, "loss": -0.0012, "num_tokens": 82290706.0, "reward": 0.8464880228042603, "reward_std": 0.04884184449911118, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/qatch_metrics/mean": 0.8193976640701294, "rewards/qatch_metrics/std": 0.28375020921230315, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 430 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 451.8, "completions/max_terminated_length": 451.8, "completions/mean_length": 223.1234375, "completions/mean_terminated_length": 223.1234375, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.7668576465403262, "grad_norm": 0.26253720274856984, "kl": 0.0178009033203125, "learning_rate": 1e-06, "loss": -0.0023, "num_tokens": 83056976.0, "reward": 0.8096219301223755, "reward_std": 0.07494284212589264, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/qatch_metrics/mean": 0.7760257959365845, "rewards/qatch_metrics/std": 0.3492628037929535, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 501.2, "completions/max_terminated_length": 501.2, "completions/mean_length": 216.840625, "completions/mean_terminated_length": 216.840625, "completions/min_length": 88.8, "completions/min_terminated_length": 88.8, "epoch": 0.7756721022476862, "grad_norm": 0.27647079947407377, "kl": 0.0181732177734375, "learning_rate": 1e-06, "loss": 0.0012, "num_tokens": 83805044.0, "reward": 0.7776495218276978, "reward_std": 0.056884029135108, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/qatch_metrics/mean": 0.7384112119674683, "rewards/qatch_metrics/std": 0.3683965981006622, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 455.0, "completions/max_terminated_length": 455.0, "completions/mean_length": 216.46015625, "completions/mean_terminated_length": 216.46015625, "completions/min_length": 78.2, "completions/min_terminated_length": 78.2, "epoch": 0.7844865579550463, "grad_norm": 0.20996305667402082, "kl": 0.0163116455078125, "learning_rate": 1e-06, "loss": 0.0063, "num_tokens": 84571313.0, "reward": 0.8477118849754334, "reward_std": 0.06959039457142353, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/qatch_metrics/mean": 0.8208375215530396, "rewards/qatch_metrics/std": 0.30095059871673585, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 443.4, "completions/max_terminated_length": 443.4, "completions/mean_length": 211.00234375, "completions/mean_terminated_length": 211.00234375, "completions/min_length": 86.2, "completions/min_terminated_length": 86.2, "epoch": 0.7933010136624064, "grad_norm": 0.15662206787116065, "kl": 0.0160797119140625, "learning_rate": 1e-06, "loss": 0.0004, "num_tokens": 85319188.0, "reward": 0.8328658938407898, "reward_std": 0.05801869332790375, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/qatch_metrics/mean": 0.8033716320991516, "rewards/qatch_metrics/std": 0.3037038058042526, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 475.2, "completions/max_terminated_length": 475.2, "completions/mean_length": 222.7078125, "completions/mean_terminated_length": 222.7078125, "completions/min_length": 80.8, "completions/min_terminated_length": 80.8, "epoch": 0.8021154693697664, "grad_norm": 0.19919629119501958, "kl": 0.01639404296875, "learning_rate": 1e-06, "loss": 0.001, "num_tokens": 86091774.0, "reward": 0.8358211517333984, "reward_std": 0.0607087716460228, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/qatch_metrics/mean": 0.8068713665008544, "rewards/qatch_metrics/std": 0.30334635376930236, "rewards/tag_count_reward/mean": 0.999609375, "rewards/tag_count_reward/std": 0.00625, "step": 455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 418.4, "completions/max_terminated_length": 418.4, "completions/mean_length": 221.0015625, "completions/mean_terminated_length": 221.0015625, "completions/min_length": 80.4, "completions/min_terminated_length": 80.4, "epoch": 0.8109299250771265, "grad_norm": 0.1419366062228353, "kl": 0.01617431640625, "learning_rate": 1e-06, "loss": 0.0031, "num_tokens": 86848528.0, "reward": 0.8028954148292542, "reward_std": 0.06934207193553447, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/qatch_metrics/mean": 0.7681122660636902, "rewards/qatch_metrics/std": 0.3390295565128326, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 426.6, "completions/max_terminated_length": 426.6, "completions/mean_length": 210.6140625, "completions/mean_terminated_length": 210.6140625, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.8197443807844865, "grad_norm": 0.16116384181364513, "kl": 0.0162078857421875, "learning_rate": 1e-06, "loss": 0.013, "num_tokens": 87564482.0, "reward": 0.8424649000167846, "reward_std": 0.040234316140413284, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/qatch_metrics/mean": 0.8146645903587342, "rewards/qatch_metrics/std": 0.2840981811285019, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 447.6, "completions/max_terminated_length": 447.6, "completions/mean_length": 195.89140625, "completions/mean_terminated_length": 195.89140625, "completions/min_length": 80.8, "completions/min_terminated_length": 80.8, "epoch": 0.8285588364918466, "grad_norm": 0.21075371504226795, "kl": 0.0193115234375, "learning_rate": 1e-06, "loss": 0.001, "num_tokens": 88255159.0, "reward": 0.8565711379051208, "reward_std": 0.06344871073961258, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/qatch_metrics/mean": 0.8312601566314697, "rewards/qatch_metrics/std": 0.3075568675994873, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 470 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 445.4, "completions/max_terminated_length": 445.4, "completions/mean_length": 201.01484375, "completions/mean_terminated_length": 201.01484375, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.8373732921992068, "grad_norm": 0.27204162033836665, "kl": 0.019976806640625, "learning_rate": 1e-06, "loss": -0.0009, "num_tokens": 88945690.0, "reward": 0.8785177230834961, "reward_std": 0.06470721438527108, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/qatch_metrics/mean": 0.8570796966552734, "rewards/qatch_metrics/std": 0.2825317859649658, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 497.4, "completions/max_terminated_length": 497.4, "completions/mean_length": 221.90859375, "completions/mean_terminated_length": 221.90859375, "completions/min_length": 81.2, "completions/min_terminated_length": 81.2, "epoch": 0.8461877479065668, "grad_norm": 0.19323853705899263, "kl": 0.0183746337890625, "learning_rate": 1e-06, "loss": 0.0041, "num_tokens": 89712373.0, "reward": 0.8555493712425232, "reward_std": 0.06230065375566483, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/qatch_metrics/mean": 0.8300580739974975, "rewards/qatch_metrics/std": 0.28706649839878084, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 498.8, "completions/max_terminated_length": 498.8, "completions/mean_length": 228.7890625, "completions/mean_terminated_length": 228.7890625, "completions/min_length": 88.4, "completions/min_terminated_length": 88.4, "epoch": 0.8550022036139269, "grad_norm": 0.24770714763886528, "kl": 0.0176513671875, "learning_rate": 1e-06, "loss": 0.004, "num_tokens": 90520071.0, "reward": 0.8527018785476684, "reward_std": 0.062195781618356705, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/qatch_metrics/mean": 0.8267080783843994, "rewards/qatch_metrics/std": 0.2996180385351181, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 480.6, "completions/max_terminated_length": 480.6, "completions/mean_length": 227.66875, "completions/mean_terminated_length": 227.66875, "completions/min_length": 83.2, "completions/min_terminated_length": 83.2, "epoch": 0.8638166593212869, "grad_norm": 0.16162980170931898, "kl": 0.0188812255859375, "learning_rate": 1e-06, "loss": 0.0006, "num_tokens": 91278479.0, "reward": 0.8309607028961181, "reward_std": 0.0656251635402441, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/qatch_metrics/mean": 0.8011302351951599, "rewards/qatch_metrics/std": 0.31802850365638735, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 490 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 502.4, "completions/max_terminated_length": 502.4, "completions/mean_length": 225.3171875, "completions/mean_terminated_length": 225.3171875, "completions/min_length": 80.4, "completions/min_terminated_length": 80.4, "epoch": 0.872631115028647, "grad_norm": 0.1886973597841831, "kl": 0.01859130859375, "learning_rate": 1e-06, "loss": 0.0052, "num_tokens": 92033173.0, "reward": 0.8441248655319213, "reward_std": 0.043570340052247046, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/qatch_metrics/mean": 0.8166174769401551, "rewards/qatch_metrics/std": 0.30278873145580293, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 443.6, "completions/max_terminated_length": 443.6, "completions/mean_length": 234.475, "completions/mean_terminated_length": 234.475, "completions/min_length": 99.8, "completions/min_terminated_length": 99.8, "epoch": 0.881445570736007, "grad_norm": 0.24444756963754977, "kl": 0.01798095703125, "learning_rate": 1e-06, "loss": 0.007, "num_tokens": 92808293.0, "reward": 0.8517020106315613, "reward_std": 0.06295906975865365, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/qatch_metrics/mean": 0.825531804561615, "rewards/qatch_metrics/std": 0.3100520223379135, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 500 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 440.0, "completions/max_terminated_length": 440.0, "completions/mean_length": 215.0328125, "completions/mean_terminated_length": 215.0328125, "completions/min_length": 84.2, "completions/min_terminated_length": 84.2, "epoch": 0.8902600264433671, "grad_norm": 0.21103775626066984, "kl": 0.0171600341796875, "learning_rate": 1e-06, "loss": 0.0051, "num_tokens": 93563327.0, "reward": 0.8682243466377259, "reward_std": 0.04365142099559307, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/qatch_metrics/mean": 0.8449697852134704, "rewards/qatch_metrics/std": 0.2696381151676178, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 463.4, "completions/max_terminated_length": 463.4, "completions/mean_length": 218.59453125, "completions/mean_terminated_length": 218.59453125, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "epoch": 0.8990744821507272, "grad_norm": 0.20107359914643413, "kl": 0.016455078125, "learning_rate": 1e-06, "loss": 0.0086, "num_tokens": 94333288.0, "reward": 0.8064153909683227, "reward_std": 0.06192653328180313, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/qatch_metrics/mean": 0.772253406047821, "rewards/qatch_metrics/std": 0.3227865040302277, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 510 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 443.0, "completions/max_terminated_length": 443.0, "completions/mean_length": 206.5703125, "completions/mean_terminated_length": 206.5703125, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "epoch": 0.9078889378580872, "grad_norm": 0.10741725097461949, "kl": 0.0163330078125, "learning_rate": 1e-06, "loss": 0.0056, "num_tokens": 95051890.0, "reward": 0.8839513182640075, "reward_std": 0.04564618114382028, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/qatch_metrics/mean": 0.8634721517562867, "rewards/qatch_metrics/std": 0.24794530421495437, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 420.2, "completions/max_terminated_length": 420.2, "completions/mean_length": 193.584375, "completions/mean_terminated_length": 193.584375, "completions/min_length": 74.8, "completions/min_terminated_length": 74.8, "epoch": 0.9167033935654474, "grad_norm": 0.3417922303720187, "kl": 0.0196563720703125, "learning_rate": 1e-06, "loss": 0.0033, "num_tokens": 95755150.0, "reward": 0.8428452134132385, "reward_std": 0.05727057494223118, "rewards/format_reward/mean": 0.99921875, "rewards/format_reward/std": 0.0125, "rewards/qatch_metrics/mean": 0.8152039051055908, "rewards/qatch_metrics/std": 0.31376497745513915, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 520 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 404.8, "completions/max_terminated_length": 404.8, "completions/mean_length": 208.72890625, "completions/mean_terminated_length": 208.72890625, "completions/min_length": 72.2, "completions/min_terminated_length": 72.2, "epoch": 0.9255178492728074, "grad_norm": 0.17161657062686406, "kl": 0.0185943603515625, "learning_rate": 1e-06, "loss": -0.0023, "num_tokens": 96514835.0, "reward": 0.8597602009773254, "reward_std": 0.044371549785137174, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/qatch_metrics/mean": 0.8350119948387146, "rewards/qatch_metrics/std": 0.295586758852005, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 426.8, "completions/max_terminated_length": 426.8, "completions/mean_length": 212.95859375, "completions/mean_terminated_length": 212.95859375, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "epoch": 0.9343323049801675, "grad_norm": 0.22162383692372334, "kl": 0.0186981201171875, "learning_rate": 1e-06, "loss": -0.002, "num_tokens": 97270782.0, "reward": 0.8363440155982971, "reward_std": 0.06691965609788894, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/qatch_metrics/mean": 0.8074635624885559, "rewards/qatch_metrics/std": 0.3064163327217102, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 530 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 454.6, "completions/max_terminated_length": 454.6, "completions/mean_length": 233.40234375, "completions/mean_terminated_length": 233.40234375, "completions/min_length": 76.4, "completions/min_terminated_length": 76.4, "epoch": 0.9431467606875276, "grad_norm": 0.1434511776519399, "kl": 0.019879150390625, "learning_rate": 1e-06, "loss": 0.0023, "num_tokens": 98016705.0, "reward": 0.8363542199134827, "reward_std": 0.05200971700251102, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/qatch_metrics/mean": 0.8074755430221557, "rewards/qatch_metrics/std": 0.2885085940361023, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 444.6, "completions/max_terminated_length": 444.6, "completions/mean_length": 235.70625, "completions/mean_terminated_length": 235.70625, "completions/min_length": 80.6, "completions/min_terminated_length": 80.6, "epoch": 0.9519612163948876, "grad_norm": 0.09221258199209693, "kl": 0.018701171875, "learning_rate": 1e-06, "loss": 0.0038, "num_tokens": 98787193.0, "reward": 0.8677037119865417, "reward_std": 0.057669999450445174, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/qatch_metrics/mean": 0.8443572998046875, "rewards/qatch_metrics/std": 0.288933590054512, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 540 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 455.2, "completions/max_terminated_length": 455.2, "completions/mean_length": 222.11875, "completions/mean_terminated_length": 222.11875, "completions/min_length": 74.6, "completions/min_terminated_length": 74.6, "epoch": 0.9607756721022477, "grad_norm": 0.1352237905149159, "kl": 0.018145751953125, "learning_rate": 1e-06, "loss": -0.0031, "num_tokens": 99532081.0, "reward": 0.8805891752243042, "reward_std": 0.05483146589249373, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/qatch_metrics/mean": 0.8595166802406311, "rewards/qatch_metrics/std": 0.25585181415081026, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 472.6, "completions/max_terminated_length": 472.6, "completions/mean_length": 218.659375, "completions/mean_terminated_length": 218.659375, "completions/min_length": 86.2, "completions/min_terminated_length": 86.2, "epoch": 0.9695901278096077, "grad_norm": 0.16904630982662794, "kl": 0.01783447265625, "learning_rate": 1e-06, "loss": 0.0016, "num_tokens": 100246573.0, "reward": 0.8569401383399964, "reward_std": 0.07272802218794823, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/qatch_metrics/mean": 0.8316942691802979, "rewards/qatch_metrics/std": 0.3041912466287613, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 550 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 459.8, "completions/max_terminated_length": 459.8, "completions/mean_length": 221.78984375, "completions/mean_terminated_length": 221.78984375, "completions/min_length": 77.8, "completions/min_terminated_length": 77.8, "epoch": 0.9784045835169678, "grad_norm": 0.31854687165087076, "kl": 0.0183258056640625, "learning_rate": 1e-06, "loss": -0.0058, "num_tokens": 100996640.0, "reward": 0.8102917551994324, "reward_std": 0.07570969834923744, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/qatch_metrics/mean": 0.7768138289451599, "rewards/qatch_metrics/std": 0.34436498284339906, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 490.6, "completions/max_terminated_length": 490.6, "completions/mean_length": 230.28828125, "completions/mean_terminated_length": 230.28828125, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.9872190392243279, "grad_norm": 0.16545798735816303, "kl": 0.01719970703125, "learning_rate": 1e-06, "loss": 0.0054, "num_tokens": 101777473.0, "reward": 0.854366683959961, "reward_std": 0.050544672086834906, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/qatch_metrics/mean": 0.8286666750907898, "rewards/qatch_metrics/std": 0.3027670204639435, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 560 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 460.0, "completions/max_terminated_length": 460.0, "completions/mean_length": 232.278125, "completions/mean_terminated_length": 232.278125, "completions/min_length": 79.2, "completions/min_terminated_length": 79.2, "epoch": 0.996033494931688, "grad_norm": 0.2064718967348405, "kl": 0.020306396484375, "learning_rate": 1e-06, "loss": -0.0052, "num_tokens": 102547669.0, "reward": 0.7918175339698792, "reward_std": 0.05684706475585699, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/qatch_metrics/mean": 0.755079448223114, "rewards/qatch_metrics/std": 0.3250477254390717, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 468.5, "completions/max_terminated_length": 468.5, "completions/mean_length": 214.265625, "completions/mean_terminated_length": 214.265625, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "epoch": 0.999559277214632, "kl": 0.01806640625, "num_tokens": 102823629.0, "reward": 0.8797399699687958, "reward_std": 0.056224397383630276, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/qatch_metrics/mean": 0.858517587184906, "rewards/qatch_metrics/std": 0.26497258245944977, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 567, "total_flos": 0.0, "train_loss": -1.6490349831877564e-05, "train_runtime": 5804.9117, "train_samples_per_second": 1.564, "train_steps_per_second": 0.098 } ], "logging_steps": 5, "max_steps": 567, "num_input_tokens_seen": 102823629, "num_train_epochs": 1, "save_steps": 5, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }