Think2SQL-14B / trainer_state.json

Commit folder

27de8e9 verified 8 months ago

125 kB

	{
	"best_global_step": null,
	"best_metric": null,
	"best_model_checkpoint": null,
	"epoch": 0.999559277214632,
	"eval_steps": 500,
	"global_step": 567,
	"is_hyper_param_search": false,
	"is_local_process_zero": true,
	"is_world_process_zero": true,
	"log_history": [
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 477.0,
	"completions/max_terminated_length": 477.0,
	"completions/mean_length": 175.50390625,
	"completions/mean_terminated_length": 175.50390625,
	"completions/min_length": 21.0,
	"completions/min_terminated_length": 21.0,
	"epoch": 0.0017628911414720142,
	"grad_norm": 1.0880173896572545,
	"kl": 0.0,
	"learning_rate": 0.0,
	"loss": -0.327,
	"num_tokens": 129409.0,
	"reward": 0.814777672290802,
	"reward_std": 0.14736539125442505,
	"rewards/format_reward/mean": 0.68359375,
	"rewards/format_reward/std": 0.4659844934940338,
	"rewards/qatch_metrics/mean": 0.8332747220993042,
	"rewards/qatch_metrics/std": 0.3284282088279724,
	"rewards/tag_count_reward/mean": 0.7626953125,
	"rewards/tag_count_reward/std": 0.34948837757110596,
	"step": 1
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 421.0,
	"completions/max_terminated_length": 421.0,
	"completions/mean_length": 177.318359375,
	"completions/mean_terminated_length": 177.318359375,
	"completions/min_length": 21.5,
	"completions/min_terminated_length": 21.5,
	"epoch": 0.00881445570736007,
	"grad_norm": 0.9499188530188546,
	"kl": 0.00019824504852294922,
	"learning_rate": 7.017543859649122e-08,
	"loss": -0.2902,
	"num_tokens": 685703.0,
	"reward": 0.762174516916275,
	"reward_std": 0.15002675727009773,
	"rewards/format_reward/mean": 0.7265625,
	"rewards/format_reward/std": 0.4450720399618149,
	"rewards/qatch_metrics/mean": 0.7644235193729401,
	"rewards/qatch_metrics/std": 0.3610532283782959,
	"rewards/tag_count_reward/mean": 0.795166015625,
	"rewards/tag_count_reward/std": 0.33385463058948517,
	"step": 5
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 438.8,
	"completions/max_terminated_length": 438.8,
	"completions/mean_length": 173.41171875,
	"completions/mean_terminated_length": 173.41171875,
	"completions/min_length": 21.8,
	"completions/min_terminated_length": 21.8,
	"epoch": 0.01762891141472014,
	"grad_norm": 0.9346895582900878,
	"kl": 0.00028295516967773436,
	"learning_rate": 1.5789473684210525e-07,
	"loss": -0.2591,
	"num_tokens": 1398566.0,
	"reward": 0.7710299372673035,
	"reward_std": 0.1539353460073471,
	"rewards/format_reward/mean": 0.71796875,
	"rewards/format_reward/std": 0.4487275779247284,
	"rewards/qatch_metrics/mean": 0.7762346506118775,
	"rewards/qatch_metrics/std": 0.3281721532344818,
	"rewards/tag_count_reward/mean": 0.788671875,
	"rewards/tag_count_reward/std": 0.33627479076385497,
	"step": 10
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 438.2,
	"completions/max_terminated_length": 438.2,
	"completions/mean_length": 183.1796875,
	"completions/mean_terminated_length": 183.1796875,
	"completions/min_length": 20.0,
	"completions/min_terminated_length": 20.0,
	"epoch": 0.026443367122080213,
	"grad_norm": 0.7943318239924386,
	"kl": 0.00037631988525390627,
	"learning_rate": 2.456140350877193e-07,
	"loss": -0.2603,
	"num_tokens": 2071996.0,
	"reward": 0.7256837129592896,
	"reward_std": 0.12991088777780532,
	"rewards/format_reward/mean": 0.765625,
	"rewards/format_reward/std": 0.4240167737007141,
	"rewards/qatch_metrics/mean": 0.7151770830154419,
	"rewards/qatch_metrics/std": 0.37596395611763,
	"rewards/tag_count_reward/mean": 0.8244140625,
	"rewards/tag_count_reward/std": 0.31790287494659425,
	"step": 15
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 479.6,
	"completions/max_terminated_length": 479.6,
	"completions/mean_length": 201.30234375,
	"completions/mean_terminated_length": 201.30234375,
	"completions/min_length": 21.2,
	"completions/min_terminated_length": 21.2,
	"epoch": 0.03525782282944028,
	"grad_norm": 0.4721344642723057,
	"kl": 0.00091400146484375,
	"learning_rate": 3.333333333333333e-07,
	"loss": -0.1315,
	"num_tokens": 2791247.0,
	"reward": 0.8173989057540894,
	"reward_std": 0.12794919013977052,
	"rewards/format_reward/mean": 0.89765625,
	"rewards/format_reward/std": 0.29814977645874025,
	"rewards/qatch_metrics/mean": 0.8017192721366883,
	"rewards/qatch_metrics/std": 0.331482595205307,
	"rewards/tag_count_reward/mean": 0.9234375,
	"rewards/tag_count_reward/std": 0.22307254374027252,
	"step": 20
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 432.2,
	"completions/max_terminated_length": 432.2,
	"completions/mean_length": 221.4625,
	"completions/mean_terminated_length": 221.4625,
	"completions/min_length": 51.4,
	"completions/min_terminated_length": 51.4,
	"epoch": 0.044072278536800354,
	"grad_norm": 0.29592079815815686,
	"kl": 0.0016038894653320312,
	"learning_rate": 4.2105263157894733e-07,
	"loss": -0.0424,
	"num_tokens": 3536975.0,
	"reward": 0.7564297676086426,
	"reward_std": 0.08200130835175515,
	"rewards/format_reward/mean": 0.96953125,
	"rewards/format_reward/std": 0.13422587364912034,
	"rewards/qatch_metrics/mean": 0.7183640837669373,
	"rewards/qatch_metrics/std": 0.3674669623374939,
	"rewards/tag_count_reward/mean": 0.97734375,
	"rewards/tag_count_reward/std": 0.09909781143069267,
	"step": 25
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 445.6,
	"completions/max_terminated_length": 445.6,
	"completions/mean_length": 216.53984375,
	"completions/mean_terminated_length": 216.53984375,
	"completions/min_length": 77.0,
	"completions/min_terminated_length": 77.0,
	"epoch": 0.052886734244160426,
	"grad_norm": 0.275794455786416,
	"kl": 0.0034694671630859375,
	"learning_rate": 5.087719298245614e-07,
	"loss": 0.002,
	"num_tokens": 4281330.0,
	"reward": 0.7764788866043091,
	"reward_std": 0.09769791960716248,
	"rewards/format_reward/mean": 0.9953125,
	"rewards/format_reward/std": 0.06028594672679901,
	"rewards/qatch_metrics/mean": 0.7377692699432373,
	"rewards/qatch_metrics/std": 0.3548368811607361,
	"rewards/tag_count_reward/mean": 0.996875,
	"rewards/tag_count_reward/std": 0.04124387204647064,
	"step": 30
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 445.8,
	"completions/max_terminated_length": 445.8,
	"completions/mean_length": 220.11796875,
	"completions/mean_terminated_length": 220.11796875,
	"completions/min_length": 59.8,
	"completions/min_terminated_length": 59.8,
	"epoch": 0.06170118995152049,
	"grad_norm": 0.2691159080285212,
	"kl": 0.005501174926757812,
	"learning_rate": 5.964912280701754e-07,
	"loss": -0.0083,
	"num_tokens": 5008025.0,
	"reward": 0.8268720507621765,
	"reward_std": 0.08243840038776398,
	"rewards/format_reward/mean": 0.99609375,
	"rewards/format_reward/std": 0.0625,
	"rewards/qatch_metrics/mean": 0.7969059944152832,
	"rewards/qatch_metrics/std": 0.30500164330005647,
	"rewards/tag_count_reward/mean": 0.9978515625,
	"rewards/tag_count_reward/std": 0.03437500074505806,
	"step": 35
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 487.2,
	"completions/max_terminated_length": 487.2,
	"completions/mean_length": 227.76015625,
	"completions/mean_terminated_length": 227.76015625,
	"completions/min_length": 83.4,
	"completions/min_terminated_length": 83.4,
	"epoch": 0.07051564565888056,
	"grad_norm": 0.33908836616855625,
	"kl": 0.002800750732421875,
	"learning_rate": 6.842105263157895e-07,
	"loss": 0.0002,
	"num_tokens": 5774806.0,
	"reward": 0.7647829532623291,
	"reward_std": 0.09533883556723595,
	"rewards/format_reward/mean": 0.9984375,
	"rewards/format_reward/std": 0.025,
	"rewards/qatch_metrics/mean": 0.7235268354415894,
	"rewards/qatch_metrics/std": 0.35323665738105775,
	"rewards/tag_count_reward/mean": 0.998828125,
	"rewards/tag_count_reward/std": 0.01875,
	"step": 40
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 476.2,
	"completions/max_terminated_length": 476.2,
	"completions/mean_length": 221.7984375,
	"completions/mean_terminated_length": 221.7984375,
	"completions/min_length": 83.4,
	"completions/min_terminated_length": 83.4,
	"epoch": 0.07933010136624064,
	"grad_norm": 0.3262303740341099,
	"kl": 0.00310516357421875,
	"learning_rate": 7.719298245614034e-07,
	"loss": 0.0104,
	"num_tokens": 6557268.0,
	"reward": 0.7565465092658996,
	"reward_std": 0.09911727011203766,
	"rewards/format_reward/mean": 1.0,
	"rewards/format_reward/std": 0.0,
	"rewards/qatch_metrics/mean": 0.7135841250419617,
	"rewards/qatch_metrics/std": 0.37862626910209657,
	"rewards/tag_count_reward/mean": 1.0,
	"rewards/tag_count_reward/std": 0.0,
	"step": 45
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 512.8,
	"completions/max_terminated_length": 512.8,
	"completions/mean_length": 228.45546875,
	"completions/mean_terminated_length": 228.45546875,
	"completions/min_length": 76.0,
	"completions/min_terminated_length": 76.0,
	"epoch": 0.08814455707360071,
	"grad_norm": 0.23276410584015308,
	"kl": 0.00273895263671875,
	"learning_rate": 8.596491228070175e-07,
	"loss": -0.0018,
	"num_tokens": 7327499.0,
	"reward": 0.7988326072692871,
	"reward_std": 0.06667622029781342,
	"rewards/format_reward/mean": 0.9984375,
	"rewards/format_reward/std": 0.025,
	"rewards/qatch_metrics/mean": 0.7635622501373291,
	"rewards/qatch_metrics/std": 0.369570130109787,
	"rewards/tag_count_reward/mean": 0.99921875,
	"rewards/tag_count_reward/std": 0.0125,
	"step": 50
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 483.8,
	"completions/max_terminated_length": 483.8,
	"completions/mean_length": 220.52734375,
	"completions/mean_terminated_length": 220.52734375,
	"completions/min_length": 81.2,
	"completions/min_terminated_length": 81.2,
	"epoch": 0.09695901278096078,
	"grad_norm": 0.28218074028465906,
	"kl": 0.00196533203125,
	"learning_rate": 9.473684210526315e-07,
	"loss": -0.0021,
	"num_tokens": 8077390.0,
	"reward": 0.8159880757331848,
	"reward_std": 0.10231453701853752,
	"rewards/format_reward/mean": 1.0,
	"rewards/format_reward/std": 0.0,
	"rewards/qatch_metrics/mean": 0.7835153818130494,
	"rewards/qatch_metrics/std": 0.33782891631126405,
	"rewards/tag_count_reward/mean": 1.0,
	"rewards/tag_count_reward/std": 0.0,
	"step": 55
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 481.2,
	"completions/max_terminated_length": 481.2,
	"completions/mean_length": 223.60703125,
	"completions/mean_terminated_length": 223.60703125,
	"completions/min_length": 75.6,
	"completions/min_terminated_length": 75.6,
	"epoch": 0.10577346848832085,
	"grad_norm": 0.23258401790732933,
	"kl": 0.00223388671875,
	"learning_rate": 1e-06,
	"loss": -0.0045,
	"num_tokens": 8800407.0,
	"reward": 0.74871985912323,
	"reward_std": 0.09312780797481537,
	"rewards/format_reward/mean": 1.0,
	"rewards/format_reward/std": 0.0,
	"rewards/qatch_metrics/mean": 0.7043763160705566,
	"rewards/qatch_metrics/std": 0.39227073788642886,
	"rewards/tag_count_reward/mean": 1.0,
	"rewards/tag_count_reward/std": 0.0,
	"step": 60
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 487.8,
	"completions/max_terminated_length": 487.8,
	"completions/mean_length": 222.81015625,
	"completions/mean_terminated_length": 222.81015625,
	"completions/min_length": 77.4,
	"completions/min_terminated_length": 77.4,
	"epoch": 0.11458792419568092,
	"grad_norm": 0.22445170455470606,
	"kl": 0.002956390380859375,
	"learning_rate": 1e-06,
	"loss": 0.0057,
	"num_tokens": 9557380.0,
	"reward": 0.8077908515930176,
	"reward_std": 0.09828853458166123,
	"rewards/format_reward/mean": 0.9984375,
	"rewards/format_reward/std": 0.025,
	"rewards/qatch_metrics/mean": 0.774078369140625,
	"rewards/qatch_metrics/std": 0.33206661343574523,
	"rewards/tag_count_reward/mean": 0.999609375,
	"rewards/tag_count_reward/std": 0.00625,
	"step": 65
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 492.6,
	"completions/max_terminated_length": 492.6,
	"completions/mean_length": 231.83984375,
	"completions/mean_terminated_length": 231.83984375,
	"completions/min_length": 94.6,
	"completions/min_terminated_length": 94.6,
	"epoch": 0.12340237990304098,
	"grad_norm": 0.22832903725685313,
	"kl": 0.00381317138671875,
	"learning_rate": 1e-06,
	"loss": 0.0025,
	"num_tokens": 10339127.0,
	"reward": 0.7895300030708313,
	"reward_std": 0.10415169298648834,
	"rewards/format_reward/mean": 0.9984375,
	"rewards/format_reward/std": 0.025,
	"rewards/qatch_metrics/mean": 0.7526065230369567,
	"rewards/qatch_metrics/std": 0.3542828977108002,
	"rewards/tag_count_reward/mean": 0.9994140625,
	"rewards/tag_count_reward/std": 0.009375,
	"step": 70
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 521.8,
	"completions/max_terminated_length": 521.8,
	"completions/mean_length": 236.3125,
	"completions/mean_terminated_length": 236.3125,
	"completions/min_length": 80.4,
	"completions/min_terminated_length": 80.4,
	"epoch": 0.13221683561040107,
	"grad_norm": 0.2597151805235052,
	"kl": 0.00432281494140625,
	"learning_rate": 1e-06,
	"loss": 0.0083,
	"num_tokens": 11147287.0,
	"reward": 0.7333161950111389,
	"reward_std": 0.08832715749740601,
	"rewards/format_reward/mean": 1.0,
	"rewards/format_reward/std": 0.0,
	"rewards/qatch_metrics/mean": 0.6862887978553772,
	"rewards/qatch_metrics/std": 0.36336439847946167,
	"rewards/tag_count_reward/mean": 0.9994140625,
	"rewards/tag_count_reward/std": 0.0069767430424690245,
	"step": 75
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 445.6,
	"completions/max_terminated_length": 445.6,
	"completions/mean_length": 216.43984375,
	"completions/mean_terminated_length": 216.43984375,
	"completions/min_length": 87.8,
	"completions/min_terminated_length": 87.8,
	"epoch": 0.14103129131776113,
	"grad_norm": 0.2463929158667687,
	"kl": 0.00528717041015625,
	"learning_rate": 1e-06,
	"loss": 0.0044,
	"num_tokens": 11891066.0,
	"reward": 0.8300724029541016,
	"reward_std": 0.09615504890680313,
	"rewards/format_reward/mean": 1.0,
	"rewards/format_reward/std": 0.0,
	"rewards/qatch_metrics/mean": 0.8000851631164551,
	"rewards/qatch_metrics/std": 0.3208737909793854,
	"rewards/tag_count_reward/mean": 1.0,
	"rewards/tag_count_reward/std": 0.0,
	"step": 80
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 491.6,
	"completions/max_terminated_length": 491.6,
	"completions/mean_length": 225.32890625,
	"completions/mean_terminated_length": 225.32890625,
	"completions/min_length": 86.2,
	"completions/min_terminated_length": 86.2,
	"epoch": 0.1498457470251212,
	"grad_norm": 0.22719354366888944,
	"kl": 0.005328369140625,
	"learning_rate": 1e-06,
	"loss": 0.0129,
	"num_tokens": 12668159.0,
	"reward": 0.816937243938446,
	"reward_std": 0.08283708170056343,
	"rewards/format_reward/mean": 1.0,
	"rewards/format_reward/std": 0.0,
	"rewards/qatch_metrics/mean": 0.7846320390701294,
	"rewards/qatch_metrics/std": 0.32469419240951536,
	"rewards/tag_count_reward/mean": 1.0,
	"rewards/tag_count_reward/std": 0.0,
	"step": 85
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 460.4,
	"completions/max_terminated_length": 460.4,
	"completions/mean_length": 217.92890625,
	"completions/mean_terminated_length": 217.92890625,
	"completions/min_length": 76.2,
	"completions/min_terminated_length": 76.2,
	"epoch": 0.15866020273248127,
	"grad_norm": 0.2721517170479785,
	"kl": 0.00579071044921875,
	"learning_rate": 1e-06,
	"loss": 0.0117,
	"num_tokens": 13413588.0,
	"reward": 0.7426301956176757,
	"reward_std": 0.0905102699995041,
	"rewards/format_reward/mean": 1.0,
	"rewards/format_reward/std": 0.0,
	"rewards/qatch_metrics/mean": 0.6972119808197021,
	"rewards/qatch_metrics/std": 0.37120566368103025,
	"rewards/tag_count_reward/mean": 1.0,
	"rewards/tag_count_reward/std": 0.0,
	"step": 90
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 428.6,
	"completions/max_terminated_length": 428.6,
	"completions/mean_length": 204.6640625,
	"completions/mean_terminated_length": 204.6640625,
	"completions/min_length": 75.6,
	"completions/min_terminated_length": 75.6,
	"epoch": 0.16747465843984133,
	"grad_norm": 0.2525985499058037,
	"kl": 0.0056243896484375,
	"learning_rate": 1e-06,
	"loss": -0.0012,
	"num_tokens": 14111606.0,
	"reward": 0.7979554295539856,
	"reward_std": 0.06609301418066024,
	"rewards/format_reward/mean": 1.0,
	"rewards/format_reward/std": 0.0,
	"rewards/qatch_metrics/mean": 0.7623119950294495,
	"rewards/qatch_metrics/std": 0.34469759464263916,
	"rewards/tag_count_reward/mean": 0.9998046875,
	"rewards/tag_count_reward/std": 0.003125,
	"step": 95
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 431.0,
	"completions/max_terminated_length": 431.0,
	"completions/mean_length": 212.34765625,
	"completions/mean_terminated_length": 212.34765625,
	"completions/min_length": 69.2,
	"completions/min_terminated_length": 69.2,
	"epoch": 0.17628911414720141,
	"grad_norm": 0.30357672091416305,
	"kl": 0.0057861328125,
	"learning_rate": 1e-06,
	"loss": 0.0083,
	"num_tokens": 14876659.0,
	"reward": 0.7724857568740845,
	"reward_std": 0.09265935122966766,
	"rewards/format_reward/mean": 1.0,
	"rewards/format_reward/std": 0.0,
	"rewards/qatch_metrics/mean": 0.7323476672172546,
	"rewards/qatch_metrics/std": 0.33567925691604616,
	"rewards/tag_count_reward/mean": 0.9998046875,
	"rewards/tag_count_reward/std": 0.003125,
	"step": 100
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 463.4,
	"completions/max_terminated_length": 463.4,
	"completions/mean_length": 216.46875,
	"completions/mean_terminated_length": 216.46875,
	"completions/min_length": 80.4,
	"completions/min_terminated_length": 80.4,
	"epoch": 0.18510356985456147,
	"grad_norm": 0.23780324977532238,
	"kl": 0.0056549072265625,
	"learning_rate": 1e-06,
	"loss": -0.0087,
	"num_tokens": 15600331.0,
	"reward": 0.7508906722068787,
	"reward_std": 0.0951332688331604,
	"rewards/format_reward/mean": 1.0,
	"rewards/format_reward/std": 0.0,
	"rewards/qatch_metrics/mean": 0.7069302201271057,
	"rewards/qatch_metrics/std": 0.38108278512954713,
	"rewards/tag_count_reward/mean": 1.0,
	"rewards/tag_count_reward/std": 0.0,
	"step": 105
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 442.4,
	"completions/max_terminated_length": 442.4,
	"completions/mean_length": 216.578125,
	"completions/mean_terminated_length": 216.578125,
	"completions/min_length": 80.2,
	"completions/min_terminated_length": 80.2,
	"epoch": 0.19391802556192156,
	"grad_norm": 0.21716869090526136,
	"kl": 0.0054229736328125,
	"learning_rate": 1e-06,
	"loss": -0.0045,
	"num_tokens": 16326015.0,
	"reward": 0.8402611017227173,
	"reward_std": 0.05716411247849464,
	"rewards/format_reward/mean": 1.0,
	"rewards/format_reward/std": 0.0,
	"rewards/qatch_metrics/mean": 0.8120718836784363,
	"rewards/qatch_metrics/std": 0.2929441839456558,
	"rewards/tag_count_reward/mean": 1.0,
	"rewards/tag_count_reward/std": 0.0,
	"step": 110
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 428.2,
	"completions/max_terminated_length": 428.2,
	"completions/mean_length": 222.0265625,
	"completions/mean_terminated_length": 222.0265625,
	"completions/min_length": 78.0,
	"completions/min_terminated_length": 78.0,
	"epoch": 0.20273248126928162,
	"grad_norm": 0.22835452896575356,
	"kl": 0.0060882568359375,
	"learning_rate": 1e-06,
	"loss": -0.0017,
	"num_tokens": 17091921.0,
	"reward": 0.8265595078468323,
	"reward_std": 0.07398260906338691,
	"rewards/format_reward/mean": 1.0,
	"rewards/format_reward/std": 0.0,
	"rewards/qatch_metrics/mean": 0.7959523558616638,
	"rewards/qatch_metrics/std": 0.3277123510837555,
	"rewards/tag_count_reward/mean": 1.0,
	"rewards/tag_count_reward/std": 0.0,
	"step": 115
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 459.8,
	"completions/max_terminated_length": 459.8,
	"completions/mean_length": 220.7453125,
	"completions/mean_terminated_length": 220.7453125,
	"completions/min_length": 87.0,
	"completions/min_terminated_length": 87.0,
	"epoch": 0.2115469369766417,
	"grad_norm": 0.22726862373109216,
	"kl": 0.006689453125,
	"learning_rate": 1e-06,
	"loss": 0.0043,
	"num_tokens": 17877371.0,
	"reward": 0.8397867679595947,
	"reward_std": 0.09087342023849487,
	"rewards/format_reward/mean": 1.0,
	"rewards/format_reward/std": 0.0,
	"rewards/qatch_metrics/mean": 0.8115137934684753,
	"rewards/qatch_metrics/std": 0.3017837733030319,
	"rewards/tag_count_reward/mean": 1.0,
	"rewards/tag_count_reward/std": 0.0,
	"step": 120
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 491.4,
	"completions/max_terminated_length": 491.4,
	"completions/mean_length": 225.2140625,
	"completions/mean_terminated_length": 225.2140625,
	"completions/min_length": 75.4,
	"completions/min_terminated_length": 75.4,
	"epoch": 0.22036139268400176,
	"grad_norm": 0.2004953082769917,
	"kl": 0.00776519775390625,
	"learning_rate": 1e-06,
	"loss": -0.0056,
	"num_tokens": 18623005.0,
	"reward": 0.8202541828155517,
	"reward_std": 0.07537120208144188,
	"rewards/format_reward/mean": 0.99921875,
	"rewards/format_reward/std": 0.0125,
	"rewards/qatch_metrics/mean": 0.7886492252349854,
	"rewards/qatch_metrics/std": 0.32776339948177335,
	"rewards/tag_count_reward/mean": 0.999609375,
	"rewards/tag_count_reward/std": 0.00625,
	"step": 125
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 456.8,
	"completions/max_terminated_length": 456.8,
	"completions/mean_length": 223.48203125,
	"completions/mean_terminated_length": 223.48203125,
	"completions/min_length": 78.2,
	"completions/min_terminated_length": 78.2,
	"epoch": 0.22917584839136185,
	"grad_norm": 0.2341532579835068,
	"kl": 0.00804290771484375,
	"learning_rate": 1e-06,
	"loss": 0.0096,
	"num_tokens": 19349606.0,
	"reward": 0.8026262044906616,
	"reward_std": 0.06839245334267616,
	"rewards/format_reward/mean": 0.99921875,
	"rewards/format_reward/std": 0.0125,
	"rewards/qatch_metrics/mean": 0.7679218888282776,
	"rewards/qatch_metrics/std": 0.3324147403240204,
	"rewards/tag_count_reward/mean": 0.9994140625,
	"rewards/tag_count_reward/std": 0.009375,
	"step": 130
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 458.4,
	"completions/max_terminated_length": 458.4,
	"completions/mean_length": 216.72578125,
	"completions/mean_terminated_length": 216.72578125,
	"completions/min_length": 86.2,
	"completions/min_terminated_length": 86.2,
	"epoch": 0.2379903040987219,
	"grad_norm": 0.23655650548465582,
	"kl": 0.0078033447265625,
	"learning_rate": 1e-06,
	"loss": 0.0007,
	"num_tokens": 20092311.0,
	"reward": 0.8197526335716248,
	"reward_std": 0.0839143767952919,
	"rewards/format_reward/mean": 1.0,
	"rewards/format_reward/std": 0.0,
	"rewards/qatch_metrics/mean": 0.7879442930221557,
	"rewards/qatch_metrics/std": 0.3431123554706573,
	"rewards/tag_count_reward/mean": 1.0,
	"rewards/tag_count_reward/std": 0.0,
	"step": 135
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 454.6,
	"completions/max_terminated_length": 454.6,
	"completions/mean_length": 204.48984375,
	"completions/mean_terminated_length": 204.48984375,
	"completions/min_length": 79.0,
	"completions/min_terminated_length": 79.0,
	"epoch": 0.24680475980608196,
	"grad_norm": 0.2641797202959811,
	"kl": 0.00862884521484375,
	"learning_rate": 1e-06,
	"loss": 0.0051,
	"num_tokens": 20821962.0,
	"reward": 0.8242111682891846,
	"reward_std": 0.07407020255923272,
	"rewards/format_reward/mean": 1.0,
	"rewards/format_reward/std": 0.0,
	"rewards/qatch_metrics/mean": 0.7931895971298217,
	"rewards/qatch_metrics/std": 0.3176054835319519,
	"rewards/tag_count_reward/mean": 1.0,
	"rewards/tag_count_reward/std": 0.0,
	"step": 140
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 443.2,
	"completions/max_terminated_length": 443.2,
	"completions/mean_length": 203.590625,
	"completions/mean_terminated_length": 203.590625,
	"completions/min_length": 86.6,
	"completions/min_terminated_length": 86.6,
	"epoch": 0.255619215513442,
	"grad_norm": 0.263066002535131,
	"kl": 0.009637451171875,
	"learning_rate": 1e-06,
	"loss": 0.0065,
	"num_tokens": 21526046.0,
	"reward": 0.7875781059265137,
	"reward_std": 0.09901705384254456,
	"rewards/format_reward/mean": 1.0,
	"rewards/format_reward/std": 0.0,
	"rewards/qatch_metrics/mean": 0.7501148462295533,
	"rewards/qatch_metrics/std": 0.3672972857952118,
	"rewards/tag_count_reward/mean": 0.999609375,
	"rewards/tag_count_reward/std": 0.00625,
	"step": 145
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 448.4,
	"completions/max_terminated_length": 448.4,
	"completions/mean_length": 208.90546875,
	"completions/mean_terminated_length": 208.90546875,
	"completions/min_length": 73.2,
	"completions/min_terminated_length": 73.2,
	"epoch": 0.26443367122080214,
	"grad_norm": 0.2798500312218402,
	"kl": 0.01026153564453125,
	"learning_rate": 1e-06,
	"loss": 0.0003,
	"num_tokens": 22271333.0,
	"reward": 0.818337082862854,
	"reward_std": 0.07784928977489472,
	"rewards/format_reward/mean": 1.0,
	"rewards/format_reward/std": 0.0,
	"rewards/qatch_metrics/mean": 0.7862788915634156,
	"rewards/qatch_metrics/std": 0.3341992735862732,
	"rewards/tag_count_reward/mean": 1.0,
	"rewards/tag_count_reward/std": 0.0,
	"step": 150
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 494.2,
	"completions/max_terminated_length": 494.2,
	"completions/mean_length": 209.651953125,
	"completions/mean_terminated_length": 209.651953125,
	"completions/min_length": 73.0,
	"completions/min_terminated_length": 73.0,
	"epoch": 0.5464962538563244,
	"grad_norm": 0.2122029879190087,
	"kl": 0.010993194580078126,
	"learning_rate": 1e-06,
	"loss": 0.0126,
	"num_tokens": 23726666.0,
	"reward": 0.811666476726532,
	"reward_std": 0.0841904804110527,
	"rewards/format_reward/mean": 1.0,
	"rewards/format_reward/std": 0.0,
	"rewards/qatch_metrics/mean": 0.7784311413764954,
	"rewards/qatch_metrics/std": 0.32770459055900575,
	"rewards/tag_count_reward/mean": 1.0,
	"rewards/tag_count_reward/std": 0.0,
	"step": 155
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 452.6,
	"completions/max_terminated_length": 452.6,
	"completions/mean_length": 217.9859375,
	"completions/mean_terminated_length": 217.9859375,
	"completions/min_length": 75.8,
	"completions/min_terminated_length": 75.8,
	"epoch": 0.5641251652710445,
	"grad_norm": 0.15403477284537095,
	"kl": 0.00980377197265625,
	"learning_rate": 1e-06,
	"loss": 0.0008,
	"num_tokens": 25239750.0,
	"reward": 0.7868865132331848,
	"reward_std": 0.07244862839579583,
	"rewards/format_reward/mean": 1.0,
	"rewards/format_reward/std": 0.0,
	"rewards/qatch_metrics/mean": 0.7492782354354859,
	"rewards/qatch_metrics/std": 0.3493395745754242,
	"rewards/tag_count_reward/mean": 1.0,
	"rewards/tag_count_reward/std": 0.0,
	"step": 160
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 511.0,
	"completions/max_terminated_length": 511.0,
	"completions/mean_length": 208.38984375,
	"completions/mean_terminated_length": 208.38984375,
	"completions/min_length": 58.4,
	"completions/min_terminated_length": 58.4,
	"epoch": 0.5817540766857646,
	"grad_norm": 0.18706575889421317,
	"kl": 0.00914154052734375,
	"learning_rate": 1e-06,
	"loss": 0.0072,
	"num_tokens": 26687596.0,
	"reward": 0.828769075870514,
	"reward_std": 0.07729479111731052,
	"rewards/format_reward/mean": 1.0,
	"rewards/format_reward/std": 0.0,
	"rewards/qatch_metrics/mean": 0.7985518336296081,
	"rewards/qatch_metrics/std": 0.29670341312885284,
	"rewards/tag_count_reward/mean": 1.0,
	"rewards/tag_count_reward/std": 0.0,
	"step": 165
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 505.0,
	"completions/max_terminated_length": 505.0,
	"completions/mean_length": 206.281640625,
	"completions/mean_terminated_length": 206.281640625,
	"completions/min_length": 79.0,
	"completions/min_terminated_length": 79.0,
	"epoch": 0.5993829881004848,
	"grad_norm": 0.19776450858978561,
	"kl": 0.01090240478515625,
	"learning_rate": 1e-06,
	"loss": 0.0105,
	"num_tokens": 28175773.0,
	"reward": 0.8511051416397095,
	"reward_std": 0.07431531846523284,
	"rewards/format_reward/mean": 1.0,
	"rewards/format_reward/std": 0.0,
	"rewards/qatch_metrics/mean": 0.8248295664787293,
	"rewards/qatch_metrics/std": 0.3192874014377594,
	"rewards/tag_count_reward/mean": 1.0,
	"rewards/tag_count_reward/std": 0.0,
	"step": 170
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 462.2,
	"completions/max_terminated_length": 462.2,
	"completions/mean_length": 219.3890625,
	"completions/mean_terminated_length": 219.3890625,
	"completions/min_length": 70.8,
	"completions/min_terminated_length": 70.8,
	"epoch": 0.617011899515205,
	"grad_norm": 0.15290022120008429,
	"kl": 0.01065216064453125,
	"learning_rate": 1e-06,
	"loss": 0.0047,
	"num_tokens": 29739969.0,
	"reward": 0.8426113128662109,
	"reward_std": 0.09004694148898125,
	"rewards/format_reward/mean": 1.0,
	"rewards/format_reward/std": 0.0,
	"rewards/qatch_metrics/mean": 0.814836847782135,
	"rewards/qatch_metrics/std": 0.309688937664032,
	"rewards/tag_count_reward/mean": 1.0,
	"rewards/tag_count_reward/std": 0.0,
	"step": 175
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 458.8,
	"completions/max_terminated_length": 458.8,
	"completions/mean_length": 211.655078125,
	"completions/mean_terminated_length": 211.655078125,
	"completions/min_length": 73.4,
	"completions/min_terminated_length": 73.4,
	"epoch": 0.6346408109299251,
	"grad_norm": 0.17923424569681315,
	"kl": 0.0114501953125,
	"learning_rate": 1e-06,
	"loss": 0.01,
	"num_tokens": 31191502.0,
	"reward": 0.8262084484100342,
	"reward_std": 0.08637549504637718,
	"rewards/format_reward/mean": 1.0,
	"rewards/format_reward/std": 0.0,
	"rewards/qatch_metrics/mean": 0.7955393195152283,
	"rewards/qatch_metrics/std": 0.3134327620267868,
	"rewards/tag_count_reward/mean": 1.0,
	"rewards/tag_count_reward/std": 0.0,
	"step": 180
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 499.2,
	"completions/max_terminated_length": 499.2,
	"completions/mean_length": 215.95546875,
	"completions/mean_terminated_length": 215.95546875,
	"completions/min_length": 78.0,
	"completions/min_terminated_length": 78.0,
	"epoch": 0.6522697223446452,
	"grad_norm": 0.1321015357675111,
	"kl": 0.012237548828125,
	"learning_rate": 1e-06,
	"loss": 0.0021,
	"num_tokens": 32694108.0,
	"reward": 0.7994898676872253,
	"reward_std": 0.08254800513386726,
	"rewards/format_reward/mean": 1.0,
	"rewards/format_reward/std": 0.0,
	"rewards/qatch_metrics/mean": 0.764105749130249,
	"rewards/qatch_metrics/std": 0.3532308578491211,
	"rewards/tag_count_reward/mean": 1.0,
	"rewards/tag_count_reward/std": 0.0,
	"step": 185
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 442.2,
	"completions/max_terminated_length": 442.2,
	"completions/mean_length": 209.90078125,
	"completions/mean_terminated_length": 209.90078125,
	"completions/min_length": 76.6,
	"completions/min_terminated_length": 76.6,
	"epoch": 0.6698986337593653,
	"grad_norm": 0.22256806005967145,
	"kl": 0.01057586669921875,
	"learning_rate": 1e-06,
	"loss": 0.0013,
	"num_tokens": 34144670.0,
	"reward": 0.7911163926124573,
	"reward_std": 0.06518566869199276,
	"rewards/format_reward/mean": 1.0,
	"rewards/format_reward/std": 0.0,
	"rewards/qatch_metrics/mean": 0.7542545795440674,
	"rewards/qatch_metrics/std": 0.35398219227790834,
	"rewards/tag_count_reward/mean": 1.0,
	"rewards/tag_count_reward/std": 0.0,
	"step": 190
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 476.2,
	"completions/max_terminated_length": 476.2,
	"completions/mean_length": 208.534765625,
	"completions/mean_terminated_length": 208.534765625,
	"completions/min_length": 77.8,
	"completions/min_terminated_length": 77.8,
	"epoch": 0.6875275451740855,
	"grad_norm": 0.17237028945675698,
	"kl": 0.0087860107421875,
	"learning_rate": 1e-06,
	"loss": 0.0069,
	"num_tokens": 35620023.0,
	"reward": 0.8418472170829773,
	"reward_std": 0.08243692219257355,
	"rewards/format_reward/mean": 1.0,
	"rewards/format_reward/std": 0.0,
	"rewards/qatch_metrics/mean": 0.8139379024505615,
	"rewards/qatch_metrics/std": 0.336453515291214,
	"rewards/tag_count_reward/mean": 1.0,
	"rewards/tag_count_reward/std": 0.0,
	"step": 195
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 514.6,
	"completions/max_terminated_length": 514.6,
	"completions/mean_length": 217.3328125,
	"completions/mean_terminated_length": 217.3328125,
	"completions/min_length": 90.4,
	"completions/min_terminated_length": 90.4,
	"epoch": 0.7051564565888057,
	"grad_norm": 0.19274445010407998,
	"kl": 0.009130859375,
	"learning_rate": 1e-06,
	"loss": 0.0053,
	"num_tokens": 37166635.0,
	"reward": 0.8295193314552307,
	"reward_std": 0.06927115023136139,
	"rewards/format_reward/mean": 1.0,
	"rewards/format_reward/std": 0.0,
	"rewards/qatch_metrics/mean": 0.7994345307350159,
	"rewards/qatch_metrics/std": 0.3011426508426666,
	"rewards/tag_count_reward/mean": 1.0,
	"rewards/tag_count_reward/std": 0.0,
	"step": 200
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 499.8,
	"completions/max_terminated_length": 499.8,
	"completions/mean_length": 212.651171875,
	"completions/mean_terminated_length": 212.651171875,
	"completions/min_length": 68.6,
	"completions/min_terminated_length": 68.6,
	"epoch": 0.7227853680035258,
	"grad_norm": 0.13990900967805797,
	"kl": 0.0087432861328125,
	"learning_rate": 1e-06,
	"loss": -0.0027,
	"num_tokens": 38617966.0,
	"reward": 0.8151894211769104,
	"reward_std": 0.07495353966951371,
	"rewards/format_reward/mean": 1.0,
	"rewards/format_reward/std": 0.0,
	"rewards/qatch_metrics/mean": 0.7825757980346679,
	"rewards/qatch_metrics/std": 0.33874245882034304,
	"rewards/tag_count_reward/mean": 1.0,
	"rewards/tag_count_reward/std": 0.0,
	"step": 205
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 560.4,
	"completions/max_terminated_length": 560.4,
	"completions/mean_length": 223.7015625,
	"completions/mean_terminated_length": 223.7015625,
	"completions/min_length": 74.6,
	"completions/min_terminated_length": 74.6,
	"epoch": 0.7404142794182459,
	"grad_norm": 0.20163985914598806,
	"kl": 0.00806884765625,
	"learning_rate": 1e-06,
	"loss": 0.0054,
	"num_tokens": 40092050.0,
	"reward": 0.8460610270500183,
	"reward_std": 0.05867695920169354,
	"rewards/format_reward/mean": 1.0,
	"rewards/format_reward/std": 0.0,
	"rewards/qatch_metrics/mean": 0.8188953161239624,
	"rewards/qatch_metrics/std": 0.3239317536354065,
	"rewards/tag_count_reward/mean": 1.0,
	"rewards/tag_count_reward/std": 0.0,
	"step": 210
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 486.6,
	"completions/max_terminated_length": 486.6,
	"completions/mean_length": 215.6828125,
	"completions/mean_terminated_length": 215.6828125,
	"completions/min_length": 82.2,
	"completions/min_terminated_length": 82.2,
	"epoch": 0.7580431908329661,
	"grad_norm": 0.17564998217230318,
	"kl": 0.009525299072265625,
	"learning_rate": 1e-06,
	"loss": -0.0034,
	"num_tokens": 41565542.0,
	"reward": 0.799136507511139,
	"reward_std": 0.06419738680124283,
	"rewards/format_reward/mean": 1.0,
	"rewards/format_reward/std": 0.0,
	"rewards/qatch_metrics/mean": 0.7636899828910828,
	"rewards/qatch_metrics/std": 0.3342160403728485,
	"rewards/tag_count_reward/mean": 1.0,
	"rewards/tag_count_reward/std": 0.0,
	"step": 215
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 503.4,
	"completions/max_terminated_length": 503.4,
	"completions/mean_length": 233.409765625,
	"completions/mean_terminated_length": 233.409765625,
	"completions/min_length": 91.0,
	"completions/min_terminated_length": 91.0,
	"epoch": 0.7756721022476862,
	"grad_norm": 0.19283324501226842,
	"kl": 0.009130096435546875,
	"learning_rate": 1e-06,
	"loss": 0.0109,
	"num_tokens": 43081919.0,
	"reward": 0.7851791024208069,
	"reward_std": 0.07570808604359627,
	"rewards/format_reward/mean": 1.0,
	"rewards/format_reward/std": 0.0,
	"rewards/qatch_metrics/mean": 0.7472695469856262,
	"rewards/qatch_metrics/std": 0.36822828054428103,
	"rewards/tag_count_reward/mean": 1.0,
	"rewards/tag_count_reward/std": 0.0,
	"step": 220
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 448.4,
	"completions/max_terminated_length": 448.4,
	"completions/mean_length": 224.728125,
	"completions/mean_terminated_length": 224.728125,
	"completions/min_length": 81.0,
	"completions/min_terminated_length": 81.0,
	"epoch": 0.7933010136624064,
	"grad_norm": 0.17754847688569442,
	"kl": 0.009470367431640625,
	"learning_rate": 1e-06,
	"loss": -0.002,
	"num_tokens": 44606439.0,
	"reward": 0.8152384400367737,
	"reward_std": 0.09764492362737656,
	"rewards/format_reward/mean": 0.999609375,
	"rewards/format_reward/std": 0.00883883461356163,
	"rewards/qatch_metrics/mean": 0.7826851725578308,
	"rewards/qatch_metrics/std": 0.3263732075691223,
	"rewards/tag_count_reward/mean": 0.99990234375,
	"rewards/tag_count_reward/std": 0.0022097086533904076,
	"step": 225
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 503.2,
	"completions/max_terminated_length": 503.2,
	"completions/mean_length": 218.58203125,
	"completions/mean_terminated_length": 218.58203125,
	"completions/min_length": 69.6,
	"completions/min_terminated_length": 69.6,
	"epoch": 0.8109299250771265,
	"grad_norm": 0.19017267970498908,
	"kl": 0.009508514404296875,
	"learning_rate": 1e-06,
	"loss": -0.0009,
	"num_tokens": 46095257.0,
	"reward": 0.8068280577659607,
	"reward_std": 0.0781441181898117,
	"rewards/format_reward/mean": 0.999609375,
	"rewards/format_reward/std": 0.00883883461356163,
	"rewards/qatch_metrics/mean": 0.7728020906448364,
	"rewards/qatch_metrics/std": 0.3386655867099762,
	"rewards/tag_count_reward/mean": 0.99970703125,
	"rewards/tag_count_reward/std": 0.006629125773906707,
	"step": 230
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 456.6,
	"completions/max_terminated_length": 456.6,
	"completions/mean_length": 204.84375,
	"completions/mean_terminated_length": 204.84375,
	"completions/min_length": 72.6,
	"completions/min_terminated_length": 72.6,
	"epoch": 0.8285588364918466,
	"grad_norm": 0.1678878918468119,
	"kl": 0.009729766845703125,
	"learning_rate": 1e-06,
	"loss": 0.0041,
	"num_tokens": 47519433.0,
	"reward": 0.8672606706619262,
	"reward_std": 0.0644603468477726,
	"rewards/format_reward/mean": 1.0,
	"rewards/format_reward/std": 0.0,
	"rewards/qatch_metrics/mean": 0.8438360691070557,
	"rewards/qatch_metrics/std": 0.2717843741178513,
	"rewards/tag_count_reward/mean": 1.0,
	"rewards/tag_count_reward/std": 0.0,
	"step": 235
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 526.2,
	"completions/max_terminated_length": 526.2,
	"completions/mean_length": 214.90625,
	"completions/mean_terminated_length": 214.90625,
	"completions/min_length": 66.8,
	"completions/min_terminated_length": 66.8,
	"epoch": 0.8461877479065668,
	"grad_norm": 0.18169011669761398,
	"kl": 0.01288604736328125,
	"learning_rate": 1e-06,
	"loss": 0.0035,
	"num_tokens": 48943993.0,
	"reward": 0.8558493018150329,
	"reward_std": 0.07027828097343444,
	"rewards/format_reward/mean": 1.0,
	"rewards/format_reward/std": 0.0,
	"rewards/qatch_metrics/mean": 0.8304109454154969,
	"rewards/qatch_metrics/std": 0.301141357421875,
	"rewards/tag_count_reward/mean": 1.0,
	"rewards/tag_count_reward/std": 0.0,
	"step": 240
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 472.6,
	"completions/max_terminated_length": 472.6,
	"completions/mean_length": 212.559765625,
	"completions/mean_terminated_length": 212.559765625,
	"completions/min_length": 76.8,
	"completions/min_terminated_length": 76.8,
	"epoch": 0.8638166593212869,
	"grad_norm": 0.2046340854229955,
	"kl": 0.01494140625,
	"learning_rate": 1e-06,
	"loss": 0.006,
	"num_tokens": 50416114.0,
	"reward": 0.831060528755188,
	"reward_std": 0.07754805404692888,
	"rewards/format_reward/mean": 1.0,
	"rewards/format_reward/std": 0.0,
	"rewards/qatch_metrics/mean": 0.8012476563453674,
	"rewards/qatch_metrics/std": 0.3293557226657867,
	"rewards/tag_count_reward/mean": 1.0,
	"rewards/tag_count_reward/std": 0.0,
	"step": 245
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 503.2,
	"completions/max_terminated_length": 503.2,
	"completions/mean_length": 222.1375,
	"completions/mean_terminated_length": 222.1375,
	"completions/min_length": 83.2,
	"completions/min_terminated_length": 83.2,
	"epoch": 0.881445570736007,
	"grad_norm": 0.15161264539796646,
	"kl": 0.0138031005859375,
	"learning_rate": 1e-06,
	"loss": 0.0018,
	"num_tokens": 51932274.0,
	"reward": 0.8422249555587769,
	"reward_std": 0.06234893724322319,
	"rewards/format_reward/mean": 1.0,
	"rewards/format_reward/std": 0.0,
	"rewards/qatch_metrics/mean": 0.8143823027610779,
	"rewards/qatch_metrics/std": 0.2993943512439728,
	"rewards/tag_count_reward/mean": 1.0,
	"rewards/tag_count_reward/std": 0.0,
	"step": 250
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 463.6,
	"completions/max_terminated_length": 463.6,
	"completions/mean_length": 231.19609375,
	"completions/mean_terminated_length": 231.19609375,
	"completions/min_length": 77.4,
	"completions/min_terminated_length": 77.4,
	"epoch": 0.8990744821507272,
	"grad_norm": 0.20035266636054513,
	"kl": 0.011871337890625,
	"learning_rate": 1e-06,
	"loss": -0.0003,
	"num_tokens": 53450248.0,
	"reward": 0.8096501588821411,
	"reward_std": 0.06698438860476016,
	"rewards/format_reward/mean": 1.0,
	"rewards/format_reward/std": 0.0,
	"rewards/qatch_metrics/mean": 0.7760589838027954,
	"rewards/qatch_metrics/std": 0.3199191153049469,
	"rewards/tag_count_reward/mean": 1.0,
	"rewards/tag_count_reward/std": 0.0,
	"step": 255
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 471.2,
	"completions/max_terminated_length": 471.2,
	"completions/mean_length": 237.80234375,
	"completions/mean_terminated_length": 237.80234375,
	"completions/min_length": 82.2,
	"completions/min_terminated_length": 82.2,
	"epoch": 0.9167033935654474,
	"grad_norm": 0.0856229450795828,
	"kl": 0.011614227294921875,
	"learning_rate": 1e-06,
	"loss": -0.0017,
	"num_tokens": 54970542.0,
	"reward": 0.8725608706474304,
	"reward_std": 0.051827043667435645,
	"rewards/format_reward/mean": 0.999609375,
	"rewards/format_reward/std": 0.00883883461356163,
	"rewards/qatch_metrics/mean": 0.8501232981681823,
	"rewards/qatch_metrics/std": 0.26386110931634904,
	"rewards/tag_count_reward/mean": 0.99990234375,
	"rewards/tag_count_reward/std": 0.0022097086533904076,
	"step": 260
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 476.6,
	"completions/max_terminated_length": 476.6,
	"completions/mean_length": 231.53671875,
	"completions/mean_terminated_length": 231.53671875,
	"completions/min_length": 79.0,
	"completions/min_terminated_length": 79.0,
	"epoch": 0.9343323049801675,
	"grad_norm": 0.17178453068271043,
	"kl": 0.010117340087890624,
	"learning_rate": 1e-06,
	"loss": 0.0063,
	"num_tokens": 56485356.0,
	"reward": 0.8532873392105103,
	"reward_std": 0.07009301483631133,
	"rewards/format_reward/mean": 0.999609375,
	"rewards/format_reward/std": 0.00883883461356163,
	"rewards/qatch_metrics/mean": 0.8274485826492309,
	"rewards/qatch_metrics/std": 0.31240676045417787,
	"rewards/tag_count_reward/mean": 0.99990234375,
	"rewards/tag_count_reward/std": 0.0022097086533904076,
	"step": 265
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 459.6,
	"completions/max_terminated_length": 459.6,
	"completions/mean_length": 220.95234375,
	"completions/mean_terminated_length": 220.95234375,
	"completions/min_length": 68.6,
	"completions/min_terminated_length": 68.6,
	"epoch": 0.9519612163948876,
	"grad_norm": 0.15364550208264494,
	"kl": 0.00984039306640625,
	"learning_rate": 1e-06,
	"loss": -0.0031,
	"num_tokens": 57953010.0,
	"reward": 0.868242597579956,
	"reward_std": 0.06916632130742073,
	"rewards/format_reward/mean": 1.0,
	"rewards/format_reward/std": 0.0,
	"rewards/qatch_metrics/mean": 0.8449912905693054,
	"rewards/qatch_metrics/std": 0.2899660974740982,
	"rewards/tag_count_reward/mean": 1.0,
	"rewards/tag_count_reward/std": 0.0,
	"step": 270
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 423.2,
	"completions/max_terminated_length": 423.2,
	"completions/mean_length": 225.621875,
	"completions/mean_terminated_length": 225.621875,
	"completions/min_length": 88.8,
	"completions/min_terminated_length": 88.8,
	"epoch": 0.48479506390480387,
	"grad_norm": 0.17697767584196022,
	"kl": 0.00970916748046875,
	"learning_rate": 1e-06,
	"loss": 0.0059,
	"num_tokens": 58736110.0,
	"reward": 0.8460039258003235,
	"reward_std": 0.055821475386619565,
	"rewards/format_reward/mean": 1.0,
	"rewards/format_reward/std": 0.0,
	"rewards/qatch_metrics/mean": 0.8188281297683716,
	"rewards/qatch_metrics/std": 0.30660555958747865,
	"rewards/tag_count_reward/mean": 1.0,
	"rewards/tag_count_reward/std": 0.0,
	"step": 275
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 504.0,
	"completions/max_terminated_length": 504.0,
	"completions/mean_length": 223.92734375,
	"completions/mean_terminated_length": 223.92734375,
	"completions/min_length": 77.8,
	"completions/min_terminated_length": 77.8,
	"epoch": 0.4936095196121639,
	"grad_norm": 0.2692630701899735,
	"kl": 0.0131378173828125,
	"learning_rate": 1e-06,
	"loss": 0.0021,
	"num_tokens": 59498897.0,
	"reward": 0.7988754034042358,
	"reward_std": 0.08376505076885224,
	"rewards/format_reward/mean": 1.0,
	"rewards/format_reward/std": 0.0,
	"rewards/qatch_metrics/mean": 0.7633828163146973,
	"rewards/qatch_metrics/std": 0.3335907101631165,
	"rewards/tag_count_reward/mean": 1.0,
	"rewards/tag_count_reward/std": 0.0,
	"step": 280
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 434.8,
	"completions/max_terminated_length": 434.8,
	"completions/mean_length": 223.48125,
	"completions/mean_terminated_length": 223.48125,
	"completions/min_length": 83.2,
	"completions/min_terminated_length": 83.2,
	"epoch": 0.502423975319524,
	"grad_norm": 0.2666009697829767,
	"kl": 0.0107269287109375,
	"learning_rate": 1e-06,
	"loss": 0.0007,
	"num_tokens": 60277897.0,
	"reward": 0.7720089554786682,
	"reward_std": 0.0594131164252758,
	"rewards/format_reward/mean": 1.0,
	"rewards/format_reward/std": 0.0,
	"rewards/qatch_metrics/mean": 0.7317867279052734,
	"rewards/qatch_metrics/std": 0.33845625519752504,
	"rewards/tag_count_reward/mean": 0.9998046875,
	"rewards/tag_count_reward/std": 0.003125,
	"step": 285
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 481.0,
	"completions/max_terminated_length": 481.0,
	"completions/mean_length": 219.62421875,
	"completions/mean_terminated_length": 219.62421875,
	"completions/min_length": 91.0,
	"completions/min_terminated_length": 91.0,
	"epoch": 0.511238431026884,
	"grad_norm": 0.16876063412105669,
	"kl": 0.01141357421875,
	"learning_rate": 1e-06,
	"loss": 0.0033,
	"num_tokens": 61033560.0,
	"reward": 0.7902166962623596,
	"reward_std": 0.0687429528683424,
	"rewards/format_reward/mean": 1.0,
	"rewards/format_reward/std": 0.0,
	"rewards/qatch_metrics/mean": 0.7531961083412171,
	"rewards/qatch_metrics/std": 0.37054654359817507,
	"rewards/tag_count_reward/mean": 1.0,
	"rewards/tag_count_reward/std": 0.0,
	"step": 290
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 471.2,
	"completions/max_terminated_length": 471.2,
	"completions/mean_length": 226.275,
	"completions/mean_terminated_length": 226.275,
	"completions/min_length": 80.6,
	"completions/min_terminated_length": 80.6,
	"epoch": 0.5200528867342442,
	"grad_norm": 0.26818466602074054,
	"kl": 0.0130706787109375,
	"learning_rate": 1e-06,
	"loss": 0.0004,
	"num_tokens": 61786008.0,
	"reward": 0.7699209451675415,
	"reward_std": 0.07550354823470115,
	"rewards/format_reward/mean": 0.99921875,
	"rewards/format_reward/std": 0.0125,
	"rewards/qatch_metrics/mean": 0.7294221520423889,
	"rewards/qatch_metrics/std": 0.3492735385894775,
	"rewards/tag_count_reward/mean": 0.9998046875,
	"rewards/tag_count_reward/std": 0.003125,
	"step": 295
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 484.6,
	"completions/max_terminated_length": 484.6,
	"completions/mean_length": 247.2328125,
	"completions/mean_terminated_length": 247.2328125,
	"completions/min_length": 95.8,
	"completions/min_terminated_length": 95.8,
	"epoch": 0.5288673424416043,
	"grad_norm": 0.16485515882678206,
	"kl": 0.0113006591796875,
	"learning_rate": 1e-06,
	"loss": 0.0007,
	"num_tokens": 62590434.0,
	"reward": 0.8454334974288941,
	"reward_std": 0.0570029616355896,
	"rewards/format_reward/mean": 1.0,
	"rewards/format_reward/std": 0.0,
	"rewards/qatch_metrics/mean": 0.8181570172309875,
	"rewards/qatch_metrics/std": 0.2992805689573288,
	"rewards/tag_count_reward/mean": 1.0,
	"rewards/tag_count_reward/std": 0.0,
	"step": 300
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 480.6,
	"completions/max_terminated_length": 480.6,
	"completions/mean_length": 235.16015625,
	"completions/mean_terminated_length": 235.16015625,
	"completions/min_length": 88.0,
	"completions/min_terminated_length": 88.0,
	"epoch": 0.5376817981489643,
	"grad_norm": 0.27561378534620606,
	"kl": 0.01141510009765625,
	"learning_rate": 1e-06,
	"loss": 0.0097,
	"num_tokens": 63366287.0,
	"reward": 0.8380108118057251,
	"reward_std": 0.07530387155711651,
	"rewards/format_reward/mean": 1.0,
	"rewards/format_reward/std": 0.0,
	"rewards/qatch_metrics/mean": 0.8094244718551635,
	"rewards/qatch_metrics/std": 0.30977231860160825,
	"rewards/tag_count_reward/mean": 1.0,
	"rewards/tag_count_reward/std": 0.0,
	"step": 305
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 458.6,
	"completions/max_terminated_length": 458.6,
	"completions/mean_length": 215.646875,
	"completions/mean_terminated_length": 215.646875,
	"completions/min_length": 79.4,
	"completions/min_terminated_length": 79.4,
	"epoch": 0.5464962538563244,
	"grad_norm": 0.2018916915779266,
	"kl": 0.013714599609375,
	"learning_rate": 1e-06,
	"loss": -0.0045,
	"num_tokens": 64097387.0,
	"reward": 0.8135073184967041,
	"reward_std": 0.05950811579823494,
	"rewards/format_reward/mean": 1.0,
	"rewards/format_reward/std": 0.0,
	"rewards/qatch_metrics/mean": 0.7806198120117187,
	"rewards/qatch_metrics/std": 0.33523867428302767,
	"rewards/tag_count_reward/mean": 0.999609375,
	"rewards/tag_count_reward/std": 0.00625,
	"step": 310
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 447.4,
	"completions/max_terminated_length": 447.4,
	"completions/mean_length": 223.68515625,
	"completions/mean_terminated_length": 223.68515625,
	"completions/min_length": 92.4,
	"completions/min_terminated_length": 92.4,
	"epoch": 0.5553107095636844,
	"grad_norm": 0.1836962735356692,
	"kl": 0.0138214111328125,
	"learning_rate": 1e-06,
	"loss": -0.0024,
	"num_tokens": 64869416.0,
	"reward": 0.8333834052085877,
	"reward_std": 0.07006162852048874,
	"rewards/format_reward/mean": 1.0,
	"rewards/format_reward/std": 0.0,
	"rewards/qatch_metrics/mean": 0.8039804816246032,
	"rewards/qatch_metrics/std": 0.3219245493412018,
	"rewards/tag_count_reward/mean": 1.0,
	"rewards/tag_count_reward/std": 0.0,
	"step": 315
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 504.8,
	"completions/max_terminated_length": 504.8,
	"completions/mean_length": 221.45390625,
	"completions/mean_terminated_length": 221.45390625,
	"completions/min_length": 80.0,
	"completions/min_terminated_length": 80.0,
	"epoch": 0.5641251652710445,
	"grad_norm": 0.23250178423343035,
	"kl": 0.01497802734375,
	"learning_rate": 1e-06,
	"loss": -0.0014,
	"num_tokens": 65613165.0,
	"reward": 0.8320096850395202,
	"reward_std": 0.053499556705355646,
	"rewards/format_reward/mean": 1.0,
	"rewards/format_reward/std": 0.0,
	"rewards/qatch_metrics/mean": 0.8023643255233764,
	"rewards/qatch_metrics/std": 0.3343039393424988,
	"rewards/tag_count_reward/mean": 1.0,
	"rewards/tag_count_reward/std": 0.0,
	"step": 320
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 466.6,
	"completions/max_terminated_length": 466.6,
	"completions/mean_length": 220.3984375,
	"completions/mean_terminated_length": 220.3984375,
	"completions/min_length": 72.6,
	"completions/min_terminated_length": 72.6,
	"epoch": 0.5729396209784046,
	"grad_norm": 0.09740281424559781,
	"kl": 0.0155609130859375,
	"learning_rate": 1e-06,
	"loss": -0.0012,
	"num_tokens": 66336475.0,
	"reward": 0.8796087980270386,
	"reward_std": 0.05236431676894426,
	"rewards/format_reward/mean": 1.0,
	"rewards/format_reward/std": 0.0,
	"rewards/qatch_metrics/mean": 0.8583632946014405,
	"rewards/qatch_metrics/std": 0.2817832052707672,
	"rewards/tag_count_reward/mean": 1.0,
	"rewards/tag_count_reward/std": 0.0,
	"step": 325
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 459.4,
	"completions/max_terminated_length": 459.4,
	"completions/mean_length": 225.27890625,
	"completions/mean_terminated_length": 225.27890625,
	"completions/min_length": 78.6,
	"completions/min_terminated_length": 78.6,
	"epoch": 0.5817540766857646,
	"grad_norm": 0.08354955287926201,
	"kl": 0.01513671875,
	"learning_rate": 1e-06,
	"loss": 0.0095,
	"num_tokens": 67098736.0,
	"reward": 0.8658102512359619,
	"reward_std": 0.07466748803853988,
	"rewards/format_reward/mean": 1.0,
	"rewards/format_reward/std": 0.0,
	"rewards/qatch_metrics/mean": 0.8421296954154969,
	"rewards/qatch_metrics/std": 0.2614422976970673,
	"rewards/tag_count_reward/mean": 1.0,
	"rewards/tag_count_reward/std": 0.0,
	"step": 330
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 447.2,
	"completions/max_terminated_length": 447.2,
	"completions/mean_length": 220.01875,
	"completions/mean_terminated_length": 220.01875,
	"completions/min_length": 81.8,
	"completions/min_terminated_length": 81.8,
	"epoch": 0.5905685323931247,
	"grad_norm": 0.20574209747901576,
	"kl": 0.015081787109375,
	"learning_rate": 1e-06,
	"loss": 0.011,
	"num_tokens": 67847928.0,
	"reward": 0.865822184085846,
	"reward_std": 0.046268445625901225,
	"rewards/format_reward/mean": 1.0,
	"rewards/format_reward/std": 0.0,
	"rewards/qatch_metrics/mean": 0.8421437621116639,
	"rewards/qatch_metrics/std": 0.29589260220527647,
	"rewards/tag_count_reward/mean": 1.0,
	"rewards/tag_count_reward/std": 0.0,
	"step": 335
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 432.0,
	"completions/max_terminated_length": 432.0,
	"completions/mean_length": 213.5953125,
	"completions/mean_terminated_length": 213.5953125,
	"completions/min_length": 76.0,
	"completions/min_terminated_length": 76.0,
	"epoch": 0.5993829881004848,
	"grad_norm": 0.2039975034177896,
	"kl": 0.0161651611328125,
	"learning_rate": 1e-06,
	"loss": 0.0066,
	"num_tokens": 68585234.0,
	"reward": 0.8343551635742188,
	"reward_std": 0.0688902921974659,
	"rewards/format_reward/mean": 1.0,
	"rewards/format_reward/std": 0.0,
	"rewards/qatch_metrics/mean": 0.8051237106323242,
	"rewards/qatch_metrics/std": 0.30847290754318235,
	"rewards/tag_count_reward/mean": 1.0,
	"rewards/tag_count_reward/std": 0.0,
	"step": 340
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 435.0,
	"completions/max_terminated_length": 435.0,
	"completions/mean_length": 203.69453125,
	"completions/mean_terminated_length": 203.69453125,
	"completions/min_length": 76.0,
	"completions/min_terminated_length": 76.0,
	"epoch": 0.6081974438078449,
	"grad_norm": 0.26848084439203446,
	"kl": 0.014788818359375,
	"learning_rate": 1e-06,
	"loss": 0.0023,
	"num_tokens": 69338379.0,
	"reward": 0.8848124146461487,
	"reward_std": 0.06373886093497276,
	"rewards/format_reward/mean": 1.0,
	"rewards/format_reward/std": 0.0,
	"rewards/qatch_metrics/mean": 0.8644851684570313,
	"rewards/qatch_metrics/std": 0.26705425381660464,
	"rewards/tag_count_reward/mean": 1.0,
	"rewards/tag_count_reward/std": 0.0,
	"step": 345
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 446.8,
	"completions/max_terminated_length": 446.8,
	"completions/mean_length": 221.1046875,
	"completions/mean_terminated_length": 221.1046875,
	"completions/min_length": 79.4,
	"completions/min_terminated_length": 79.4,
	"epoch": 0.617011899515205,
	"grad_norm": 0.2363792510293019,
	"kl": 0.019024658203125,
	"learning_rate": 1e-06,
	"loss": -0.002,
	"num_tokens": 70095473.0,
	"reward": 0.8130708336830139,
	"reward_std": 0.08477363213896752,
	"rewards/format_reward/mean": 1.0,
	"rewards/format_reward/std": 0.0,
	"rewards/qatch_metrics/mean": 0.7800833344459533,
	"rewards/qatch_metrics/std": 0.3211198329925537,
	"rewards/tag_count_reward/mean": 1.0,
	"rewards/tag_count_reward/std": 0.0,
	"step": 350
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 458.4,
	"completions/max_terminated_length": 458.4,
	"completions/mean_length": 234.00078125,
	"completions/mean_terminated_length": 234.00078125,
	"completions/min_length": 91.4,
	"completions/min_terminated_length": 91.4,
	"epoch": 0.625826355222565,
	"grad_norm": 0.1856420640121193,
	"kl": 0.019122314453125,
	"learning_rate": 1e-06,
	"loss": -0.0036,
	"num_tokens": 70860290.0,
	"reward": 0.8471660256385803,
	"reward_std": 0.0506692998111248,
	"rewards/format_reward/mean": 1.0,
	"rewards/format_reward/std": 0.0,
	"rewards/qatch_metrics/mean": 0.8201953172683716,
	"rewards/qatch_metrics/std": 0.30663308799266814,
	"rewards/tag_count_reward/mean": 1.0,
	"rewards/tag_count_reward/std": 0.0,
	"step": 355
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 452.8,
	"completions/max_terminated_length": 452.8,
	"completions/mean_length": 241.72421875,
	"completions/mean_terminated_length": 241.72421875,
	"completions/min_length": 88.0,
	"completions/min_terminated_length": 88.0,
	"epoch": 0.6346408109299251,
	"grad_norm": 0.22939974521057024,
	"kl": 0.01826171875,
	"learning_rate": 1e-06,
	"loss": 0.0127,
	"num_tokens": 71648401.0,
	"reward": 0.8702264785766601,
	"reward_std": 0.0592925101518631,
	"rewards/format_reward/mean": 1.0,
	"rewards/format_reward/std": 0.0,
	"rewards/qatch_metrics/mean": 0.8473252534866333,
	"rewards/qatch_metrics/std": 0.28537269234657286,
	"rewards/tag_count_reward/mean": 1.0,
	"rewards/tag_count_reward/std": 0.0,
	"step": 360
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 434.6,
	"completions/max_terminated_length": 434.6,
	"completions/mean_length": 215.853125,
	"completions/mean_terminated_length": 215.853125,
	"completions/min_length": 75.0,
	"completions/min_terminated_length": 75.0,
	"epoch": 0.6434552666372851,
	"grad_norm": 0.19883621919511643,
	"kl": 0.0163330078125,
	"learning_rate": 1e-06,
	"loss": 0.0073,
	"num_tokens": 72382693.0,
	"reward": 0.8091506719589233,
	"reward_std": 0.0635421834886074,
	"rewards/format_reward/mean": 1.0,
	"rewards/format_reward/std": 0.0,
	"rewards/qatch_metrics/mean": 0.7754713773727417,
	"rewards/qatch_metrics/std": 0.3179103255271912,
	"rewards/tag_count_reward/mean": 1.0,
	"rewards/tag_count_reward/std": 0.0,
	"step": 365
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 496.8,
	"completions/max_terminated_length": 496.8,
	"completions/mean_length": 213.1640625,
	"completions/mean_terminated_length": 213.1640625,
	"completions/min_length": 76.2,
	"completions/min_terminated_length": 76.2,
	"epoch": 0.6522697223446452,
	"grad_norm": 0.1916457590662772,
	"kl": 0.0175506591796875,
	"learning_rate": 1e-06,
	"loss": -0.0029,
	"num_tokens": 73161111.0,
	"reward": 0.8094798445701599,
	"reward_std": 0.04875086285173893,
	"rewards/format_reward/mean": 1.0,
	"rewards/format_reward/std": 0.0,
	"rewards/qatch_metrics/mean": 0.7758586168289184,
	"rewards/qatch_metrics/std": 0.32606661319732666,
	"rewards/tag_count_reward/mean": 1.0,
	"rewards/tag_count_reward/std": 0.0,
	"step": 370
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 480.4,
	"completions/max_terminated_length": 480.4,
	"completions/mean_length": 222.75,
	"completions/mean_terminated_length": 222.75,
	"completions/min_length": 72.6,
	"completions/min_terminated_length": 72.6,
	"epoch": 0.6610841780520053,
	"grad_norm": 0.15787517122504152,
	"kl": 0.0181884765625,
	"learning_rate": 1e-06,
	"loss": 0.0018,
	"num_tokens": 73905591.0,
	"reward": 0.89048171043396,
	"reward_std": 0.04932568361982703,
	"rewards/format_reward/mean": 1.0,
	"rewards/format_reward/std": 0.0,
	"rewards/qatch_metrics/mean": 0.8711549639701843,
	"rewards/qatch_metrics/std": 0.2736783862113953,
	"rewards/tag_count_reward/mean": 1.0,
	"rewards/tag_count_reward/std": 0.0,
	"step": 375
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 561.4,
	"completions/max_terminated_length": 561.4,
	"completions/mean_length": 234.4390625,
	"completions/mean_terminated_length": 234.4390625,
	"completions/min_length": 75.2,
	"completions/min_terminated_length": 75.2,
	"epoch": 0.6698986337593653,
	"grad_norm": 0.2653930596733297,
	"kl": 0.0174713134765625,
	"learning_rate": 1e-06,
	"loss": 0.003,
	"num_tokens": 74679801.0,
	"reward": 0.8243065714836121,
	"reward_std": 0.06958894729614258,
	"rewards/format_reward/mean": 1.0,
	"rewards/format_reward/std": 0.0,
	"rewards/qatch_metrics/mean": 0.7933018207550049,
	"rewards/qatch_metrics/std": 0.3086866676807404,
	"rewards/tag_count_reward/mean": 1.0,
	"rewards/tag_count_reward/std": 0.0,
	"step": 380
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 511.8,
	"completions/max_terminated_length": 511.8,
	"completions/mean_length": 243.73359375,
	"completions/mean_terminated_length": 243.73359375,
	"completions/min_length": 81.6,
	"completions/min_terminated_length": 81.6,
	"epoch": 0.6787130894667255,
	"grad_norm": 0.20233916054675122,
	"kl": 0.014093017578125,
	"learning_rate": 1e-06,
	"loss": 0.0048,
	"num_tokens": 75445892.0,
	"reward": 0.8653998494148254,
	"reward_std": 0.07132081612944603,
	"rewards/format_reward/mean": 1.0,
	"rewards/format_reward/std": 0.0,
	"rewards/qatch_metrics/mean": 0.8416468739509583,
	"rewards/qatch_metrics/std": 0.3147186517715454,
	"rewards/tag_count_reward/mean": 1.0,
	"rewards/tag_count_reward/std": 0.0,
	"step": 385
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 447.4,
	"completions/max_terminated_length": 447.4,
	"completions/mean_length": 228.43671875,
	"completions/mean_terminated_length": 228.43671875,
	"completions/min_length": 79.8,
	"completions/min_terminated_length": 79.8,
	"epoch": 0.6875275451740855,
	"grad_norm": 0.29996778931865303,
	"kl": 0.0146087646484375,
	"learning_rate": 1e-06,
	"loss": -0.0016,
	"num_tokens": 76229251.0,
	"reward": 0.8502862334251404,
	"reward_std": 0.07314281612634659,
	"rewards/format_reward/mean": 1.0,
	"rewards/format_reward/std": 0.0,
	"rewards/qatch_metrics/mean": 0.8238661646842956,
	"rewards/qatch_metrics/std": 0.3113024443387985,
	"rewards/tag_count_reward/mean": 1.0,
	"rewards/tag_count_reward/std": 0.0,
	"step": 390
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 487.0,
	"completions/max_terminated_length": 487.0,
	"completions/mean_length": 249.8734375,
	"completions/mean_terminated_length": 249.8734375,
	"completions/min_length": 84.4,
	"completions/min_terminated_length": 84.4,
	"epoch": 0.6963420008814456,
	"grad_norm": 0.2150032953896288,
	"kl": 0.017156982421875,
	"learning_rate": 1e-06,
	"loss": -0.0029,
	"num_tokens": 77021793.0,
	"reward": 0.8494030237197876,
	"reward_std": 0.05776047557592392,
	"rewards/format_reward/mean": 1.0,
	"rewards/format_reward/std": 0.0,
	"rewards/qatch_metrics/mean": 0.8228270888328553,
	"rewards/qatch_metrics/std": 0.3020846724510193,
	"rewards/tag_count_reward/mean": 1.0,
	"rewards/tag_count_reward/std": 0.0,
	"step": 395
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 471.4,
	"completions/max_terminated_length": 471.4,
	"completions/mean_length": 247.9828125,
	"completions/mean_terminated_length": 247.9828125,
	"completions/min_length": 84.4,
	"completions/min_terminated_length": 84.4,
	"epoch": 0.7051564565888057,
	"grad_norm": 0.2754041387856829,
	"kl": 0.0148590087890625,
	"learning_rate": 1e-06,
	"loss": 0.0065,
	"num_tokens": 77833451.0,
	"reward": 0.8363431453704834,
	"reward_std": 0.06054745838046074,
	"rewards/format_reward/mean": 1.0,
	"rewards/format_reward/std": 0.0,
	"rewards/qatch_metrics/mean": 0.807462501525879,
	"rewards/qatch_metrics/std": 0.29668720066547394,
	"rewards/tag_count_reward/mean": 1.0,
	"rewards/tag_count_reward/std": 0.0,
	"step": 400
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 442.6,
	"completions/max_terminated_length": 442.6,
	"completions/mean_length": 225.4296875,
	"completions/mean_terminated_length": 225.4296875,
	"completions/min_length": 83.0,
	"completions/min_terminated_length": 83.0,
	"epoch": 0.7139709122961657,
	"grad_norm": 0.22420011771594078,
	"kl": 0.017706298828125,
	"learning_rate": 1e-06,
	"loss": 0.0019,
	"num_tokens": 78585793.0,
	"reward": 0.8382049560546875,
	"reward_std": 0.05150428526103497,
	"rewards/format_reward/mean": 1.0,
	"rewards/format_reward/std": 0.0,
	"rewards/qatch_metrics/mean": 0.8096528768539428,
	"rewards/qatch_metrics/std": 0.2925006330013275,
	"rewards/tag_count_reward/mean": 1.0,
	"rewards/tag_count_reward/std": 0.0,
	"step": 405
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 618.4,
	"completions/max_terminated_length": 618.4,
	"completions/mean_length": 219.3421875,
	"completions/mean_terminated_length": 219.3421875,
	"completions/min_length": 84.6,
	"completions/min_terminated_length": 84.6,
	"epoch": 0.7227853680035258,
	"grad_norm": 0.0986589707089894,
	"kl": 0.0170196533203125,
	"learning_rate": 1e-06,
	"loss": 0.0022,
	"num_tokens": 79352135.0,
	"reward": 0.8465274453163147,
	"reward_std": 0.05231629386544227,
	"rewards/format_reward/mean": 1.0,
	"rewards/format_reward/std": 0.0,
	"rewards/qatch_metrics/mean": 0.8194440126419067,
	"rewards/qatch_metrics/std": 0.3004340440034866,
	"rewards/tag_count_reward/mean": 1.0,
	"rewards/tag_count_reward/std": 0.0,
	"step": 410
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 470.6,
	"completions/max_terminated_length": 470.6,
	"completions/mean_length": 213.9921875,
	"completions/mean_terminated_length": 213.9921875,
	"completions/min_length": 83.6,
	"completions/min_terminated_length": 83.6,
	"epoch": 0.7315998237108858,
	"grad_norm": 0.17969166348358623,
	"kl": 0.01600341796875,
	"learning_rate": 1e-06,
	"loss": 0.0196,
	"num_tokens": 80093021.0,
	"reward": 0.7899853944778442,
	"reward_std": 0.06183199286460876,
	"rewards/format_reward/mean": 1.0,
	"rewards/format_reward/std": 0.0,
	"rewards/qatch_metrics/mean": 0.7529239773750305,
	"rewards/qatch_metrics/std": 0.32831716537475586,
	"rewards/tag_count_reward/mean": 1.0,
	"rewards/tag_count_reward/std": 0.0,
	"step": 415
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 458.4,
	"completions/max_terminated_length": 458.4,
	"completions/mean_length": 208.25390625,
	"completions/mean_terminated_length": 208.25390625,
	"completions/min_length": 72.6,
	"completions/min_terminated_length": 72.6,
	"epoch": 0.7404142794182459,
	"grad_norm": 0.12360613268228073,
	"kl": 0.0170166015625,
	"learning_rate": 1e-06,
	"loss": -0.0011,
	"num_tokens": 80810658.0,
	"reward": 0.8781363725662231,
	"reward_std": 0.04314489997923374,
	"rewards/format_reward/mean": 1.0,
	"rewards/format_reward/std": 0.0,
	"rewards/qatch_metrics/mean": 0.8566309928894043,
	"rewards/qatch_metrics/std": 0.2832080274820328,
	"rewards/tag_count_reward/mean": 1.0,
	"rewards/tag_count_reward/std": 0.0,
	"step": 420
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 447.2,
	"completions/max_terminated_length": 447.2,
	"completions/mean_length": 203.98046875,
	"completions/mean_terminated_length": 203.98046875,
	"completions/min_length": 79.2,
	"completions/min_terminated_length": 79.2,
	"epoch": 0.749228735125606,
	"grad_norm": 0.210810313322166,
	"kl": 0.0164581298828125,
	"learning_rate": 1e-06,
	"loss": 0.003,
	"num_tokens": 81548361.0,
	"reward": 0.8270991563796997,
	"reward_std": 0.06941422820091248,
	"rewards/format_reward/mean": 1.0,
	"rewards/format_reward/std": 0.0,
	"rewards/qatch_metrics/mean": 0.7965872406959533,
	"rewards/qatch_metrics/std": 0.33117216229438784,
	"rewards/tag_count_reward/mean": 1.0,
	"rewards/tag_count_reward/std": 0.0,
	"step": 425
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 434.6,
	"completions/max_terminated_length": 434.6,
	"completions/mean_length": 220.75703125,
	"completions/mean_terminated_length": 220.75703125,
	"completions/min_length": 80.4,
	"completions/min_terminated_length": 80.4,
	"epoch": 0.7580431908329661,
	"grad_norm": 0.21910688267881026,
	"kl": 0.016754150390625,
	"learning_rate": 1e-06,
	"loss": -0.0012,
	"num_tokens": 82290706.0,
	"reward": 0.8464880228042603,
	"reward_std": 0.04884184449911118,
	"rewards/format_reward/mean": 1.0,
	"rewards/format_reward/std": 0.0,
	"rewards/qatch_metrics/mean": 0.8193976640701294,
	"rewards/qatch_metrics/std": 0.28375020921230315,
	"rewards/tag_count_reward/mean": 1.0,
	"rewards/tag_count_reward/std": 0.0,
	"step": 430
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 451.8,
	"completions/max_terminated_length": 451.8,
	"completions/mean_length": 223.1234375,
	"completions/mean_terminated_length": 223.1234375,
	"completions/min_length": 85.0,
	"completions/min_terminated_length": 85.0,
	"epoch": 0.7668576465403262,
	"grad_norm": 0.26253720274856984,
	"kl": 0.0178009033203125,
	"learning_rate": 1e-06,
	"loss": -0.0023,
	"num_tokens": 83056976.0,
	"reward": 0.8096219301223755,
	"reward_std": 0.07494284212589264,
	"rewards/format_reward/mean": 1.0,
	"rewards/format_reward/std": 0.0,
	"rewards/qatch_metrics/mean": 0.7760257959365845,
	"rewards/qatch_metrics/std": 0.3492628037929535,
	"rewards/tag_count_reward/mean": 1.0,
	"rewards/tag_count_reward/std": 0.0,
	"step": 435
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 501.2,
	"completions/max_terminated_length": 501.2,
	"completions/mean_length": 216.840625,
	"completions/mean_terminated_length": 216.840625,
	"completions/min_length": 88.8,
	"completions/min_terminated_length": 88.8,
	"epoch": 0.7756721022476862,
	"grad_norm": 0.27647079947407377,
	"kl": 0.0181732177734375,
	"learning_rate": 1e-06,
	"loss": 0.0012,
	"num_tokens": 83805044.0,
	"reward": 0.7776495218276978,
	"reward_std": 0.056884029135108,
	"rewards/format_reward/mean": 1.0,
	"rewards/format_reward/std": 0.0,
	"rewards/qatch_metrics/mean": 0.7384112119674683,
	"rewards/qatch_metrics/std": 0.3683965981006622,
	"rewards/tag_count_reward/mean": 1.0,
	"rewards/tag_count_reward/std": 0.0,
	"step": 440
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 455.0,
	"completions/max_terminated_length": 455.0,
	"completions/mean_length": 216.46015625,
	"completions/mean_terminated_length": 216.46015625,
	"completions/min_length": 78.2,
	"completions/min_terminated_length": 78.2,
	"epoch": 0.7844865579550463,
	"grad_norm": 0.20996305667402082,
	"kl": 0.0163116455078125,
	"learning_rate": 1e-06,
	"loss": 0.0063,
	"num_tokens": 84571313.0,
	"reward": 0.8477118849754334,
	"reward_std": 0.06959039457142353,
	"rewards/format_reward/mean": 1.0,
	"rewards/format_reward/std": 0.0,
	"rewards/qatch_metrics/mean": 0.8208375215530396,
	"rewards/qatch_metrics/std": 0.30095059871673585,
	"rewards/tag_count_reward/mean": 1.0,
	"rewards/tag_count_reward/std": 0.0,
	"step": 445
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 443.4,
	"completions/max_terminated_length": 443.4,
	"completions/mean_length": 211.00234375,
	"completions/mean_terminated_length": 211.00234375,
	"completions/min_length": 86.2,
	"completions/min_terminated_length": 86.2,
	"epoch": 0.7933010136624064,
	"grad_norm": 0.15662206787116065,
	"kl": 0.0160797119140625,
	"learning_rate": 1e-06,
	"loss": 0.0004,
	"num_tokens": 85319188.0,
	"reward": 0.8328658938407898,
	"reward_std": 0.05801869332790375,
	"rewards/format_reward/mean": 1.0,
	"rewards/format_reward/std": 0.0,
	"rewards/qatch_metrics/mean": 0.8033716320991516,
	"rewards/qatch_metrics/std": 0.3037038058042526,
	"rewards/tag_count_reward/mean": 1.0,
	"rewards/tag_count_reward/std": 0.0,
	"step": 450
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 475.2,
	"completions/max_terminated_length": 475.2,
	"completions/mean_length": 222.7078125,
	"completions/mean_terminated_length": 222.7078125,
	"completions/min_length": 80.8,
	"completions/min_terminated_length": 80.8,
	"epoch": 0.8021154693697664,
	"grad_norm": 0.19919629119501958,
	"kl": 0.01639404296875,
	"learning_rate": 1e-06,
	"loss": 0.001,
	"num_tokens": 86091774.0,
	"reward": 0.8358211517333984,
	"reward_std": 0.0607087716460228,
	"rewards/format_reward/mean": 1.0,
	"rewards/format_reward/std": 0.0,
	"rewards/qatch_metrics/mean": 0.8068713665008544,
	"rewards/qatch_metrics/std": 0.30334635376930236,
	"rewards/tag_count_reward/mean": 0.999609375,
	"rewards/tag_count_reward/std": 0.00625,
	"step": 455
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 418.4,
	"completions/max_terminated_length": 418.4,
	"completions/mean_length": 221.0015625,
	"completions/mean_terminated_length": 221.0015625,
	"completions/min_length": 80.4,
	"completions/min_terminated_length": 80.4,
	"epoch": 0.8109299250771265,
	"grad_norm": 0.1419366062228353,
	"kl": 0.01617431640625,
	"learning_rate": 1e-06,
	"loss": 0.0031,
	"num_tokens": 86848528.0,
	"reward": 0.8028954148292542,
	"reward_std": 0.06934207193553447,
	"rewards/format_reward/mean": 1.0,
	"rewards/format_reward/std": 0.0,
	"rewards/qatch_metrics/mean": 0.7681122660636902,
	"rewards/qatch_metrics/std": 0.3390295565128326,
	"rewards/tag_count_reward/mean": 1.0,
	"rewards/tag_count_reward/std": 0.0,
	"step": 460
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 426.6,
	"completions/max_terminated_length": 426.6,
	"completions/mean_length": 210.6140625,
	"completions/mean_terminated_length": 210.6140625,
	"completions/min_length": 85.0,
	"completions/min_terminated_length": 85.0,
	"epoch": 0.8197443807844865,
	"grad_norm": 0.16116384181364513,
	"kl": 0.0162078857421875,
	"learning_rate": 1e-06,
	"loss": 0.013,
	"num_tokens": 87564482.0,
	"reward": 0.8424649000167846,
	"reward_std": 0.040234316140413284,
	"rewards/format_reward/mean": 1.0,
	"rewards/format_reward/std": 0.0,
	"rewards/qatch_metrics/mean": 0.8146645903587342,
	"rewards/qatch_metrics/std": 0.2840981811285019,
	"rewards/tag_count_reward/mean": 1.0,
	"rewards/tag_count_reward/std": 0.0,
	"step": 465
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 447.6,
	"completions/max_terminated_length": 447.6,
	"completions/mean_length": 195.89140625,
	"completions/mean_terminated_length": 195.89140625,
	"completions/min_length": 80.8,
	"completions/min_terminated_length": 80.8,
	"epoch": 0.8285588364918466,
	"grad_norm": 0.21075371504226795,
	"kl": 0.0193115234375,
	"learning_rate": 1e-06,
	"loss": 0.001,
	"num_tokens": 88255159.0,
	"reward": 0.8565711379051208,
	"reward_std": 0.06344871073961258,
	"rewards/format_reward/mean": 1.0,
	"rewards/format_reward/std": 0.0,
	"rewards/qatch_metrics/mean": 0.8312601566314697,
	"rewards/qatch_metrics/std": 0.3075568675994873,
	"rewards/tag_count_reward/mean": 1.0,
	"rewards/tag_count_reward/std": 0.0,
	"step": 470
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 445.4,
	"completions/max_terminated_length": 445.4,
	"completions/mean_length": 201.01484375,
	"completions/mean_terminated_length": 201.01484375,
	"completions/min_length": 85.0,
	"completions/min_terminated_length": 85.0,
	"epoch": 0.8373732921992068,
	"grad_norm": 0.27204162033836665,
	"kl": 0.019976806640625,
	"learning_rate": 1e-06,
	"loss": -0.0009,
	"num_tokens": 88945690.0,
	"reward": 0.8785177230834961,
	"reward_std": 0.06470721438527108,
	"rewards/format_reward/mean": 1.0,
	"rewards/format_reward/std": 0.0,
	"rewards/qatch_metrics/mean": 0.8570796966552734,
	"rewards/qatch_metrics/std": 0.2825317859649658,
	"rewards/tag_count_reward/mean": 1.0,
	"rewards/tag_count_reward/std": 0.0,
	"step": 475
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 497.4,
	"completions/max_terminated_length": 497.4,
	"completions/mean_length": 221.90859375,
	"completions/mean_terminated_length": 221.90859375,
	"completions/min_length": 81.2,
	"completions/min_terminated_length": 81.2,
	"epoch": 0.8461877479065668,
	"grad_norm": 0.19323853705899263,
	"kl": 0.0183746337890625,
	"learning_rate": 1e-06,
	"loss": 0.0041,
	"num_tokens": 89712373.0,
	"reward": 0.8555493712425232,
	"reward_std": 0.06230065375566483,
	"rewards/format_reward/mean": 1.0,
	"rewards/format_reward/std": 0.0,
	"rewards/qatch_metrics/mean": 0.8300580739974975,
	"rewards/qatch_metrics/std": 0.28706649839878084,
	"rewards/tag_count_reward/mean": 1.0,
	"rewards/tag_count_reward/std": 0.0,
	"step": 480
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 498.8,
	"completions/max_terminated_length": 498.8,
	"completions/mean_length": 228.7890625,
	"completions/mean_terminated_length": 228.7890625,
	"completions/min_length": 88.4,
	"completions/min_terminated_length": 88.4,
	"epoch": 0.8550022036139269,
	"grad_norm": 0.24770714763886528,
	"kl": 0.0176513671875,
	"learning_rate": 1e-06,
	"loss": 0.004,
	"num_tokens": 90520071.0,
	"reward": 0.8527018785476684,
	"reward_std": 0.062195781618356705,
	"rewards/format_reward/mean": 1.0,
	"rewards/format_reward/std": 0.0,
	"rewards/qatch_metrics/mean": 0.8267080783843994,
	"rewards/qatch_metrics/std": 0.2996180385351181,
	"rewards/tag_count_reward/mean": 1.0,
	"rewards/tag_count_reward/std": 0.0,
	"step": 485
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 480.6,
	"completions/max_terminated_length": 480.6,
	"completions/mean_length": 227.66875,
	"completions/mean_terminated_length": 227.66875,
	"completions/min_length": 83.2,
	"completions/min_terminated_length": 83.2,
	"epoch": 0.8638166593212869,
	"grad_norm": 0.16162980170931898,
	"kl": 0.0188812255859375,
	"learning_rate": 1e-06,
	"loss": 0.0006,
	"num_tokens": 91278479.0,
	"reward": 0.8309607028961181,
	"reward_std": 0.0656251635402441,
	"rewards/format_reward/mean": 1.0,
	"rewards/format_reward/std": 0.0,
	"rewards/qatch_metrics/mean": 0.8011302351951599,
	"rewards/qatch_metrics/std": 0.31802850365638735,
	"rewards/tag_count_reward/mean": 1.0,
	"rewards/tag_count_reward/std": 0.0,
	"step": 490
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 502.4,
	"completions/max_terminated_length": 502.4,
	"completions/mean_length": 225.3171875,
	"completions/mean_terminated_length": 225.3171875,
	"completions/min_length": 80.4,
	"completions/min_terminated_length": 80.4,
	"epoch": 0.872631115028647,
	"grad_norm": 0.1886973597841831,
	"kl": 0.01859130859375,
	"learning_rate": 1e-06,
	"loss": 0.0052,
	"num_tokens": 92033173.0,
	"reward": 0.8441248655319213,
	"reward_std": 0.043570340052247046,
	"rewards/format_reward/mean": 1.0,
	"rewards/format_reward/std": 0.0,
	"rewards/qatch_metrics/mean": 0.8166174769401551,
	"rewards/qatch_metrics/std": 0.30278873145580293,
	"rewards/tag_count_reward/mean": 1.0,
	"rewards/tag_count_reward/std": 0.0,
	"step": 495
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 443.6,
	"completions/max_terminated_length": 443.6,
	"completions/mean_length": 234.475,
	"completions/mean_terminated_length": 234.475,
	"completions/min_length": 99.8,
	"completions/min_terminated_length": 99.8,
	"epoch": 0.881445570736007,
	"grad_norm": 0.24444756963754977,
	"kl": 0.01798095703125,
	"learning_rate": 1e-06,
	"loss": 0.007,
	"num_tokens": 92808293.0,
	"reward": 0.8517020106315613,
	"reward_std": 0.06295906975865365,
	"rewards/format_reward/mean": 1.0,
	"rewards/format_reward/std": 0.0,
	"rewards/qatch_metrics/mean": 0.825531804561615,
	"rewards/qatch_metrics/std": 0.3100520223379135,
	"rewards/tag_count_reward/mean": 1.0,
	"rewards/tag_count_reward/std": 0.0,
	"step": 500
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 440.0,
	"completions/max_terminated_length": 440.0,
	"completions/mean_length": 215.0328125,
	"completions/mean_terminated_length": 215.0328125,
	"completions/min_length": 84.2,
	"completions/min_terminated_length": 84.2,
	"epoch": 0.8902600264433671,
	"grad_norm": 0.21103775626066984,
	"kl": 0.0171600341796875,
	"learning_rate": 1e-06,
	"loss": 0.0051,
	"num_tokens": 93563327.0,
	"reward": 0.8682243466377259,
	"reward_std": 0.04365142099559307,
	"rewards/format_reward/mean": 1.0,
	"rewards/format_reward/std": 0.0,
	"rewards/qatch_metrics/mean": 0.8449697852134704,
	"rewards/qatch_metrics/std": 0.2696381151676178,
	"rewards/tag_count_reward/mean": 1.0,
	"rewards/tag_count_reward/std": 0.0,
	"step": 505
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 463.4,
	"completions/max_terminated_length": 463.4,
	"completions/mean_length": 218.59453125,
	"completions/mean_terminated_length": 218.59453125,
	"completions/min_length": 77.0,
	"completions/min_terminated_length": 77.0,
	"epoch": 0.8990744821507272,
	"grad_norm": 0.20107359914643413,
	"kl": 0.016455078125,
	"learning_rate": 1e-06,
	"loss": 0.0086,
	"num_tokens": 94333288.0,
	"reward": 0.8064153909683227,
	"reward_std": 0.06192653328180313,
	"rewards/format_reward/mean": 1.0,
	"rewards/format_reward/std": 0.0,
	"rewards/qatch_metrics/mean": 0.772253406047821,
	"rewards/qatch_metrics/std": 0.3227865040302277,
	"rewards/tag_count_reward/mean": 1.0,
	"rewards/tag_count_reward/std": 0.0,
	"step": 510
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 443.0,
	"completions/max_terminated_length": 443.0,
	"completions/mean_length": 206.5703125,
	"completions/mean_terminated_length": 206.5703125,
	"completions/min_length": 73.0,
	"completions/min_terminated_length": 73.0,
	"epoch": 0.9078889378580872,
	"grad_norm": 0.10741725097461949,
	"kl": 0.0163330078125,
	"learning_rate": 1e-06,
	"loss": 0.0056,
	"num_tokens": 95051890.0,
	"reward": 0.8839513182640075,
	"reward_std": 0.04564618114382028,
	"rewards/format_reward/mean": 1.0,
	"rewards/format_reward/std": 0.0,
	"rewards/qatch_metrics/mean": 0.8634721517562867,
	"rewards/qatch_metrics/std": 0.24794530421495437,
	"rewards/tag_count_reward/mean": 1.0,
	"rewards/tag_count_reward/std": 0.0,
	"step": 515
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 420.2,
	"completions/max_terminated_length": 420.2,
	"completions/mean_length": 193.584375,
	"completions/mean_terminated_length": 193.584375,
	"completions/min_length": 74.8,
	"completions/min_terminated_length": 74.8,
	"epoch": 0.9167033935654474,
	"grad_norm": 0.3417922303720187,
	"kl": 0.0196563720703125,
	"learning_rate": 1e-06,
	"loss": 0.0033,
	"num_tokens": 95755150.0,
	"reward": 0.8428452134132385,
	"reward_std": 0.05727057494223118,
	"rewards/format_reward/mean": 0.99921875,
	"rewards/format_reward/std": 0.0125,
	"rewards/qatch_metrics/mean": 0.8152039051055908,
	"rewards/qatch_metrics/std": 0.31376497745513915,
	"rewards/tag_count_reward/mean": 1.0,
	"rewards/tag_count_reward/std": 0.0,
	"step": 520
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 404.8,
	"completions/max_terminated_length": 404.8,
	"completions/mean_length": 208.72890625,
	"completions/mean_terminated_length": 208.72890625,
	"completions/min_length": 72.2,
	"completions/min_terminated_length": 72.2,
	"epoch": 0.9255178492728074,
	"grad_norm": 0.17161657062686406,
	"kl": 0.0185943603515625,
	"learning_rate": 1e-06,
	"loss": -0.0023,
	"num_tokens": 96514835.0,
	"reward": 0.8597602009773254,
	"reward_std": 0.044371549785137174,
	"rewards/format_reward/mean": 1.0,
	"rewards/format_reward/std": 0.0,
	"rewards/qatch_metrics/mean": 0.8350119948387146,
	"rewards/qatch_metrics/std": 0.295586758852005,
	"rewards/tag_count_reward/mean": 1.0,
	"rewards/tag_count_reward/std": 0.0,
	"step": 525
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 426.8,
	"completions/max_terminated_length": 426.8,
	"completions/mean_length": 212.95859375,
	"completions/mean_terminated_length": 212.95859375,
	"completions/min_length": 77.0,
	"completions/min_terminated_length": 77.0,
	"epoch": 0.9343323049801675,
	"grad_norm": 0.22162383692372334,
	"kl": 0.0186981201171875,
	"learning_rate": 1e-06,
	"loss": -0.002,
	"num_tokens": 97270782.0,
	"reward": 0.8363440155982971,
	"reward_std": 0.06691965609788894,
	"rewards/format_reward/mean": 1.0,
	"rewards/format_reward/std": 0.0,
	"rewards/qatch_metrics/mean": 0.8074635624885559,
	"rewards/qatch_metrics/std": 0.3064163327217102,
	"rewards/tag_count_reward/mean": 1.0,
	"rewards/tag_count_reward/std": 0.0,
	"step": 530
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 454.6,
	"completions/max_terminated_length": 454.6,
	"completions/mean_length": 233.40234375,
	"completions/mean_terminated_length": 233.40234375,
	"completions/min_length": 76.4,
	"completions/min_terminated_length": 76.4,
	"epoch": 0.9431467606875276,
	"grad_norm": 0.1434511776519399,
	"kl": 0.019879150390625,
	"learning_rate": 1e-06,
	"loss": 0.0023,
	"num_tokens": 98016705.0,
	"reward": 0.8363542199134827,
	"reward_std": 0.05200971700251102,
	"rewards/format_reward/mean": 1.0,
	"rewards/format_reward/std": 0.0,
	"rewards/qatch_metrics/mean": 0.8074755430221557,
	"rewards/qatch_metrics/std": 0.2885085940361023,
	"rewards/tag_count_reward/mean": 1.0,
	"rewards/tag_count_reward/std": 0.0,
	"step": 535
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 444.6,
	"completions/max_terminated_length": 444.6,
	"completions/mean_length": 235.70625,
	"completions/mean_terminated_length": 235.70625,
	"completions/min_length": 80.6,
	"completions/min_terminated_length": 80.6,
	"epoch": 0.9519612163948876,
	"grad_norm": 0.09221258199209693,
	"kl": 0.018701171875,
	"learning_rate": 1e-06,
	"loss": 0.0038,
	"num_tokens": 98787193.0,
	"reward": 0.8677037119865417,
	"reward_std": 0.057669999450445174,
	"rewards/format_reward/mean": 1.0,
	"rewards/format_reward/std": 0.0,
	"rewards/qatch_metrics/mean": 0.8443572998046875,
	"rewards/qatch_metrics/std": 0.288933590054512,
	"rewards/tag_count_reward/mean": 1.0,
	"rewards/tag_count_reward/std": 0.0,
	"step": 540
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 455.2,
	"completions/max_terminated_length": 455.2,
	"completions/mean_length": 222.11875,
	"completions/mean_terminated_length": 222.11875,
	"completions/min_length": 74.6,
	"completions/min_terminated_length": 74.6,
	"epoch": 0.9607756721022477,
	"grad_norm": 0.1352237905149159,
	"kl": 0.018145751953125,
	"learning_rate": 1e-06,
	"loss": -0.0031,
	"num_tokens": 99532081.0,
	"reward": 0.8805891752243042,
	"reward_std": 0.05483146589249373,
	"rewards/format_reward/mean": 1.0,
	"rewards/format_reward/std": 0.0,
	"rewards/qatch_metrics/mean": 0.8595166802406311,
	"rewards/qatch_metrics/std": 0.25585181415081026,
	"rewards/tag_count_reward/mean": 1.0,
	"rewards/tag_count_reward/std": 0.0,
	"step": 545
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 472.6,
	"completions/max_terminated_length": 472.6,
	"completions/mean_length": 218.659375,
	"completions/mean_terminated_length": 218.659375,
	"completions/min_length": 86.2,
	"completions/min_terminated_length": 86.2,
	"epoch": 0.9695901278096077,
	"grad_norm": 0.16904630982662794,
	"kl": 0.01783447265625,
	"learning_rate": 1e-06,
	"loss": 0.0016,
	"num_tokens": 100246573.0,
	"reward": 0.8569401383399964,
	"reward_std": 0.07272802218794823,
	"rewards/format_reward/mean": 1.0,
	"rewards/format_reward/std": 0.0,
	"rewards/qatch_metrics/mean": 0.8316942691802979,
	"rewards/qatch_metrics/std": 0.3041912466287613,
	"rewards/tag_count_reward/mean": 1.0,
	"rewards/tag_count_reward/std": 0.0,
	"step": 550
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 459.8,
	"completions/max_terminated_length": 459.8,
	"completions/mean_length": 221.78984375,
	"completions/mean_terminated_length": 221.78984375,
	"completions/min_length": 77.8,
	"completions/min_terminated_length": 77.8,
	"epoch": 0.9784045835169678,
	"grad_norm": 0.31854687165087076,
	"kl": 0.0183258056640625,
	"learning_rate": 1e-06,
	"loss": -0.0058,
	"num_tokens": 100996640.0,
	"reward": 0.8102917551994324,
	"reward_std": 0.07570969834923744,
	"rewards/format_reward/mean": 1.0,
	"rewards/format_reward/std": 0.0,
	"rewards/qatch_metrics/mean": 0.7768138289451599,
	"rewards/qatch_metrics/std": 0.34436498284339906,
	"rewards/tag_count_reward/mean": 1.0,
	"rewards/tag_count_reward/std": 0.0,
	"step": 555
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 490.6,
	"completions/max_terminated_length": 490.6,
	"completions/mean_length": 230.28828125,
	"completions/mean_terminated_length": 230.28828125,
	"completions/min_length": 85.0,
	"completions/min_terminated_length": 85.0,
	"epoch": 0.9872190392243279,
	"grad_norm": 0.16545798735816303,
	"kl": 0.01719970703125,
	"learning_rate": 1e-06,
	"loss": 0.0054,
	"num_tokens": 101777473.0,
	"reward": 0.854366683959961,
	"reward_std": 0.050544672086834906,
	"rewards/format_reward/mean": 1.0,
	"rewards/format_reward/std": 0.0,
	"rewards/qatch_metrics/mean": 0.8286666750907898,
	"rewards/qatch_metrics/std": 0.3027670204639435,
	"rewards/tag_count_reward/mean": 1.0,
	"rewards/tag_count_reward/std": 0.0,
	"step": 560
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 460.0,
	"completions/max_terminated_length": 460.0,
	"completions/mean_length": 232.278125,
	"completions/mean_terminated_length": 232.278125,
	"completions/min_length": 79.2,
	"completions/min_terminated_length": 79.2,
	"epoch": 0.996033494931688,
	"grad_norm": 0.2064718967348405,
	"kl": 0.020306396484375,
	"learning_rate": 1e-06,
	"loss": -0.0052,
	"num_tokens": 102547669.0,
	"reward": 0.7918175339698792,
	"reward_std": 0.05684706475585699,
	"rewards/format_reward/mean": 1.0,
	"rewards/format_reward/std": 0.0,
	"rewards/qatch_metrics/mean": 0.755079448223114,
	"rewards/qatch_metrics/std": 0.3250477254390717,
	"rewards/tag_count_reward/mean": 1.0,
	"rewards/tag_count_reward/std": 0.0,
	"step": 565
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 468.5,
	"completions/max_terminated_length": 468.5,
	"completions/mean_length": 214.265625,
	"completions/mean_terminated_length": 214.265625,
	"completions/min_length": 66.0,
	"completions/min_terminated_length": 66.0,
	"epoch": 0.999559277214632,
	"kl": 0.01806640625,
	"num_tokens": 102823629.0,
	"reward": 0.8797399699687958,
	"reward_std": 0.056224397383630276,
	"rewards/format_reward/mean": 1.0,
	"rewards/format_reward/std": 0.0,
	"rewards/qatch_metrics/mean": 0.858517587184906,
	"rewards/qatch_metrics/std": 0.26497258245944977,
	"rewards/tag_count_reward/mean": 1.0,
	"rewards/tag_count_reward/std": 0.0,
	"step": 567,
	"total_flos": 0.0,
	"train_loss": -1.6490349831877564e-05,
	"train_runtime": 5804.9117,
	"train_samples_per_second": 1.564,
	"train_steps_per_second": 0.098
	}
	],
	"logging_steps": 5,
	"max_steps": 567,
	"num_input_tokens_seen": 102823629,
	"num_train_epochs": 1,
	"save_steps": 5,
	"stateful_callbacks": {
	"TrainerControl": {
	"args": {
	"should_epoch_stop": false,
	"should_evaluate": false,
	"should_log": false,
	"should_save": true,
	"should_training_stop": true
	},
	"attributes": {}
	}
	},
	"total_flos": 0.0,
	"train_batch_size": 16,
	"trial_name": null,
	"trial_params": null
	}