diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,3303 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.999559277214632, + "eval_steps": 500, + "global_step": 567, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "clip_ratio": 0.0, + "clipped_completions_ratio": 0.0078125, + "epoch": 0.0017628911414720142, + "grad_norm": 3.0840760424820304, + "kl": 0.0, + "learning_rate": 1.7543859649122805e-08, + "loss": -0.0042, + "max_completion_length": 464.0, + "max_terminated_completion_length": 459.75, + "mean_completion_length": 120.04296875, + "mean_terminated_completion_length": 118.23617553710938, + "min_completion_length": 21.0, + "min_terminated_completion_length": 21.0, + "num_tokens": 115211.0, + "reward": 0.25845247507095337, + "reward_std": 0.24694325402379036, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/qatch_metrics/mean": 0.2897005267441273, + "rewards/qatch_metrics/std": 0.37457581236958504, + "rewards/tag_count_reward/mean": 0.244140625, + "rewards/tag_count_reward/std": 0.13581550493836403, + "step": 1 + }, + { + "clip_ratio": 0.0, + "clipped_completions_ratio": 0.0107421875, + "epoch": 0.00881445570736007, + "grad_norm": 2.803414301509709, + "kl": 0.00023472309112548828, + "learning_rate": 8.771929824561403e-08, + "loss": 0.0299, + "max_completion_length": 2148.625, + "max_terminated_completion_length": 560.25, + "mean_completion_length": 164.86328125, + "mean_terminated_completion_length": 122.24907398223877, + "min_completion_length": 23.3125, + "min_terminated_completion_length": 23.3125, + "num_tokens": 658751.0, + "reward": 0.14076079020742327, + "reward_std": 0.17357240640558302, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/qatch_metrics/mean": 0.15179980779066682, + "rewards/qatch_metrics/std": 0.285268085077405, + "rewards/tag_count_reward/mean": 0.234619140625, + "rewards/tag_count_reward/std": 0.11686475621536374, + "step": 5 + }, + { + "clip_ratio": 0.0, + "clipped_completions_ratio": 0.01015625, + "epoch": 0.01762891141472014, + "grad_norm": 2.555865364836083, + "kl": 0.00034499168395996094, + "learning_rate": 1.7543859649122805e-07, + "loss": 0.0691, + "max_completion_length": 1847.5, + "max_terminated_completion_length": 736.65, + "mean_completion_length": 154.3125, + "mean_terminated_completion_length": 130.3932632446289, + "min_completion_length": 21.3, + "min_terminated_completion_length": 21.3, + "num_tokens": 1347167.0, + "reward": 0.12399922087788581, + "reward_std": 0.16416746266186238, + "rewards/format_reward/mean": 0.00078125, + "rewards/format_reward/std": 0.00625, + "rewards/qatch_metrics/mean": 0.13194531546905636, + "rewards/qatch_metrics/std": 0.2816271550953388, + "rewards/tag_count_reward/mean": 0.2353515625, + "rewards/tag_count_reward/std": 0.11751417592167854, + "step": 10 + }, + { + "clip_ratio": 0.0, + "clipped_completions_ratio": 0.01328125, + "epoch": 0.026443367122080213, + "grad_norm": 1.8510134641989897, + "kl": 0.00035467147827148435, + "learning_rate": 2.631578947368421e-07, + "loss": 0.1161, + "max_completion_length": 1736.25, + "max_terminated_completion_length": 725.25, + "mean_completion_length": 166.71015625, + "mean_terminated_completion_length": 125.76015815734863, + "min_completion_length": 22.4, + "min_terminated_completion_length": 22.4, + "num_tokens": 1999516.0, + "reward": 0.13328830637037753, + "reward_std": 0.18279453851282595, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/qatch_metrics/mean": 0.1429885433986783, + "rewards/qatch_metrics/std": 0.2840505912899971, + "rewards/tag_count_reward/mean": 0.2349609375, + "rewards/tag_count_reward/std": 0.12176873050630092, + "step": 15 + }, + { + "clip_ratio": 0.0, + "clipped_completions_ratio": 0.0109375, + "epoch": 0.03525782282944028, + "grad_norm": 2.5276083322305163, + "kl": 0.0008690834045410156, + "learning_rate": 3.508771929824561e-07, + "loss": 0.0909, + "max_completion_length": 1950.25, + "max_terminated_completion_length": 694.35, + "mean_completion_length": 157.1859375, + "mean_terminated_completion_length": 122.76734085083008, + "min_completion_length": 22.95, + "min_terminated_completion_length": 22.95, + "num_tokens": 2662298.0, + "reward": 0.19042691607028245, + "reward_std": 0.19785099737346173, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/qatch_metrics/mean": 0.20991172092035412, + "rewards/qatch_metrics/std": 0.32454456612467764, + "rewards/tag_count_reward/mean": 0.2400390625, + "rewards/tag_count_reward/std": 0.11891286894679069, + "step": 20 + }, + { + "clip_ratio": 0.0, + "clipped_completions_ratio": 0.00390625, + "epoch": 0.044072278536800354, + "grad_norm": 1.6421641387454096, + "kl": 0.002597618103027344, + "learning_rate": 4.3859649122807013e-07, + "loss": 0.0598, + "max_completion_length": 1248.55, + "max_terminated_completion_length": 525.75, + "mean_completion_length": 117.29296875, + "mean_terminated_completion_length": 104.7642993927002, + "min_completion_length": 21.5, + "min_terminated_completion_length": 21.5, + "num_tokens": 3274689.0, + "reward": 0.19326679892838, + "reward_std": 0.19503218345344067, + "rewards/format_reward/mean": 0.00234375, + "rewards/format_reward/std": 0.01875, + "rewards/qatch_metrics/mean": 0.21241406723856926, + "rewards/qatch_metrics/std": 0.3405905418097973, + "rewards/tag_count_reward/mean": 0.249609375, + "rewards/tag_count_reward/std": 0.11380729898810386, + "step": 25 + }, + { + "clip_ratio": 0.0, + "clipped_completions_ratio": 0.00546875, + "epoch": 0.052886734244160426, + "grad_norm": 2.327163237963173, + "kl": 0.004604721069335937, + "learning_rate": 5.263157894736842e-07, + "loss": -0.0155, + "max_completion_length": 1373.3, + "max_terminated_completion_length": 461.75, + "mean_completion_length": 126.6109375, + "mean_terminated_completion_length": 110.83255004882812, + "min_completion_length": 21.45, + "min_terminated_completion_length": 21.45, + "num_tokens": 3903935.0, + "reward": 0.235529076308012, + "reward_std": 0.19208679497241973, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/qatch_metrics/mean": 0.26164037082344294, + "rewards/qatch_metrics/std": 0.35552939809858797, + "rewards/tag_count_reward/mean": 0.2626953125, + "rewards/tag_count_reward/std": 0.12861518152058124, + "step": 30 + }, + { + "clip_ratio": 0.0, + "clipped_completions_ratio": 0.003125, + "epoch": 0.06170118995152049, + "grad_norm": 1.9030107685120632, + "kl": 0.006869125366210938, + "learning_rate": 6.140350877192982e-07, + "loss": -0.0291, + "max_completion_length": 637.45, + "max_terminated_completion_length": 475.05, + "mean_completion_length": 110.1609375, + "mean_terminated_completion_length": 106.66331939697265, + "min_completion_length": 19.1, + "min_terminated_completion_length": 19.1, + "num_tokens": 4489885.0, + "reward": 0.23229851759970188, + "reward_std": 0.21139583457261324, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/qatch_metrics/mean": 0.2574031319469213, + "rewards/qatch_metrics/std": 0.3460362754762173, + "rewards/tag_count_reward/mean": 0.2701171875, + "rewards/tag_count_reward/std": 0.12943687103688717, + "step": 35 + }, + { + "clip_ratio": 0.0, + "clipped_completions_ratio": 0.0015625, + "epoch": 0.07051564565888056, + "grad_norm": 1.3133674580676309, + "kl": 0.006020355224609375, + "learning_rate": 7.017543859649122e-07, + "loss": -0.0528, + "max_completion_length": 738.35, + "max_terminated_completion_length": 558.2, + "mean_completion_length": 145.91328125, + "mean_terminated_completion_length": 142.51878776550294, + "min_completion_length": 21.1, + "min_terminated_completion_length": 21.1, + "num_tokens": 5151902.0, + "reward": 0.22980495262891054, + "reward_std": 0.18255550526082515, + "rewards/format_reward/mean": 0.00078125, + "rewards/format_reward/std": 0.00625, + "rewards/qatch_metrics/mean": 0.25320573393255474, + "rewards/qatch_metrics/std": 0.3738796763122082, + "rewards/tag_count_reward/mean": 0.2900390625, + "rewards/tag_count_reward/std": 0.15516266897320746, + "step": 40 + }, + { + "clip_ratio": 0.0, + "clipped_completions_ratio": 0.0015625, + "epoch": 0.07933010136624064, + "grad_norm": 1.065001099510075, + "kl": 0.005857086181640625, + "learning_rate": 7.894736842105263e-07, + "loss": 0.0403, + "max_completion_length": 674.15, + "max_terminated_completion_length": 669.1, + "mean_completion_length": 172.17578125, + "mean_terminated_completion_length": 171.6076644897461, + "min_completion_length": 20.2, + "min_terminated_completion_length": 20.2, + "num_tokens": 5870847.0, + "reward": 0.23959124982357025, + "reward_std": 0.21271923929452896, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/qatch_metrics/mean": 0.26344375535845754, + "rewards/qatch_metrics/std": 0.36381270438432695, + "rewards/tag_count_reward/mean": 0.31328125, + "rewards/tag_count_reward/std": 0.16481443196535112, + "step": 45 + }, + { + "clip_ratio": 0.0, + "clipped_completions_ratio": 0.00390625, + "epoch": 0.08814455707360071, + "grad_norm": 1.4568973490349373, + "kl": 0.005655670166015625, + "learning_rate": 8.771929824561403e-07, + "loss": -0.0555, + "max_completion_length": 1026.65, + "max_terminated_completion_length": 500.6, + "mean_completion_length": 171.38203125, + "mean_terminated_completion_length": 158.90636672973633, + "min_completion_length": 20.45, + "min_terminated_completion_length": 20.45, + "num_tokens": 6568024.0, + "reward": 0.22149190343916417, + "reward_std": 0.21185415983200073, + "rewards/format_reward/mean": 0.00390625, + "rewards/format_reward/std": 0.027518405020236968, + "rewards/qatch_metrics/mean": 0.24082917235791684, + "rewards/qatch_metrics/std": 0.36300159245729446, + "rewards/tag_count_reward/mean": 0.3279296875, + "rewards/tag_count_reward/std": 0.16671581640839578, + "step": 50 + }, + { + "clip_ratio": 0.0, + "clipped_completions_ratio": 0.00390625, + "epoch": 0.09695901278096078, + "grad_norm": 1.0074202050705572, + "kl": 0.007281494140625, + "learning_rate": 9.649122807017545e-07, + "loss": 0.0364, + "max_completion_length": 1240.75, + "max_terminated_completion_length": 517.85, + "mean_completion_length": 194.9171875, + "mean_terminated_completion_length": 182.60486450195313, + "min_completion_length": 23.85, + "min_terminated_completion_length": 23.85, + "num_tokens": 7285134.0, + "reward": 0.2235423892736435, + "reward_std": 0.21293668523430825, + "rewards/format_reward/mean": 0.00859375, + "rewards/format_reward/std": 0.053823620080947876, + "rewards/qatch_metrics/mean": 0.24047266095876693, + "rewards/qatch_metrics/std": 0.35300029441714287, + "rewards/tag_count_reward/mean": 0.365625, + "rewards/tag_count_reward/std": 0.17709428519010545, + "step": 55 + }, + { + "clip_ratio": 0.0, + "clipped_completions_ratio": 0.00625, + "epoch": 0.10577346848832085, + "grad_norm": 0.9807961478644736, + "kl": 0.01302337646484375, + "learning_rate": 1e-06, + "loss": -0.0139, + "max_completion_length": 1103.85, + "max_terminated_completion_length": 560.8, + "mean_completion_length": 248.34296875, + "mean_terminated_completion_length": 238.09479522705078, + "min_completion_length": 38.0, + "min_terminated_completion_length": 38.0, + "num_tokens": 8039813.0, + "reward": 0.23491878062486649, + "reward_std": 0.21809776537120343, + "rewards/format_reward/mean": 0.01953125, + "rewards/format_reward/std": 0.11232657507061958, + "rewards/qatch_metrics/mean": 0.24687135666608812, + "rewards/qatch_metrics/std": 0.3717411242425442, + "rewards/tag_count_reward/mean": 0.4625, + "rewards/tag_count_reward/std": 0.2135901317000389, + "step": 60 + }, + { + "clip_ratio": 0.0, + "clipped_completions_ratio": 0.00234375, + "epoch": 0.11458792419568092, + "grad_norm": 0.8603200985963705, + "kl": 0.01912841796875, + "learning_rate": 1e-06, + "loss": 0.0333, + "max_completion_length": 497.9, + "max_terminated_completion_length": 495.0, + "mean_completion_length": 230.2484375, + "mean_terminated_completion_length": 229.92258987426757, + "min_completion_length": 42.5, + "min_terminated_completion_length": 42.5, + "num_tokens": 8806307.0, + "reward": 0.24188727661967277, + "reward_std": 0.216167426854372, + "rewards/format_reward/mean": 0.0640625, + "rewards/format_reward/std": 0.22365741804242134, + "rewards/qatch_metrics/mean": 0.24616562593728303, + "rewards/qatch_metrics/std": 0.3404053032398224, + "rewards/tag_count_reward/mean": 0.5248046875, + "rewards/tag_count_reward/std": 0.23647152334451677, + "step": 65 + }, + { + "clip_ratio": 0.0, + "clipped_completions_ratio": 0.0046875, + "epoch": 0.12340237990304098, + "grad_norm": 0.9335750466856336, + "kl": 0.02579345703125, + "learning_rate": 1e-06, + "loss": 0.0811, + "max_completion_length": 1368.45, + "max_terminated_completion_length": 485.4, + "mean_completion_length": 222.75390625, + "mean_terminated_completion_length": 207.36048126220703, + "min_completion_length": 34.7, + "min_terminated_completion_length": 34.7, + "num_tokens": 9576424.0, + "reward": 0.24611930586397648, + "reward_std": 0.22903760597109796, + "rewards/format_reward/mean": 0.134375, + "rewards/format_reward/std": 0.3337091006338596, + "rewards/qatch_metrics/mean": 0.24012656770646573, + "rewards/qatch_metrics/std": 0.33230473324656484, + "rewards/tag_count_reward/mean": 0.571484375, + "rewards/tag_count_reward/std": 0.26808963865041735, + "step": 70 + }, + { + "clip_ratio": 0.0, + "clipped_completions_ratio": 0.00234375, + "epoch": 0.13221683561040107, + "grad_norm": 1.2284202473881727, + "kl": 0.0276123046875, + "learning_rate": 1e-06, + "loss": 0.0495, + "max_completion_length": 927.9, + "max_terminated_completion_length": 564.7, + "mean_completion_length": 181.3296875, + "mean_terminated_completion_length": 172.18771896362304, + "min_completion_length": 23.45, + "min_terminated_completion_length": 23.45, + "num_tokens": 10314206.0, + "reward": 0.24534607045352458, + "reward_std": 0.2094151984900236, + "rewards/format_reward/mean": 0.23828125, + "rewards/format_reward/std": 0.4178183376789093, + "rewards/qatch_metrics/mean": 0.22585521470755338, + "rewards/qatch_metrics/std": 0.32939945682883265, + "rewards/tag_count_reward/mean": 0.5908203125, + "rewards/tag_count_reward/std": 0.2986594527959824, + "step": 75 + }, + { + "clip_ratio": 0.0, + "clipped_completions_ratio": 0.00390625, + "epoch": 0.14103129131776113, + "grad_norm": 1.142379485289819, + "kl": 0.051806640625, + "learning_rate": 1e-06, + "loss": 0.0794, + "max_completion_length": 725.95, + "max_terminated_completion_length": 543.8, + "mean_completion_length": 172.446875, + "mean_terminated_completion_length": 165.8302963256836, + "min_completion_length": 39.55, + "min_terminated_completion_length": 39.55, + "num_tokens": 11001674.0, + "reward": 0.3384398899972439, + "reward_std": 0.24649502858519554, + "rewards/format_reward/mean": 0.54296875, + "rewards/format_reward/std": 0.4884683877229691, + "rewards/qatch_metrics/mean": 0.28772110007703305, + "rewards/qatch_metrics/std": 0.38454234302043916, + "rewards/tag_count_reward/mean": 0.7916015625, + "rewards/tag_count_reward/std": 0.2731324777007103, + "step": 80 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 2780.4, + "completions/max_terminated_length": 941.2, + "completions/mean_length": 178.7609375, + "completions/mean_terminated_length": 163.41346435546876, + "completions/min_length": 26.8, + "completions/min_terminated_length": 26.8, + "epoch": 0.1498457470251212, + "grad_norm": 2.58777855961075, + "kl": 0.07294921875, + "learning_rate": 1e-06, + "loss": 0.1454, + "num_tokens": 717486.0, + "reward": 0.312135910987854, + "reward_std": 0.2266964465379715, + "rewards/format_reward/mean": 0.77265625, + "rewards/format_reward/std": 0.4172531723976135, + "rewards/qatch_metrics/mean": 0.22495078444480895, + "rewards/qatch_metrics/std": 0.3556622087955475, + "rewards/tag_count_reward/mean": 0.8732421875, + "rewards/tag_count_reward/std": 0.25258718729019164, + "step": 85 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 743.6, + "completions/max_terminated_length": 743.6, + "completions/mean_length": 140.06875, + "completions/mean_terminated_length": 140.06875, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 0.15866020273248127, + "grad_norm": 1.3938857270995895, + "kl": 0.0706787109375, + "learning_rate": 1e-06, + "loss": 0.0265, + "num_tokens": 1363254.0, + "reward": 0.3290148377418518, + "reward_std": 0.220520544052124, + "rewards/format_reward/mean": 0.75703125, + "rewards/format_reward/std": 0.4248744070529938, + "rewards/qatch_metrics/mean": 0.24871459007263183, + "rewards/qatch_metrics/std": 0.367422616481781, + "rewards/tag_count_reward/mean": 0.8380859375, + "rewards/tag_count_reward/std": 0.2936785161495209, + "step": 90 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.003125, + "completions/max_length": 2624.8, + "completions/max_terminated_length": 1089.0, + "completions/mean_length": 144.7390625, + "completions/mean_terminated_length": 132.38715515136718, + "completions/min_length": 38.6, + "completions/min_terminated_length": 38.6, + "epoch": 0.16747465843984133, + "grad_norm": 1.271013018051317, + "kl": 0.0874755859375, + "learning_rate": 1e-06, + "loss": 0.1072, + "num_tokens": 1984568.0, + "reward": 0.4227922260761261, + "reward_std": 0.22174089550971984, + "rewards/format_reward/mean": 0.9515625, + "rewards/format_reward/std": 0.21198658645153046, + "rewards/qatch_metrics/mean": 0.32884793281555175, + "rewards/qatch_metrics/std": 0.411483907699585, + "rewards/tag_count_reward/mean": 0.9623046875, + "rewards/tag_count_reward/std": 0.15252943634986876, + "step": 95 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0015625, + "completions/max_length": 1262.2, + "completions/max_terminated_length": 539.4, + "completions/mean_length": 135.00625, + "completions/mean_terminated_length": 128.81027221679688, + "completions/min_length": 37.4, + "completions/min_terminated_length": 37.4, + "epoch": 0.17628911414720141, + "grad_norm": 1.4288833040090947, + "kl": 0.093994140625, + "learning_rate": 1e-06, + "loss": 0.0551, + "num_tokens": 2650624.0, + "reward": 0.3893065094947815, + "reward_std": 0.21477862894535066, + "rewards/format_reward/mean": 0.96171875, + "rewards/format_reward/std": 0.1881812334060669, + "rewards/qatch_metrics/mean": 0.28727005124092103, + "rewards/qatch_metrics/std": 0.3896294891834259, + "rewards/tag_count_reward/mean": 0.9791015625, + "rewards/tag_count_reward/std": 0.11189484894275666, + "step": 100 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.003125, + "completions/max_length": 1890.2, + "completions/max_terminated_length": 462.4, + "completions/mean_length": 132.55703125, + "completions/mean_terminated_length": 120.11187286376953, + "completions/min_length": 35.8, + "completions/min_terminated_length": 35.8, + "epoch": 0.18510356985456147, + "grad_norm": 1.1514380769030061, + "kl": 0.09775390625, + "learning_rate": 1e-06, + "loss": 0.0498, + "num_tokens": 3266889.0, + "reward": 0.3714154362678528, + "reward_std": 0.1868872672319412, + "rewards/format_reward/mean": 0.9484375, + "rewards/format_reward/std": 0.21911896765232086, + "rewards/qatch_metrics/mean": 0.2678416669368744, + "rewards/qatch_metrics/std": 0.36030757427215576, + "rewards/tag_count_reward/mean": 0.978125, + "rewards/tag_count_reward/std": 0.10167990401387214, + "step": 105 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00234375, + "completions/max_length": 2149.8, + "completions/max_terminated_length": 642.6, + "completions/mean_length": 131.7640625, + "completions/mean_terminated_length": 122.44811248779297, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "epoch": 0.19391802556192156, + "grad_norm": 1.1648416805076183, + "kl": 0.0974853515625, + "learning_rate": 1e-06, + "loss": 0.1006, + "num_tokens": 3884011.0, + "reward": 0.45167279839515684, + "reward_std": 0.2551474153995514, + "rewards/format_reward/mean": 0.9375, + "rewards/format_reward/std": 0.2396955519914627, + "rewards/qatch_metrics/mean": 0.3636867344379425, + "rewards/qatch_metrics/std": 0.4195810675621033, + "rewards/tag_count_reward/mean": 0.97578125, + "rewards/tag_count_reward/std": 0.10661737024784088, + "step": 110 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00078125, + "completions/max_length": 1134.6, + "completions/max_terminated_length": 383.0, + "completions/mean_length": 126.67265625, + "completions/mean_terminated_length": 123.56720428466797, + "completions/min_length": 32.8, + "completions/min_terminated_length": 32.8, + "epoch": 0.20273248126928162, + "grad_norm": 1.112770785107892, + "kl": 0.0981201171875, + "learning_rate": 1e-06, + "loss": 0.0582, + "num_tokens": 4527864.0, + "reward": 0.45048635005950927, + "reward_std": 0.24212915897369386, + "rewards/format_reward/mean": 0.90234375, + "rewards/format_reward/std": 0.2969411134719849, + "rewards/qatch_metrics/mean": 0.3671622335910797, + "rewards/qatch_metrics/std": 0.4069118857383728, + "rewards/tag_count_reward/mean": 0.96328125, + "rewards/tag_count_reward/std": 0.12083393186330796, + "step": 115 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.003125, + "completions/max_length": 2718.2, + "completions/max_terminated_length": 666.6, + "completions/mean_length": 135.815625, + "completions/mean_terminated_length": 123.39766387939453, + "completions/min_length": 40.6, + "completions/min_terminated_length": 40.6, + "epoch": 0.2115469369766417, + "grad_norm": 2.8196404883813453, + "kl": 0.10830078125, + "learning_rate": 1e-06, + "loss": 0.0955, + "num_tokens": 5204604.0, + "reward": 0.4444663166999817, + "reward_std": 0.22749231457710267, + "rewards/format_reward/mean": 0.9078125, + "rewards/format_reward/std": 0.28959383964538576, + "rewards/qatch_metrics/mean": 0.3591492176055908, + "rewards/qatch_metrics/std": 0.4204003632068634, + "rewards/tag_count_reward/mean": 0.9681640625, + "rewards/tag_count_reward/std": 0.11976957470178604, + "step": 120 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.003125, + "completions/max_length": 2639.8, + "completions/max_terminated_length": 459.2, + "completions/mean_length": 147.5625, + "completions/mean_terminated_length": 135.18407287597657, + "completions/min_length": 34.8, + "completions/min_terminated_length": 34.8, + "epoch": 0.22036139268400176, + "grad_norm": 1.2443606977665935, + "kl": 0.09527587890625, + "learning_rate": 1e-06, + "loss": 0.0903, + "num_tokens": 5850844.0, + "reward": 0.4391818165779114, + "reward_std": 0.24111129343509674, + "rewards/format_reward/mean": 0.88359375, + "rewards/format_reward/std": 0.32070607542991636, + "rewards/qatch_metrics/mean": 0.35628697872161863, + "rewards/qatch_metrics/std": 0.41108678579330443, + "rewards/tag_count_reward/mean": 0.9595703125, + "rewards/tag_count_reward/std": 0.1286213666200638, + "step": 125 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00234375, + "completions/max_length": 2638.2, + "completions/max_terminated_length": 729.4, + "completions/mean_length": 148.103125, + "completions/mean_terminated_length": 138.83536682128906, + "completions/min_length": 43.4, + "completions/min_terminated_length": 43.4, + "epoch": 0.22917584839136185, + "grad_norm": 1.2078511430760708, + "kl": 0.08446044921875, + "learning_rate": 1e-06, + "loss": 0.1052, + "num_tokens": 6480960.0, + "reward": 0.44752122163772584, + "reward_std": 0.23120047450065612, + "rewards/format_reward/mean": 0.87109375, + "rewards/format_reward/std": 0.33249542117118835, + "rewards/qatch_metrics/mean": 0.3680166721343994, + "rewards/qatch_metrics/std": 0.4190321207046509, + "rewards/tag_count_reward/mean": 0.951953125, + "rewards/tag_count_reward/std": 0.14520585983991624, + "step": 130 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0015625, + "completions/max_length": 2009.2, + "completions/max_terminated_length": 904.6, + "completions/mean_length": 152.75390625, + "completions/mean_terminated_length": 146.58113708496094, + "completions/min_length": 44.2, + "completions/min_terminated_length": 44.2, + "epoch": 0.2379903040987219, + "grad_norm": 0.9663165749537755, + "kl": 0.084228515625, + "learning_rate": 1e-06, + "loss": 0.0976, + "num_tokens": 7141781.0, + "reward": 0.4069031774997711, + "reward_std": 0.24234023094177246, + "rewards/format_reward/mean": 0.86015625, + "rewards/format_reward/std": 0.34610814452171323, + "rewards/qatch_metrics/mean": 0.3220804750919342, + "rewards/qatch_metrics/std": 0.4035941183567047, + "rewards/tag_count_reward/mean": 0.9423828125, + "rewards/tag_count_reward/std": 0.16873225271701814, + "step": 135 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00078125, + "completions/max_length": 1271.4, + "completions/max_terminated_length": 543.8, + "completions/mean_length": 142.09921875, + "completions/mean_terminated_length": 139.02362060546875, + "completions/min_length": 46.4, + "completions/min_terminated_length": 46.4, + "epoch": 0.24680475980608196, + "grad_norm": 1.120567941307361, + "kl": 0.0905029296875, + "learning_rate": 1e-06, + "loss": 0.0802, + "num_tokens": 7791572.0, + "reward": 0.44534188508987427, + "reward_std": 0.24042359590530396, + "rewards/format_reward/mean": 0.878125, + "rewards/format_reward/std": 0.3254675090312958, + "rewards/qatch_metrics/mean": 0.36461407542228697, + "rewards/qatch_metrics/std": 0.42095342874526975, + "rewards/tag_count_reward/mean": 0.9521484375, + "rewards/tag_count_reward/std": 0.15115214437246322, + "step": 140 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00078125, + "completions/max_length": 1145.2, + "completions/max_terminated_length": 412.0, + "completions/mean_length": 134.07890625, + "completions/mean_terminated_length": 130.97618713378907, + "completions/min_length": 40.8, + "completions/min_terminated_length": 40.8, + "epoch": 0.255619215513442, + "grad_norm": 1.2960180615344776, + "kl": 0.090283203125, + "learning_rate": 1e-06, + "loss": 0.0413, + "num_tokens": 8406681.0, + "reward": 0.4552301824092865, + "reward_std": 0.238674333691597, + "rewards/format_reward/mean": 0.9109375, + "rewards/format_reward/std": 0.280667769908905, + "rewards/qatch_metrics/mean": 0.3717781364917755, + "rewards/qatch_metrics/std": 0.4111446261405945, + "rewards/tag_count_reward/mean": 0.9625, + "rewards/tag_count_reward/std": 0.13774650245904924, + "step": 145 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00078125, + "completions/max_length": 1219.0, + "completions/max_terminated_length": 542.6, + "completions/mean_length": 137.4578125, + "completions/mean_terminated_length": 134.36369934082032, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "epoch": 0.26443367122080214, + "grad_norm": 1.1814014331114655, + "kl": 0.091650390625, + "learning_rate": 1e-06, + "loss": 0.0474, + "num_tokens": 9060515.0, + "reward": 0.4308152377605438, + "reward_std": 0.25072828829288485, + "rewards/format_reward/mean": 0.92265625, + "rewards/format_reward/std": 0.2654747039079666, + "rewards/qatch_metrics/mean": 0.3416989743709564, + "rewards/qatch_metrics/std": 0.4145464479923248, + "rewards/tag_count_reward/mean": 0.962109375, + "rewards/tag_count_reward/std": 0.14527225494384766, + "step": 150 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.003125, + "completions/max_length": 3343.2, + "completions/max_terminated_length": 465.0, + "completions/mean_length": 146.34140625, + "completions/mean_terminated_length": 133.97686157226562, + "completions/min_length": 39.6, + "completions/min_terminated_length": 39.6, + "epoch": 0.2732481269281622, + "grad_norm": 1.119412659054034, + "kl": 0.0925048828125, + "learning_rate": 1e-06, + "loss": 0.108, + "num_tokens": 9726008.0, + "reward": 0.4162511765956879, + "reward_std": 0.22437838315963746, + "rewards/format_reward/mean": 0.93671875, + "rewards/format_reward/std": 0.2371742010116577, + "rewards/qatch_metrics/mean": 0.3222440242767334, + "rewards/qatch_metrics/std": 0.3945153594017029, + "rewards/tag_count_reward/mean": 0.9734375, + "rewards/tag_count_reward/std": 0.11057026386260986, + "step": 155 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 494.8, + "completions/max_terminated_length": 494.8, + "completions/mean_length": 125.86171875, + "completions/mean_terminated_length": 125.86171875, + "completions/min_length": 38.4, + "completions/min_terminated_length": 38.4, + "epoch": 0.28206258263552225, + "grad_norm": 1.0577575352944335, + "kl": 0.110693359375, + "learning_rate": 1e-06, + "loss": 0.0378, + "num_tokens": 10365223.0, + "reward": 0.49046963453292847, + "reward_std": 0.22210898101329804, + "rewards/format_reward/mean": 0.95234375, + "rewards/format_reward/std": 0.21217795908451081, + "rewards/qatch_metrics/mean": 0.40703229904174804, + "rewards/qatch_metrics/std": 0.4201949179172516, + "rewards/tag_count_reward/mean": 0.98515625, + "rewards/tag_count_reward/std": 0.07672805488109588, + "step": 160 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00078125, + "completions/max_length": 1195.2, + "completions/max_terminated_length": 445.8, + "completions/mean_length": 134.734375, + "completions/mean_terminated_length": 131.63231811523437, + "completions/min_length": 39.2, + "completions/min_terminated_length": 39.2, + "epoch": 0.2908770383428823, + "grad_norm": 1.1665219069490207, + "kl": 0.0982666015625, + "learning_rate": 1e-06, + "loss": 0.0211, + "num_tokens": 10984403.0, + "reward": 0.47783067226409914, + "reward_std": 0.2358974426984787, + "rewards/format_reward/mean": 0.96015625, + "rewards/format_reward/std": 0.1951357364654541, + "rewards/qatch_metrics/mean": 0.3911289095878601, + "rewards/qatch_metrics/std": 0.4188136160373688, + "rewards/tag_count_reward/mean": 0.987109375, + "rewards/tag_count_reward/std": 0.06311970800161362, + "step": 165 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00078125, + "completions/max_length": 1121.6, + "completions/max_terminated_length": 376.6, + "completions/mean_length": 148.51640625, + "completions/mean_terminated_length": 145.42068176269532, + "completions/min_length": 42.6, + "completions/min_terminated_length": 42.6, + "epoch": 0.2996914940502424, + "grad_norm": 1.217339460476114, + "kl": 0.0956298828125, + "learning_rate": 1e-06, + "loss": 0.046, + "num_tokens": 11677144.0, + "reward": 0.446321702003479, + "reward_std": 0.23696185946464537, + "rewards/format_reward/mean": 0.925, + "rewards/format_reward/std": 0.2625602900981903, + "rewards/qatch_metrics/mean": 0.35908021926879885, + "rewards/qatch_metrics/std": 0.3949739336967468, + "rewards/tag_count_reward/mean": 0.9720703125, + "rewards/tag_count_reward/std": 0.1141625314950943, + "step": 170 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0015625, + "completions/max_length": 1986.6, + "completions/max_terminated_length": 548.0, + "completions/mean_length": 159.27265625, + "completions/mean_terminated_length": 153.1066864013672, + "completions/min_length": 40.2, + "completions/min_terminated_length": 40.2, + "epoch": 0.3085059497576025, + "grad_norm": 1.1604091154811884, + "kl": 0.0948486328125, + "learning_rate": 1e-06, + "loss": 0.0555, + "num_tokens": 12347509.0, + "reward": 0.43740702271461485, + "reward_std": 0.2165643662214279, + "rewards/format_reward/mean": 0.8984375, + "rewards/format_reward/std": 0.30079524517059325, + "rewards/qatch_metrics/mean": 0.35224584937095643, + "rewards/qatch_metrics/std": 0.4033379018306732, + "rewards/tag_count_reward/mean": 0.9630859375, + "rewards/tag_count_reward/std": 0.12287088185548782, + "step": 175 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00234375, + "completions/max_length": 1919.6, + "completions/max_terminated_length": 460.2, + "completions/mean_length": 161.9671875, + "completions/mean_terminated_length": 152.73724365234375, + "completions/min_length": 41.4, + "completions/min_terminated_length": 41.4, + "epoch": 0.31732040546496254, + "grad_norm": 1.016812518785413, + "kl": 0.0965087890625, + "learning_rate": 1e-06, + "loss": 0.0535, + "num_tokens": 13026443.0, + "reward": 0.49172326922416687, + "reward_std": 0.2285678654909134, + "rewards/format_reward/mean": 0.903125, + "rewards/format_reward/std": 0.2938369959592819, + "rewards/qatch_metrics/mean": 0.41588308215141295, + "rewards/qatch_metrics/std": 0.43461284041404724, + "rewards/tag_count_reward/mean": 0.958203125, + "rewards/tag_count_reward/std": 0.13759158551692963, + "step": 180 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00078125, + "completions/max_length": 1250.8, + "completions/max_terminated_length": 516.6, + "completions/mean_length": 153.14375, + "completions/mean_terminated_length": 150.06268920898438, + "completions/min_length": 43.0, + "completions/min_terminated_length": 43.0, + "epoch": 0.3261348611723226, + "grad_norm": 2.943551495922741, + "kl": 0.1255859375, + "learning_rate": 1e-06, + "loss": 0.0417, + "num_tokens": 13684611.0, + "reward": 0.4662940502166748, + "reward_std": 0.22654231786727905, + "rewards/format_reward/mean": 0.909375, + "rewards/format_reward/std": 0.2848878413438797, + "rewards/qatch_metrics/mean": 0.3857020795345306, + "rewards/qatch_metrics/std": 0.4162748992443085, + "rewards/tag_count_reward/mean": 0.9501953125, + "rewards/tag_count_reward/std": 0.1399885058403015, + "step": 185 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.003125, + "completions/max_length": 2653.0, + "completions/max_terminated_length": 786.2, + "completions/mean_length": 160.4765625, + "completions/mean_terminated_length": 148.1515380859375, + "completions/min_length": 47.6, + "completions/min_terminated_length": 47.6, + "epoch": 0.33494931687968266, + "grad_norm": 1.1104622942084277, + "kl": 0.0940185546875, + "learning_rate": 1e-06, + "loss": 0.0702, + "num_tokens": 14338485.0, + "reward": 0.5249280750751495, + "reward_std": 0.22171878814697266, + "rewards/format_reward/mean": 0.89765625, + "rewards/format_reward/std": 0.3031489491462708, + "rewards/qatch_metrics/mean": 0.45655598640441897, + "rewards/qatch_metrics/std": 0.43402122855186465, + "rewards/tag_count_reward/mean": 0.941796875, + "rewards/tag_count_reward/std": 0.16171995401382447, + "step": 190 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00078125, + "completions/max_length": 1142.6, + "completions/max_terminated_length": 420.2, + "completions/mean_length": 142.35078125, + "completions/mean_terminated_length": 139.25772094726562, + "completions/min_length": 39.0, + "completions/min_terminated_length": 39.0, + "epoch": 0.34376377258704277, + "grad_norm": 1.0499982391674234, + "kl": 0.1005126953125, + "learning_rate": 1e-06, + "loss": 0.0386, + "num_tokens": 14972214.0, + "reward": 0.46582343578338625, + "reward_std": 0.22475437819957733, + "rewards/format_reward/mean": 0.93671875, + "rewards/format_reward/std": 0.2403053015470505, + "rewards/qatch_metrics/mean": 0.3805643320083618, + "rewards/qatch_metrics/std": 0.4070888340473175, + "rewards/tag_count_reward/mean": 0.9734375, + "rewards/tag_count_reward/std": 0.11377856433391571, + "step": 195 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0015625, + "completions/max_length": 2080.4, + "completions/max_terminated_length": 718.8, + "completions/mean_length": 147.2296875, + "completions/mean_terminated_length": 141.05357666015624, + "completions/min_length": 41.2, + "completions/min_terminated_length": 41.2, + "epoch": 0.35257822829440283, + "grad_norm": 1.0220429404852622, + "kl": 0.1034912109375, + "learning_rate": 1e-06, + "loss": 0.0575, + "num_tokens": 15639820.0, + "reward": 0.4750072777271271, + "reward_std": 0.2135873943567276, + "rewards/format_reward/mean": 0.92421875, + "rewards/format_reward/std": 0.26192537546157835, + "rewards/qatch_metrics/mean": 0.39318412244319917, + "rewards/qatch_metrics/std": 0.39594073295593263, + "rewards/tag_count_reward/mean": 0.967578125, + "rewards/tag_count_reward/std": 0.1268332213163376, + "step": 200 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0015625, + "completions/max_length": 1109.2, + "completions/max_terminated_length": 358.6, + "completions/mean_length": 139.92421875, + "completions/mean_terminated_length": 133.72031860351564, + "completions/min_length": 44.0, + "completions/min_terminated_length": 44.0, + "epoch": 0.3613926840017629, + "grad_norm": 1.1782501720519973, + "kl": 0.1066162109375, + "learning_rate": 1e-06, + "loss": 0.0398, + "num_tokens": 16303995.0, + "reward": 0.4705925226211548, + "reward_std": 0.18503921926021577, + "rewards/format_reward/mean": 0.95, + "rewards/format_reward/std": 0.21313293874263764, + "rewards/qatch_metrics/mean": 0.3844171941280365, + "rewards/qatch_metrics/std": 0.37895620465278623, + "rewards/tag_count_reward/mean": 0.9767578125, + "rewards/tag_count_reward/std": 0.10622683316469192, + "step": 205 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00078125, + "completions/max_length": 1162.8, + "completions/max_terminated_length": 442.2, + "completions/mean_length": 137.20234375, + "completions/mean_terminated_length": 134.1083190917969, + "completions/min_length": 42.8, + "completions/min_terminated_length": 42.8, + "epoch": 0.37020713970912295, + "grad_norm": 1.08337248687343, + "kl": 0.103369140625, + "learning_rate": 1e-06, + "loss": 0.021, + "num_tokens": 16916398.0, + "reward": 0.5206815063953399, + "reward_std": 0.23496688902378082, + "rewards/format_reward/mean": 0.9484375, + "rewards/format_reward/std": 0.22106002569198607, + "rewards/qatch_metrics/mean": 0.44340285658836365, + "rewards/qatch_metrics/std": 0.4299581289291382, + "rewards/tag_count_reward/mean": 0.97890625, + "rewards/tag_count_reward/std": 0.0956751674413681, + "step": 210 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.003125, + "completions/max_length": 2602.8, + "completions/max_terminated_length": 365.2, + "completions/mean_length": 152.73984375, + "completions/mean_terminated_length": 140.39059753417968, + "completions/min_length": 43.0, + "completions/min_terminated_length": 43.0, + "epoch": 0.37902159541648306, + "grad_norm": 1.0746862969416158, + "kl": 0.1025390625, + "learning_rate": 1e-06, + "loss": 0.0729, + "num_tokens": 17589905.0, + "reward": 0.4809570789337158, + "reward_std": 0.2217806786298752, + "rewards/format_reward/mean": 0.95, + "rewards/format_reward/std": 0.2149397164583206, + "rewards/qatch_metrics/mean": 0.3963695228099823, + "rewards/qatch_metrics/std": 0.4306588113307953, + "rewards/tag_count_reward/mean": 0.980859375, + "rewards/tag_count_reward/std": 0.09641121476888656, + "step": 215 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0015625, + "completions/max_length": 2013.2, + "completions/max_terminated_length": 590.6, + "completions/mean_length": 151.1234375, + "completions/mean_terminated_length": 144.9534942626953, + "completions/min_length": 43.6, + "completions/min_terminated_length": 43.6, + "epoch": 0.3878360511238431, + "grad_norm": 1.1093719128866055, + "kl": 0.0995849609375, + "learning_rate": 1e-06, + "loss": 0.0538, + "num_tokens": 18226287.0, + "reward": 0.5117225289344788, + "reward_std": 0.23414760828018188, + "rewards/format_reward/mean": 0.9375, + "rewards/format_reward/std": 0.2397076427936554, + "rewards/qatch_metrics/mean": 0.43437943458557127, + "rewards/qatch_metrics/std": 0.4301003873348236, + "rewards/tag_count_reward/mean": 0.975, + "rewards/tag_count_reward/std": 0.10860425382852554, + "step": 220 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0015625, + "completions/max_length": 1985.0, + "completions/max_terminated_length": 565.6, + "completions/mean_length": 158.75078125, + "completions/mean_terminated_length": 152.5970947265625, + "completions/min_length": 43.6, + "completions/min_terminated_length": 43.6, + "epoch": 0.3966505068312032, + "grad_norm": 1.0904499601554711, + "kl": 0.1022705078125, + "learning_rate": 1e-06, + "loss": 0.0853, + "num_tokens": 18930000.0, + "reward": 0.5386084854602814, + "reward_std": 0.19289222061634065, + "rewards/format_reward/mean": 0.9125, + "rewards/format_reward/std": 0.2818691849708557, + "rewards/qatch_metrics/mean": 0.46959454417228697, + "rewards/qatch_metrics/std": 0.42910557985305786, + "rewards/tag_count_reward/mean": 0.9640625, + "rewards/tag_count_reward/std": 0.13219460248947143, + "step": 225 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0015625, + "completions/max_length": 1952.6, + "completions/max_terminated_length": 454.4, + "completions/mean_length": 162.29296875, + "completions/mean_terminated_length": 156.1261993408203, + "completions/min_length": 52.4, + "completions/min_terminated_length": 52.4, + "epoch": 0.40546496253856323, + "grad_norm": 1.0851594244493181, + "kl": 0.1014404296875, + "learning_rate": 1e-06, + "loss": 0.0638, + "num_tokens": 19599671.0, + "reward": 0.5117276430130004, + "reward_std": 0.24680890440940856, + "rewards/format_reward/mean": 0.8921875, + "rewards/format_reward/std": 0.30913242101669314, + "rewards/qatch_metrics/mean": 0.4411179721355438, + "rewards/qatch_metrics/std": 0.43982199430465696, + "rewards/tag_count_reward/mean": 0.951171875, + "rewards/tag_count_reward/std": 0.15551512241363524, + "step": 230 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00078125, + "completions/max_length": 1092.6, + "completions/max_terminated_length": 378.0, + "completions/mean_length": 151.4390625, + "completions/mean_terminated_length": 148.35447387695314, + "completions/min_length": 42.8, + "completions/min_terminated_length": 42.8, + "epoch": 0.4142794182459233, + "grad_norm": 0.9747077221272922, + "kl": 0.1077880859375, + "learning_rate": 1e-06, + "loss": 0.0322, + "num_tokens": 20243993.0, + "reward": 0.5243084728717804, + "reward_std": 0.23080018162727356, + "rewards/format_reward/mean": 0.91640625, + "rewards/format_reward/std": 0.2711502879858017, + "rewards/qatch_metrics/mean": 0.4524148523807526, + "rewards/qatch_metrics/std": 0.4158477485179901, + "rewards/tag_count_reward/mean": 0.9623046875, + "rewards/tag_count_reward/std": 0.1330309897661209, + "step": 235 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00078125, + "completions/max_length": 1154.2, + "completions/max_terminated_length": 407.8, + "completions/mean_length": 149.76953125, + "completions/mean_terminated_length": 146.6907531738281, + "completions/min_length": 46.0, + "completions/min_terminated_length": 46.0, + "epoch": 0.4230938739532834, + "grad_norm": 0.9985959553205879, + "kl": 0.1137451171875, + "learning_rate": 1e-06, + "loss": 0.0347, + "num_tokens": 20888562.0, + "reward": 0.45308218002319334, + "reward_std": 0.21805870532989502, + "rewards/format_reward/mean": 0.92109375, + "rewards/format_reward/std": 0.2692244678735733, + "rewards/qatch_metrics/mean": 0.36815963983535765, + "rewards/qatch_metrics/std": 0.4063821077346802, + "rewards/tag_count_reward/mean": 0.9607421875, + "rewards/tag_count_reward/std": 0.14251872897148132, + "step": 240 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00078125, + "completions/max_length": 1154.8, + "completions/max_terminated_length": 422.6, + "completions/mean_length": 141.10703125, + "completions/mean_terminated_length": 138.00346984863282, + "completions/min_length": 42.4, + "completions/min_terminated_length": 42.4, + "epoch": 0.43190832966064346, + "grad_norm": 1.206317234569705, + "kl": 0.1264892578125, + "learning_rate": 1e-06, + "loss": 0.033, + "num_tokens": 21529883.0, + "reward": 0.5218020260334015, + "reward_std": 0.21800511479377746, + "rewards/format_reward/mean": 0.94296875, + "rewards/format_reward/std": 0.23152050971984864, + "rewards/qatch_metrics/mean": 0.44588152766227723, + "rewards/qatch_metrics/std": 0.41684806942939756, + "rewards/tag_count_reward/mean": 0.9701171875, + "rewards/tag_count_reward/std": 0.12424642890691757, + "step": 245 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00234375, + "completions/max_length": 1911.8, + "completions/max_terminated_length": 458.6, + "completions/mean_length": 154.50234375, + "completions/mean_terminated_length": 145.24712524414062, + "completions/min_length": 46.0, + "completions/min_terminated_length": 46.0, + "epoch": 0.4407227853680035, + "grad_norm": 1.0646070893278259, + "kl": 0.108837890625, + "learning_rate": 1e-06, + "loss": 0.0425, + "num_tokens": 22202974.0, + "reward": 0.5250638484954834, + "reward_std": 0.23499601781368257, + "rewards/format_reward/mean": 0.92578125, + "rewards/format_reward/std": 0.26019937098026275, + "rewards/qatch_metrics/mean": 0.4519937574863434, + "rewards/qatch_metrics/std": 0.43260250687599183, + "rewards/tag_count_reward/mean": 0.9658203125, + "rewards/tag_count_reward/std": 0.13058245778083802, + "step": 250 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00078125, + "completions/max_length": 1164.6, + "completions/max_terminated_length": 436.8, + "completions/mean_length": 149.890625, + "completions/mean_terminated_length": 146.80228271484376, + "completions/min_length": 50.8, + "completions/min_terminated_length": 50.8, + "epoch": 0.4495372410753636, + "grad_norm": 1.0628232924013388, + "kl": 0.1126220703125, + "learning_rate": 1e-06, + "loss": 0.0354, + "num_tokens": 22890018.0, + "reward": 0.49608793258666994, + "reward_std": 0.24011301696300508, + "rewards/format_reward/mean": 0.91640625, + "rewards/format_reward/std": 0.2767932593822479, + "rewards/qatch_metrics/mean": 0.4192716181278229, + "rewards/qatch_metrics/std": 0.4066218316555023, + "rewards/tag_count_reward/mean": 0.961328125, + "rewards/tag_count_reward/std": 0.1395171895623207, + "step": 255 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 716.8, + "completions/max_terminated_length": 716.8, + "completions/mean_length": 156.6890625, + "completions/mean_terminated_length": 156.6890625, + "completions/min_length": 47.4, + "completions/min_terminated_length": 47.4, + "epoch": 0.4583516967827237, + "grad_norm": 1.0620716307535578, + "kl": 0.1146728515625, + "learning_rate": 1e-06, + "loss": 0.0473, + "num_tokens": 23537540.0, + "reward": 0.46006324887275696, + "reward_std": 0.20722155570983886, + "rewards/format_reward/mean": 0.9140625, + "rewards/format_reward/std": 0.2800079345703125, + "rewards/qatch_metrics/mean": 0.3774526119232178, + "rewards/qatch_metrics/std": 0.40044850707054136, + "rewards/tag_count_reward/mean": 0.9564453125, + "rewards/tag_count_reward/std": 0.15245147049427032, + "step": 260 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 556.2, + "completions/max_terminated_length": 556.2, + "completions/mean_length": 143.2, + "completions/mean_terminated_length": 143.2, + "completions/min_length": 43.0, + "completions/min_terminated_length": 43.0, + "epoch": 0.46716615249008375, + "grad_norm": 1.0417871765342246, + "kl": 0.120166015625, + "learning_rate": 1e-06, + "loss": 0.0284, + "num_tokens": 24185028.0, + "reward": 0.6036619067192077, + "reward_std": 0.20974452793598175, + "rewards/format_reward/mean": 0.9546875, + "rewards/format_reward/std": 0.20255258679389954, + "rewards/qatch_metrics/mean": 0.5404294610023499, + "rewards/qatch_metrics/std": 0.4255226194858551, + "rewards/tag_count_reward/mean": 0.9765625, + "rewards/tag_count_reward/std": 0.11209065765142441, + "step": 265 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00078125, + "completions/max_length": 1263.4, + "completions/max_terminated_length": 545.8, + "completions/mean_length": 153.36640625, + "completions/mean_terminated_length": 150.29201049804686, + "completions/min_length": 48.2, + "completions/min_terminated_length": 48.2, + "epoch": 0.4759806081974438, + "grad_norm": 1.2230142515821079, + "kl": 0.12080078125, + "learning_rate": 1e-06, + "loss": 0.0454, + "num_tokens": 24848617.0, + "reward": 0.5000587105751038, + "reward_std": 0.19942412078380584, + "rewards/format_reward/mean": 0.953125, + "rewards/format_reward/std": 0.2082734227180481, + "rewards/qatch_metrics/mean": 0.4188075542449951, + "rewards/qatch_metrics/std": 0.4028447926044464, + "rewards/tag_count_reward/mean": 0.9751953125, + "rewards/tag_count_reward/std": 0.11493908390402793, + "step": 270 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 545.0, + "completions/max_terminated_length": 545.0, + "completions/mean_length": 146.3046875, + "completions/mean_terminated_length": 146.3046875, + "completions/min_length": 45.6, + "completions/min_terminated_length": 45.6, + "epoch": 0.48479506390480387, + "grad_norm": 1.11053365840032, + "kl": 0.1185302734375, + "learning_rate": 1e-06, + "loss": 0.0403, + "num_tokens": 25530191.0, + "reward": 0.5597240447998046, + "reward_std": 0.22672632932662964, + "rewards/format_reward/mean": 0.96328125, + "rewards/format_reward/std": 0.18625771403312683, + "rewards/qatch_metrics/mean": 0.4874395847320557, + "rewards/qatch_metrics/std": 0.41059340834617614, + "rewards/tag_count_reward/mean": 0.9814453125, + "rewards/tag_count_reward/std": 0.09651189893484116, + "step": 275 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00078125, + "completions/max_length": 1201.8, + "completions/max_terminated_length": 482.4, + "completions/mean_length": 142.7421875, + "completions/mean_terminated_length": 139.64728698730468, + "completions/min_length": 45.2, + "completions/min_terminated_length": 45.2, + "epoch": 0.4936095196121639, + "grad_norm": 1.0811085458763714, + "kl": 0.126318359375, + "learning_rate": 1e-06, + "loss": 0.0566, + "num_tokens": 26189061.0, + "reward": 0.4923192024230957, + "reward_std": 0.197740375995636, + "rewards/format_reward/mean": 0.9578125, + "rewards/format_reward/std": 0.1986761748790741, + "rewards/qatch_metrics/mean": 0.40876016914844515, + "rewards/qatch_metrics/std": 0.3987727761268616, + "rewards/tag_count_reward/mean": 0.9818359375, + "rewards/tag_count_reward/std": 0.08791020289063453, + "step": 280 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1150.6, + "completions/max_terminated_length": 1150.6, + "completions/mean_length": 149.32734375, + "completions/mean_terminated_length": 149.32734375, + "completions/min_length": 43.6, + "completions/min_terminated_length": 43.6, + "epoch": 0.502423975319524, + "grad_norm": 1.061030164323382, + "kl": 0.1281982421875, + "learning_rate": 1e-06, + "loss": 0.0294, + "num_tokens": 26873144.0, + "reward": 0.47796512842178346, + "reward_std": 0.21353891789913176, + "rewards/format_reward/mean": 0.95, + "rewards/format_reward/std": 0.21780532896518706, + "rewards/qatch_metrics/mean": 0.392907041311264, + "rewards/qatch_metrics/std": 0.4109325408935547, + "rewards/tag_count_reward/mean": 0.9798828125, + "rewards/tag_count_reward/std": 0.09564688950777053, + "step": 285 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0015625, + "completions/max_length": 1897.2, + "completions/max_terminated_length": 495.2, + "completions/mean_length": 149.51796875, + "completions/mean_terminated_length": 143.3479248046875, + "completions/min_length": 42.0, + "completions/min_terminated_length": 42.0, + "epoch": 0.511238431026884, + "grad_norm": 1.0941932001692303, + "kl": 0.1298095703125, + "learning_rate": 1e-06, + "loss": 0.0626, + "num_tokens": 27539071.0, + "reward": 0.4637163817882538, + "reward_std": 0.18355560302734375, + "rewards/format_reward/mean": 0.92109375, + "rewards/format_reward/std": 0.2646732360124588, + "rewards/qatch_metrics/mean": 0.38052110075950624, + "rewards/qatch_metrics/std": 0.3895488500595093, + "rewards/tag_count_reward/mean": 0.96328125, + "rewards/tag_count_reward/std": 0.1275038242340088, + "step": 290 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 840.2, + "completions/max_terminated_length": 840.2, + "completions/mean_length": 146.7, + "completions/mean_terminated_length": 146.7, + "completions/min_length": 44.2, + "completions/min_terminated_length": 44.2, + "epoch": 0.5200528867342442, + "grad_norm": 1.156974504094534, + "kl": 0.127880859375, + "learning_rate": 1e-06, + "loss": 0.0401, + "num_tokens": 28189663.0, + "reward": 0.4947424054145813, + "reward_std": 0.2121095508337021, + "rewards/format_reward/mean": 0.890625, + "rewards/format_reward/std": 0.308673033118248, + "rewards/qatch_metrics/mean": 0.42144557237625124, + "rewards/qatch_metrics/std": 0.4137202322483063, + "rewards/tag_count_reward/mean": 0.9490234375, + "rewards/tag_count_reward/std": 0.15444399118423463, + "step": 295 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00078125, + "completions/max_length": 1255.6, + "completions/max_terminated_length": 607.6, + "completions/mean_length": 160.95703125, + "completions/mean_terminated_length": 157.87724609375, + "completions/min_length": 46.4, + "completions/min_terminated_length": 46.4, + "epoch": 0.5288673424416043, + "grad_norm": 1.0193761202428142, + "kl": 0.1197265625, + "learning_rate": 1e-06, + "loss": 0.045, + "num_tokens": 28883656.0, + "reward": 0.5260525703430176, + "reward_std": 0.21179039478302003, + "rewards/format_reward/mean": 0.9359375, + "rewards/format_reward/std": 0.24322082698345185, + "rewards/qatch_metrics/mean": 0.4514336109161377, + "rewards/qatch_metrics/std": 0.42365469336509703, + "rewards/tag_count_reward/mean": 0.9748046875, + "rewards/tag_count_reward/std": 0.10406171679496765, + "step": 300 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0015625, + "completions/max_length": 2045.6, + "completions/max_terminated_length": 651.8, + "completions/mean_length": 167.04921875, + "completions/mean_terminated_length": 160.9025909423828, + "completions/min_length": 47.4, + "completions/min_terminated_length": 47.4, + "epoch": 0.5376817981489643, + "grad_norm": 1.0505969033833242, + "kl": 0.1183837890625, + "learning_rate": 1e-06, + "loss": 0.074, + "num_tokens": 29572327.0, + "reward": 0.5555553436279297, + "reward_std": 0.23490612506866454, + "rewards/format_reward/mean": 0.9421875, + "rewards/format_reward/std": 0.23279777467250823, + "rewards/qatch_metrics/mean": 0.48544191718101504, + "rewards/qatch_metrics/std": 0.4043150365352631, + "rewards/tag_count_reward/mean": 0.97421875, + "rewards/tag_count_reward/std": 0.11451640278100968, + "step": 305 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00234375, + "completions/max_length": 1991.8, + "completions/max_terminated_length": 547.4, + "completions/mean_length": 163.9890625, + "completions/mean_terminated_length": 154.75635986328126, + "completions/min_length": 48.6, + "completions/min_terminated_length": 48.6, + "epoch": 0.5464962538563244, + "grad_norm": 1.0184747818610653, + "kl": 0.1130615234375, + "learning_rate": 1e-06, + "loss": 0.0564, + "num_tokens": 30237305.0, + "reward": 0.5232127249240875, + "reward_std": 0.22367032766342163, + "rewards/format_reward/mean": 0.92890625, + "rewards/format_reward/std": 0.25724474191665647, + "rewards/qatch_metrics/mean": 0.4492989718914032, + "rewards/qatch_metrics/std": 0.4237508654594421, + "rewards/tag_count_reward/mean": 0.968359375, + "rewards/tag_count_reward/std": 0.12655377388000488, + "step": 310 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00078125, + "completions/max_length": 1175.4, + "completions/max_terminated_length": 489.4, + "completions/mean_length": 165.95546875, + "completions/mean_terminated_length": 162.88382873535156, + "completions/min_length": 52.8, + "completions/min_terminated_length": 52.8, + "epoch": 0.5553107095636844, + "grad_norm": 0.926469518807395, + "kl": 0.116796875, + "learning_rate": 1e-06, + "loss": 0.034, + "num_tokens": 30935440.0, + "reward": 0.5955564260482789, + "reward_std": 0.22866220772266388, + "rewards/format_reward/mean": 0.91328125, + "rewards/format_reward/std": 0.28102830052375793, + "rewards/qatch_metrics/mean": 0.5368104040622711, + "rewards/qatch_metrics/std": 0.4246533751487732, + "rewards/tag_count_reward/mean": 0.9587890625, + "rewards/tag_count_reward/std": 0.1424473986029625, + "step": 315 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 832.0, + "completions/max_terminated_length": 832.0, + "completions/mean_length": 173.453125, + "completions/mean_terminated_length": 173.453125, + "completions/min_length": 38.4, + "completions/min_terminated_length": 38.4, + "epoch": 0.5641251652710445, + "grad_norm": 1.013203860912236, + "kl": 0.1189208984375, + "learning_rate": 1e-06, + "loss": 0.0382, + "num_tokens": 31617748.0, + "reward": 0.5389631450176239, + "reward_std": 0.21291258931159973, + "rewards/format_reward/mean": 0.90859375, + "rewards/format_reward/std": 0.28675017356872556, + "rewards/qatch_metrics/mean": 0.47106875777244567, + "rewards/qatch_metrics/std": 0.42714625000953677, + "rewards/tag_count_reward/mean": 0.95390625, + "rewards/tag_count_reward/std": 0.16191803216934203, + "step": 320 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.003125, + "completions/max_length": 2694.6, + "completions/max_terminated_length": 553.8, + "completions/mean_length": 184.3796875, + "completions/mean_terminated_length": 172.1176513671875, + "completions/min_length": 44.8, + "completions/min_terminated_length": 44.8, + "epoch": 0.5729396209784046, + "grad_norm": 1.0308637085090815, + "kl": 0.1224853515625, + "learning_rate": 1e-06, + "loss": 0.0671, + "num_tokens": 32294954.0, + "reward": 0.5857669234275817, + "reward_std": 0.215561243891716, + "rewards/format_reward/mean": 0.9359375, + "rewards/format_reward/std": 0.24411277770996093, + "rewards/qatch_metrics/mean": 0.5224210977554321, + "rewards/qatch_metrics/std": 0.4328895270824432, + "rewards/tag_count_reward/mean": 0.9623046875, + "rewards/tag_count_reward/std": 0.15089936107397078, + "step": 325 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 496.4, + "completions/max_terminated_length": 496.4, + "completions/mean_length": 158.73125, + "completions/mean_terminated_length": 158.73125, + "completions/min_length": 32.0, + "completions/min_terminated_length": 32.0, + "epoch": 0.5817540766857646, + "grad_norm": 13.29499090198915, + "kl": 0.1685302734375, + "learning_rate": 1e-06, + "loss": 0.0302, + "num_tokens": 32972034.0, + "reward": 0.5693387031555176, + "reward_std": 0.20702467262744903, + "rewards/format_reward/mean": 0.91640625, + "rewards/format_reward/std": 0.2766654253005981, + "rewards/qatch_metrics/mean": 0.5060119867324829, + "rewards/qatch_metrics/std": 0.4102466404438019, + "rewards/tag_count_reward/mean": 0.9517578125, + "rewards/tag_count_reward/std": 0.16461062729358672, + "step": 330 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0015625, + "completions/max_length": 1304.4, + "completions/max_terminated_length": 579.8, + "completions/mean_length": 160.171875, + "completions/mean_terminated_length": 154.0051055908203, + "completions/min_length": 44.2, + "completions/min_terminated_length": 44.2, + "epoch": 0.5905685323931247, + "grad_norm": 0.935442580551547, + "kl": 0.1378662109375, + "learning_rate": 1e-06, + "loss": 0.0212, + "num_tokens": 33644622.0, + "reward": 0.5727396726608276, + "reward_std": 0.21638197600841522, + "rewards/format_reward/mean": 0.8984375, + "rewards/format_reward/std": 0.3004028916358948, + "rewards/qatch_metrics/mean": 0.5123223960399628, + "rewards/qatch_metrics/std": 0.43056052923202515, + "rewards/tag_count_reward/mean": 0.9484375, + "rewards/tag_count_reward/std": 0.15865270793437958, + "step": 335 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0015625, + "completions/max_length": 1336.4, + "completions/max_terminated_length": 600.0, + "completions/mean_length": 169.71640625, + "completions/mean_terminated_length": 163.57119750976562, + "completions/min_length": 29.2, + "completions/min_terminated_length": 29.2, + "epoch": 0.5993829881004848, + "grad_norm": 1.4959634071669228, + "kl": 0.1254638671875, + "learning_rate": 1e-06, + "loss": 0.0391, + "num_tokens": 34325763.0, + "reward": 0.5609373271465301, + "reward_std": 0.22974819540977479, + "rewards/format_reward/mean": 0.88046875, + "rewards/format_reward/std": 0.3236552834510803, + "rewards/qatch_metrics/mean": 0.5021367311477661, + "rewards/qatch_metrics/std": 0.4220038175582886, + "rewards/tag_count_reward/mean": 0.921484375, + "rewards/tag_count_reward/std": 0.21414475739002228, + "step": 340 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 2653.0, + "completions/max_terminated_length": 500.6, + "completions/mean_length": 174.47734375, + "completions/mean_terminated_length": 159.10546264648437, + "completions/min_length": 33.6, + "completions/min_terminated_length": 33.6, + "epoch": 0.6081974438078449, + "grad_norm": 0.8936813027459303, + "kl": 0.1334228515625, + "learning_rate": 1e-06, + "loss": 0.068, + "num_tokens": 35041510.0, + "reward": 0.5479878842830658, + "reward_std": 0.24380851686000823, + "rewards/format_reward/mean": 0.915625, + "rewards/format_reward/std": 0.2778396010398865, + "rewards/qatch_metrics/mean": 0.4811346590518951, + "rewards/qatch_metrics/std": 0.43007351756095885, + "rewards/tag_count_reward/mean": 0.94921875, + "rewards/tag_count_reward/std": 0.17109024226665498, + "step": 345 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 997.4, + "completions/max_terminated_length": 997.4, + "completions/mean_length": 170.0734375, + "completions/mean_terminated_length": 170.0734375, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "epoch": 0.617011899515205, + "grad_norm": 0.8929486085452816, + "kl": 0.119775390625, + "learning_rate": 1e-06, + "loss": 0.0198, + "num_tokens": 35733284.0, + "reward": 0.5295659184455872, + "reward_std": 0.2090097412467003, + "rewards/format_reward/mean": 0.909375, + "rewards/format_reward/std": 0.2840398609638214, + "rewards/qatch_metrics/mean": 0.46024298667907715, + "rewards/qatch_metrics/std": 0.4201698362827301, + "rewards/tag_count_reward/mean": 0.9484375, + "rewards/tag_count_reward/std": 0.171261465549469, + "step": 350 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0015625, + "completions/max_length": 2099.4, + "completions/max_terminated_length": 687.6, + "completions/mean_length": 177.484375, + "completions/mean_terminated_length": 171.34949951171876, + "completions/min_length": 25.4, + "completions/min_terminated_length": 25.4, + "epoch": 0.625826355222565, + "grad_norm": 0.971309519624497, + "kl": 0.11083984375, + "learning_rate": 1e-06, + "loss": 0.0143, + "num_tokens": 36425760.0, + "reward": 0.5487818002700806, + "reward_std": 0.23004478216171265, + "rewards/format_reward/mean": 0.83671875, + "rewards/format_reward/std": 0.3683928668498993, + "rewards/qatch_metrics/mean": 0.49449974298477173, + "rewards/qatch_metrics/std": 0.4279952645301819, + "rewards/tag_count_reward/mean": 0.895703125, + "rewards/tag_count_reward/std": 0.2428739696741104, + "step": 355 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0015625, + "completions/max_length": 1983.4, + "completions/max_terminated_length": 626.4, + "completions/mean_length": 189.1375, + "completions/mean_terminated_length": 183.03312072753906, + "completions/min_length": 46.6, + "completions/min_terminated_length": 46.6, + "epoch": 0.6346408109299251, + "grad_norm": 1.085714923968051, + "kl": 0.111279296875, + "learning_rate": 1e-06, + "loss": 0.0412, + "num_tokens": 37146560.0, + "reward": 0.5465562880039215, + "reward_std": 0.2106493055820465, + "rewards/format_reward/mean": 0.9015625, + "rewards/format_reward/std": 0.2943433105945587, + "rewards/qatch_metrics/mean": 0.4813575744628906, + "rewards/qatch_metrics/std": 0.41973625421524047, + "rewards/tag_count_reward/mean": 0.944921875, + "rewards/tag_count_reward/std": 0.1730790615081787, + "step": 360 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00078125, + "completions/max_length": 1243.6, + "completions/max_terminated_length": 550.0, + "completions/mean_length": 177.896875, + "completions/mean_terminated_length": 174.8417541503906, + "completions/min_length": 26.8, + "completions/min_terminated_length": 26.8, + "epoch": 0.6434552666372851, + "grad_norm": 0.9896824938718284, + "kl": 0.115478515625, + "learning_rate": 1e-06, + "loss": 0.041, + "num_tokens": 37832268.0, + "reward": 0.5586158275604248, + "reward_std": 0.2251075476408005, + "rewards/format_reward/mean": 0.87421875, + "rewards/format_reward/std": 0.32932343482971194, + "rewards/qatch_metrics/mean": 0.49995704293251036, + "rewards/qatch_metrics/std": 0.4248849630355835, + "rewards/tag_count_reward/mean": 0.924609375, + "rewards/tag_count_reward/std": 0.20914700627326965, + "step": 365 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00078125, + "completions/max_length": 1282.6, + "completions/max_terminated_length": 555.8, + "completions/mean_length": 170.38203125, + "completions/mean_terminated_length": 167.313427734375, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 0.6522697223446452, + "grad_norm": 0.9278166717820118, + "kl": 0.1187255859375, + "learning_rate": 1e-06, + "loss": 0.0193, + "num_tokens": 38555925.0, + "reward": 0.5743588328361511, + "reward_std": 0.20262247920036316, + "rewards/format_reward/mean": 0.8546875, + "rewards/format_reward/std": 0.3492723762989044, + "rewards/qatch_metrics/mean": 0.5217640638351441, + "rewards/qatch_metrics/std": 0.4110603451728821, + "rewards/tag_count_reward/mean": 0.9078125, + "rewards/tag_count_reward/std": 0.22944335341453553, + "step": 370 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 458.4, + "completions/max_terminated_length": 458.4, + "completions/mean_length": 171.5578125, + "completions/mean_terminated_length": 171.5578125, + "completions/min_length": 33.8, + "completions/min_terminated_length": 33.8, + "epoch": 0.6610841780520053, + "grad_norm": 0.9296512691094065, + "kl": 0.1155029296875, + "learning_rate": 1e-06, + "loss": 0.0006, + "num_tokens": 39234879.0, + "reward": 0.6218206763267518, + "reward_std": 0.20180206298828124, + "rewards/format_reward/mean": 0.90703125, + "rewards/format_reward/std": 0.29024410247802734, + "rewards/qatch_metrics/mean": 0.5695247530937195, + "rewards/qatch_metrics/std": 0.43560155630111697, + "rewards/tag_count_reward/mean": 0.9404296875, + "rewards/tag_count_reward/std": 0.1924948960542679, + "step": 375 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00078125, + "completions/max_length": 1532.6, + "completions/max_terminated_length": 808.2, + "completions/mean_length": 177.89375, + "completions/mean_terminated_length": 174.8284454345703, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 0.6698986337593653, + "grad_norm": 0.9179048520524131, + "kl": 0.1192138671875, + "learning_rate": 1e-06, + "loss": -0.0033, + "num_tokens": 39936711.0, + "reward": 0.548683899641037, + "reward_std": 0.20842809975147247, + "rewards/format_reward/mean": 0.8875, + "rewards/format_reward/std": 0.3159508228302002, + "rewards/qatch_metrics/mean": 0.48670990467071534, + "rewards/qatch_metrics/std": 0.42913843393325807, + "rewards/tag_count_reward/mean": 0.924609375, + "rewards/tag_count_reward/std": 0.2165108621120453, + "step": 380 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00078125, + "completions/max_length": 1210.4, + "completions/max_terminated_length": 476.8, + "completions/mean_length": 185.3, + "completions/mean_terminated_length": 182.22890930175782, + "completions/min_length": 37.8, + "completions/min_terminated_length": 37.8, + "epoch": 0.6787130894667255, + "grad_norm": 1.0609912938864583, + "kl": 0.1187255859375, + "learning_rate": 1e-06, + "loss": 0.0192, + "num_tokens": 40628007.0, + "reward": 0.5760585784912109, + "reward_std": 0.22054702043533325, + "rewards/format_reward/mean": 0.88984375, + "rewards/format_reward/std": 0.3102767616510391, + "rewards/qatch_metrics/mean": 0.5183065176010132, + "rewards/qatch_metrics/std": 0.4284651458263397, + "rewards/tag_count_reward/mean": 0.9302734375, + "rewards/tag_count_reward/std": 0.20196012556552886, + "step": 385 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00078125, + "completions/max_length": 1221.8, + "completions/max_terminated_length": 604.6, + "completions/mean_length": 170.103125, + "completions/mean_terminated_length": 167.0412139892578, + "completions/min_length": 28.4, + "completions/min_terminated_length": 28.4, + "epoch": 0.6875275451740855, + "grad_norm": 1.0349341835531938, + "kl": 0.1177978515625, + "learning_rate": 1e-06, + "loss": 0.0142, + "num_tokens": 41336699.0, + "reward": 0.5688169717788696, + "reward_std": 0.21753813624382018, + "rewards/format_reward/mean": 0.8546875, + "rewards/format_reward/std": 0.35243783593177797, + "rewards/qatch_metrics/mean": 0.5149684965610504, + "rewards/qatch_metrics/std": 0.4279025971889496, + "rewards/tag_count_reward/mean": 0.9125, + "rewards/tag_count_reward/std": 0.22199150621891023, + "step": 390 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0015625, + "completions/max_length": 1952.2, + "completions/max_terminated_length": 1016.6, + "completions/mean_length": 194.11171875, + "completions/mean_terminated_length": 188.0023986816406, + "completions/min_length": 36.4, + "completions/min_terminated_length": 36.4, + "epoch": 0.6963420008814456, + "grad_norm": 1.1198091080655637, + "kl": 0.1160400390625, + "learning_rate": 1e-06, + "loss": 0.0349, + "num_tokens": 42057866.0, + "reward": 0.5434407353401184, + "reward_std": 0.2424723982810974, + "rewards/format_reward/mean": 0.8625, + "rewards/format_reward/std": 0.3433255970478058, + "rewards/qatch_metrics/mean": 0.48393073081970217, + "rewards/qatch_metrics/std": 0.4233227550983429, + "rewards/tag_count_reward/mean": 0.9169921875, + "rewards/tag_count_reward/std": 0.21575720310211183, + "step": 395 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00078125, + "completions/max_length": 1565.8, + "completions/max_terminated_length": 859.6, + "completions/mean_length": 195.30625, + "completions/mean_terminated_length": 192.24614868164062, + "completions/min_length": 36.6, + "completions/min_terminated_length": 36.6, + "epoch": 0.7051564565888057, + "grad_norm": 0.9364233280028263, + "kl": 0.109765625, + "learning_rate": 1e-06, + "loss": -0.009, + "num_tokens": 42802098.0, + "reward": 0.532480639219284, + "reward_std": 0.20860818028450012, + "rewards/format_reward/mean": 0.8578125, + "rewards/format_reward/std": 0.3485052168369293, + "rewards/qatch_metrics/mean": 0.4721968710422516, + "rewards/qatch_metrics/std": 0.4088200509548187, + "rewards/tag_count_reward/mean": 0.906640625, + "rewards/tag_count_reward/std": 0.23517801761627197, + "step": 400 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00078125, + "completions/max_length": 1322.0, + "completions/max_terminated_length": 674.6, + "completions/mean_length": 191.78125, + "completions/mean_terminated_length": 188.7188934326172, + "completions/min_length": 31.8, + "completions/min_terminated_length": 31.8, + "epoch": 0.7139709122961657, + "grad_norm": 0.9184749770599845, + "kl": 0.109228515625, + "learning_rate": 1e-06, + "loss": 0.0074, + "num_tokens": 43511370.0, + "reward": 0.6411563873291015, + "reward_std": 0.206281441450119, + "rewards/format_reward/mean": 0.89609375, + "rewards/format_reward/std": 0.30500052571296693, + "rewards/qatch_metrics/mean": 0.5939271092414856, + "rewards/qatch_metrics/std": 0.4107288718223572, + "rewards/tag_count_reward/mean": 0.9341796875, + "rewards/tag_count_reward/std": 0.2010919064283371, + "step": 405 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0015625, + "completions/max_length": 2370.0, + "completions/max_terminated_length": 1083.0, + "completions/mean_length": 203.7921875, + "completions/mean_terminated_length": 197.71029968261718, + "completions/min_length": 41.8, + "completions/min_terminated_length": 41.8, + "epoch": 0.7227853680035258, + "grad_norm": 0.9498324772801405, + "kl": 0.112841796875, + "learning_rate": 1e-06, + "loss": 0.023, + "num_tokens": 44257808.0, + "reward": 0.6159097194671631, + "reward_std": 0.18089311718940734, + "rewards/format_reward/mean": 0.88828125, + "rewards/format_reward/std": 0.31010690331459045, + "rewards/qatch_metrics/mean": 0.5651557564735412, + "rewards/qatch_metrics/std": 0.407144832611084, + "rewards/tag_count_reward/mean": 0.933984375, + "rewards/tag_count_reward/std": 0.19542383253574372, + "step": 410 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.003125, + "completions/max_length": 1977.8, + "completions/max_terminated_length": 646.4, + "completions/mean_length": 204.78828125, + "completions/mean_terminated_length": 192.57474365234376, + "completions/min_length": 39.0, + "completions/min_terminated_length": 39.0, + "epoch": 0.7315998237108858, + "grad_norm": 0.8990363515461627, + "kl": 0.108154296875, + "learning_rate": 1e-06, + "loss": 0.0018, + "num_tokens": 44986913.0, + "reward": 0.5512337327003479, + "reward_std": 0.20042451322078705, + "rewards/format_reward/mean": 0.84296875, + "rewards/format_reward/std": 0.3632165014743805, + "rewards/qatch_metrics/mean": 0.4962239682674408, + "rewards/qatch_metrics/std": 0.4146161139011383, + "rewards/tag_count_reward/mean": 0.9029296875, + "rewards/tag_count_reward/std": 0.23338495790958405, + "step": 415 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00078125, + "completions/max_length": 1267.8, + "completions/max_terminated_length": 545.2, + "completions/mean_length": 207.953125, + "completions/mean_terminated_length": 204.91087341308594, + "completions/min_length": 45.0, + "completions/min_terminated_length": 45.0, + "epoch": 0.7404142794182459, + "grad_norm": 0.8949487847379094, + "kl": 0.1084228515625, + "learning_rate": 1e-06, + "loss": 0.0089, + "num_tokens": 45704165.0, + "reward": 0.5654355883598328, + "reward_std": 0.21594917476177217, + "rewards/format_reward/mean": 0.85703125, + "rewards/format_reward/std": 0.35001330375671386, + "rewards/qatch_metrics/mean": 0.5105997562408447, + "rewards/qatch_metrics/std": 0.42142562866210936, + "rewards/tag_count_reward/mean": 0.914453125, + "rewards/tag_count_reward/std": 0.22119783163070678, + "step": 420 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0015625, + "completions/max_length": 1989.4, + "completions/max_terminated_length": 602.8, + "completions/mean_length": 194.7515625, + "completions/mean_terminated_length": 188.6415222167969, + "completions/min_length": 25.6, + "completions/min_terminated_length": 25.6, + "epoch": 0.749228735125606, + "grad_norm": 0.8443386346612493, + "kl": 0.11494140625, + "learning_rate": 1e-06, + "loss": 0.0033, + "num_tokens": 46430055.0, + "reward": 0.5958282589912415, + "reward_std": 0.19520920515060425, + "rewards/format_reward/mean": 0.87109375, + "rewards/format_reward/std": 0.3312843978404999, + "rewards/qatch_metrics/mean": 0.544264841079712, + "rewards/qatch_metrics/std": 0.4191899299621582, + "rewards/tag_count_reward/mean": 0.921875, + "rewards/tag_count_reward/std": 0.21222967505455018, + "step": 425 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.003125, + "completions/max_length": 2666.0, + "completions/max_terminated_length": 567.2, + "completions/mean_length": 208.08203125, + "completions/mean_terminated_length": 195.87457275390625, + "completions/min_length": 32.6, + "completions/min_terminated_length": 32.6, + "epoch": 0.7580431908329661, + "grad_norm": 0.9046123034832724, + "kl": 0.1042724609375, + "learning_rate": 1e-06, + "loss": 0.0387, + "num_tokens": 47156176.0, + "reward": 0.6103429317474365, + "reward_std": 0.22614607214927673, + "rewards/format_reward/mean": 0.86640625, + "rewards/format_reward/std": 0.3397656261920929, + "rewards/qatch_metrics/mean": 0.5620072841644287, + "rewards/qatch_metrics/std": 0.4072328984737396, + "rewards/tag_count_reward/mean": 0.919921875, + "rewards/tag_count_reward/std": 0.21127038300037385, + "step": 430 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00078125, + "completions/max_length": 1371.4, + "completions/max_terminated_length": 650.8, + "completions/mean_length": 200.70390625, + "completions/mean_terminated_length": 197.6367401123047, + "completions/min_length": 30.8, + "completions/min_terminated_length": 30.8, + "epoch": 0.7668576465403262, + "grad_norm": 0.8934328290702316, + "kl": 0.11044921875, + "learning_rate": 1e-06, + "loss": -0.0073, + "num_tokens": 47893749.0, + "reward": 0.526023668050766, + "reward_std": 0.22173346281051637, + "rewards/format_reward/mean": 0.89765625, + "rewards/format_reward/std": 0.3027026534080505, + "rewards/qatch_metrics/mean": 0.457994270324707, + "rewards/qatch_metrics/std": 0.4191239416599274, + "rewards/tag_count_reward/mean": 0.9392578125, + "rewards/tag_count_reward/std": 0.19010738730430604, + "step": 435 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0015625, + "completions/max_length": 1307.0, + "completions/max_terminated_length": 619.6, + "completions/mean_length": 213.5671875, + "completions/mean_terminated_length": 207.5018341064453, + "completions/min_length": 33.6, + "completions/min_terminated_length": 33.6, + "epoch": 0.7756721022476862, + "grad_norm": 0.7709350601632311, + "kl": 0.1083984375, + "learning_rate": 1e-06, + "loss": 0.0201, + "num_tokens": 48637627.0, + "reward": 0.5134056210517883, + "reward_std": 0.19459065198898315, + "rewards/format_reward/mean": 0.8875, + "rewards/format_reward/std": 0.31534498929977417, + "rewards/qatch_metrics/mean": 0.44468907117843626, + "rewards/qatch_metrics/std": 0.3993754625320435, + "rewards/tag_count_reward/mean": 0.9333984375, + "rewards/tag_count_reward/std": 0.19436517655849456, + "step": 440 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00078125, + "completions/max_length": 1255.0, + "completions/max_terminated_length": 573.8, + "completions/mean_length": 213.83359375, + "completions/mean_terminated_length": 210.8156005859375, + "completions/min_length": 29.8, + "completions/min_terminated_length": 29.8, + "epoch": 0.7844865579550463, + "grad_norm": 0.9730549470078724, + "kl": 0.11259765625, + "learning_rate": 1e-06, + "loss": 0.0194, + "num_tokens": 49400534.0, + "reward": 0.5392698287963867, + "reward_std": 0.21343457698822021, + "rewards/format_reward/mean": 0.89375, + "rewards/format_reward/std": 0.30757365822792054, + "rewards/qatch_metrics/mean": 0.4741064965724945, + "rewards/qatch_metrics/std": 0.42416965365409853, + "rewards/tag_count_reward/mean": 0.9380859375, + "rewards/tag_count_reward/std": 0.18792852461338044, + "step": 445 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 657.8, + "completions/max_terminated_length": 657.8, + "completions/mean_length": 191.94375, + "completions/mean_terminated_length": 191.94375, + "completions/min_length": 28.4, + "completions/min_terminated_length": 28.4, + "epoch": 0.7933010136624064, + "grad_norm": 0.8068324955270447, + "kl": 0.12177734375, + "learning_rate": 1e-06, + "loss": -0.007, + "num_tokens": 50124014.0, + "reward": 0.5979775786399841, + "reward_std": 0.18400471210479735, + "rewards/format_reward/mean": 0.88125, + "rewards/format_reward/std": 0.32406928539276125, + "rewards/qatch_metrics/mean": 0.545139092206955, + "rewards/qatch_metrics/std": 0.40265028476715087, + "rewards/tag_count_reward/mean": 0.9296875, + "rewards/tag_count_reward/std": 0.19919731020927428, + "step": 450 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00078125, + "completions/max_length": 1665.4, + "completions/max_terminated_length": 989.6, + "completions/mean_length": 191.44140625, + "completions/mean_terminated_length": 188.39669494628907, + "completions/min_length": 26.8, + "completions/min_terminated_length": 26.8, + "epoch": 0.8021154693697664, + "grad_norm": 0.9661639043459677, + "kl": 0.120654296875, + "learning_rate": 1e-06, + "loss": 0.0026, + "num_tokens": 50856579.0, + "reward": 0.5794390618801117, + "reward_std": 0.2076917439699173, + "rewards/format_reward/mean": 0.91953125, + "rewards/format_reward/std": 0.27112471759319307, + "rewards/qatch_metrics/mean": 0.5176190257072448, + "rewards/qatch_metrics/std": 0.41045997142791746, + "rewards/tag_count_reward/mean": 0.9501953125, + "rewards/tag_count_reward/std": 0.17092148661613465, + "step": 455 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 578.6, + "completions/max_terminated_length": 578.6, + "completions/mean_length": 185.61171875, + "completions/mean_terminated_length": 185.61171875, + "completions/min_length": 31.8, + "completions/min_terminated_length": 31.8, + "epoch": 0.8109299250771265, + "grad_norm": 0.859237972466753, + "kl": 0.1249755859375, + "learning_rate": 1e-06, + "loss": -0.0045, + "num_tokens": 51568034.0, + "reward": 0.5465836644172668, + "reward_std": 0.17585654258728028, + "rewards/format_reward/mean": 0.93515625, + "rewards/format_reward/std": 0.24650255739688873, + "rewards/qatch_metrics/mean": 0.4766333520412445, + "rewards/qatch_metrics/std": 0.3998740196228027, + "rewards/tag_count_reward/mean": 0.95859375, + "rewards/tag_count_reward/std": 0.15964243412017823, + "step": 460 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00078125, + "completions/max_length": 1242.8, + "completions/max_terminated_length": 579.6, + "completions/mean_length": 193.2140625, + "completions/mean_terminated_length": 190.17308959960937, + "completions/min_length": 37.8, + "completions/min_terminated_length": 37.8, + "epoch": 0.8197443807844865, + "grad_norm": 0.7384329915764564, + "kl": 0.112744140625, + "learning_rate": 1e-06, + "loss": -0.0108, + "num_tokens": 52261716.0, + "reward": 0.6089231491088867, + "reward_std": 0.18263671100139617, + "rewards/format_reward/mean": 0.92421875, + "rewards/format_reward/std": 0.26459681391716006, + "rewards/qatch_metrics/mean": 0.5516513049602508, + "rewards/qatch_metrics/std": 0.4096936106681824, + "rewards/tag_count_reward/mean": 0.951953125, + "rewards/tag_count_reward/std": 0.17155620753765105, + "step": 465 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 591.8, + "completions/max_terminated_length": 591.8, + "completions/mean_length": 196.45390625, + "completions/mean_terminated_length": 196.45390625, + "completions/min_length": 31.2, + "completions/min_terminated_length": 31.2, + "epoch": 0.8285588364918466, + "grad_norm": 1.0025973293270143, + "kl": 0.1135009765625, + "learning_rate": 1e-06, + "loss": 0.0, + "num_tokens": 52953113.0, + "reward": 0.6507824778556823, + "reward_std": 0.20129505693912506, + "rewards/format_reward/mean": 0.91484375, + "rewards/format_reward/std": 0.27930967807769774, + "rewards/qatch_metrics/mean": 0.6021958470344544, + "rewards/qatch_metrics/std": 0.3972749710083008, + "rewards/tag_count_reward/mean": 0.9486328125, + "rewards/tag_count_reward/std": 0.1716614156961441, + "step": 470 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 638.0, + "completions/max_terminated_length": 638.0, + "completions/mean_length": 213.11171875, + "completions/mean_terminated_length": 213.11171875, + "completions/min_length": 34.6, + "completions/min_terminated_length": 34.6, + "epoch": 0.8373732921992068, + "grad_norm": 0.8516678257406487, + "kl": 0.108935546875, + "learning_rate": 1e-06, + "loss": 0.0161, + "num_tokens": 53659128.0, + "reward": 0.5703884243965149, + "reward_std": 0.20974204540252686, + "rewards/format_reward/mean": 0.9171875, + "rewards/format_reward/std": 0.27448596358299254, + "rewards/qatch_metrics/mean": 0.5072354257106781, + "rewards/qatch_metrics/std": 0.42126131653785703, + "rewards/tag_count_reward/mean": 0.950390625, + "rewards/tag_count_reward/std": 0.16825708746910095, + "step": 475 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0015625, + "completions/max_length": 1347.6, + "completions/max_terminated_length": 668.8, + "completions/mean_length": 221.3375, + "completions/mean_terminated_length": 215.31790161132812, + "completions/min_length": 35.8, + "completions/min_terminated_length": 35.8, + "epoch": 0.8461877479065668, + "grad_norm": 0.8676387330449279, + "kl": 0.1122314453125, + "learning_rate": 1e-06, + "loss": -0.0145, + "num_tokens": 54425080.0, + "reward": 0.5916900038719177, + "reward_std": 0.20206353664398194, + "rewards/format_reward/mean": 0.88203125, + "rewards/format_reward/std": 0.3205987274646759, + "rewards/qatch_metrics/mean": 0.5378453254699707, + "rewards/qatch_metrics/std": 0.41082814931869505, + "rewards/tag_count_reward/mean": 0.9263671875, + "rewards/tag_count_reward/std": 0.20377787947654724, + "step": 480 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00078125, + "completions/max_length": 1610.6, + "completions/max_terminated_length": 923.2, + "completions/mean_length": 224.6875, + "completions/mean_terminated_length": 221.65450744628907, + "completions/min_length": 27.4, + "completions/min_terminated_length": 27.4, + "epoch": 0.8550022036139269, + "grad_norm": 0.7904555962961125, + "kl": 0.109765625, + "learning_rate": 1e-06, + "loss": 0.0017, + "num_tokens": 55227528.0, + "reward": 0.5810864806175232, + "reward_std": 0.2380138784646988, + "rewards/format_reward/mean": 0.853125, + "rewards/format_reward/std": 0.3541332304477692, + "rewards/qatch_metrics/mean": 0.5300695478916169, + "rewards/qatch_metrics/std": 0.4341892719268799, + "rewards/tag_count_reward/mean": 0.904296875, + "rewards/tag_count_reward/std": 0.23573453426361085, + "step": 485 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00234375, + "completions/max_length": 1990.4, + "completions/max_terminated_length": 623.2, + "completions/mean_length": 218.89296875, + "completions/mean_terminated_length": 209.80899353027343, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 0.8638166593212869, + "grad_norm": 0.8342126282923776, + "kl": 0.106396484375, + "learning_rate": 1e-06, + "loss": 0.0082, + "num_tokens": 55974703.0, + "reward": 0.5714076519012451, + "reward_std": 0.20491171181201934, + "rewards/format_reward/mean": 0.8515625, + "rewards/format_reward/std": 0.35569257140159605, + "rewards/qatch_metrics/mean": 0.5185333371162415, + "rewards/qatch_metrics/std": 0.41171206831932067, + "rewards/tag_count_reward/mean": 0.9099609375, + "rewards/tag_count_reward/std": 0.22593727111816406, + "step": 490 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0015625, + "completions/max_length": 1373.0, + "completions/max_terminated_length": 707.4, + "completions/mean_length": 209.70390625, + "completions/mean_terminated_length": 203.64548950195314, + "completions/min_length": 37.2, + "completions/min_terminated_length": 37.2, + "epoch": 0.872631115028647, + "grad_norm": 0.8070043867292849, + "kl": 0.1049560546875, + "learning_rate": 1e-06, + "loss": -0.0094, + "num_tokens": 56709412.0, + "reward": 0.6168586254119873, + "reward_std": 0.20435989499092103, + "rewards/format_reward/mean": 0.85546875, + "rewards/format_reward/std": 0.34868985414505005, + "rewards/qatch_metrics/mean": 0.5714651107788086, + "rewards/qatch_metrics/std": 0.42900125980377196, + "rewards/tag_count_reward/mean": 0.911328125, + "rewards/tag_count_reward/std": 0.22259356081485748, + "step": 495 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00078125, + "completions/max_length": 1373.4, + "completions/max_terminated_length": 661.4, + "completions/mean_length": 218.89609375, + "completions/mean_terminated_length": 215.85523681640626, + "completions/min_length": 38.6, + "completions/min_terminated_length": 38.6, + "epoch": 0.881445570736007, + "grad_norm": 0.8578105605653167, + "kl": 0.1009765625, + "learning_rate": 1e-06, + "loss": -0.0007, + "num_tokens": 57464591.0, + "reward": 0.5597202479839325, + "reward_std": 0.22529322803020477, + "rewards/format_reward/mean": 0.865625, + "rewards/format_reward/std": 0.3411052882671356, + "rewards/qatch_metrics/mean": 0.5026809990406036, + "rewards/qatch_metrics/std": 0.4211664915084839, + "rewards/tag_count_reward/mean": 0.917578125, + "rewards/tag_count_reward/std": 0.21666719317436217, + "step": 500 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0015625, + "completions/max_length": 1334.0, + "completions/max_terminated_length": 611.0, + "completions/mean_length": 209.959375, + "completions/mean_terminated_length": 203.84853515625, + "completions/min_length": 26.2, + "completions/min_terminated_length": 26.2, + "epoch": 0.8902600264433671, + "grad_norm": 0.8021142913277886, + "kl": 0.108203125, + "learning_rate": 1e-06, + "loss": -0.0275, + "num_tokens": 58213131.0, + "reward": 0.5699510633945465, + "reward_std": 0.2101448118686676, + "rewards/format_reward/mean": 0.82578125, + "rewards/format_reward/std": 0.3783671915531158, + "rewards/qatch_metrics/mean": 0.5208638191223145, + "rewards/qatch_metrics/std": 0.41592952609062195, + "rewards/tag_count_reward/mean": 0.8927734375, + "rewards/tag_count_reward/std": 0.2440448522567749, + "step": 505 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 2336.6, + "completions/max_terminated_length": 925.2, + "completions/mean_length": 245.92890625, + "completions/mean_terminated_length": 230.83008422851563, + "completions/min_length": 29.8, + "completions/min_terminated_length": 29.8, + "epoch": 0.8990744821507272, + "grad_norm": 0.7883812832224967, + "kl": 0.100732421875, + "learning_rate": 1e-06, + "loss": 0.0226, + "num_tokens": 59018080.0, + "reward": 0.5421915054321289, + "reward_std": 0.21591668128967284, + "rewards/format_reward/mean": 0.80546875, + "rewards/format_reward/std": 0.3959254801273346, + "rewards/qatch_metrics/mean": 0.49160627126693723, + "rewards/qatch_metrics/std": 0.41863099932670594, + "rewards/tag_count_reward/mean": 0.8755859375, + "rewards/tag_count_reward/std": 0.2644981533288956, + "step": 510 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 618.4, + "completions/max_terminated_length": 618.4, + "completions/mean_length": 208.48984375, + "completions/mean_terminated_length": 208.48984375, + "completions/min_length": 34.8, + "completions/min_terminated_length": 34.8, + "epoch": 0.9078889378580872, + "grad_norm": 0.903154309567232, + "kl": 0.111083984375, + "learning_rate": 1e-06, + "loss": -0.0072, + "num_tokens": 59739139.0, + "reward": 0.632229495048523, + "reward_std": 0.19765791296958923, + "rewards/format_reward/mean": 0.83125, + "rewards/format_reward/std": 0.368955659866333, + "rewards/qatch_metrics/mean": 0.593443238735199, + "rewards/qatch_metrics/std": 0.4310678899288177, + "rewards/tag_count_reward/mean": 0.8935546875, + "rewards/tag_count_reward/std": 0.23964128494262696, + "step": 515 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00078125, + "completions/max_length": 1252.4, + "completions/max_terminated_length": 611.6, + "completions/mean_length": 212.459375, + "completions/mean_terminated_length": 209.4299072265625, + "completions/min_length": 33.4, + "completions/min_terminated_length": 33.4, + "epoch": 0.9167033935654474, + "grad_norm": 0.8346843956683994, + "kl": 0.1077392578125, + "learning_rate": 1e-06, + "loss": 0.0163, + "num_tokens": 60466559.0, + "reward": 0.5584656774997712, + "reward_std": 0.23274661898612975, + "rewards/format_reward/mean": 0.8796875, + "rewards/format_reward/std": 0.32524962425231935, + "rewards/qatch_metrics/mean": 0.4991369664669037, + "rewards/qatch_metrics/std": 0.41381397247314455, + "rewards/tag_count_reward/mean": 0.924609375, + "rewards/tag_count_reward/std": 0.21089179813861847, + "step": 520 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00078125, + "completions/max_length": 1248.4, + "completions/max_terminated_length": 638.4, + "completions/mean_length": 210.87109375, + "completions/mean_terminated_length": 207.83172912597655, + "completions/min_length": 28.4, + "completions/min_terminated_length": 28.4, + "epoch": 0.9255178492728074, + "grad_norm": 0.9740281006839744, + "kl": 0.1169189453125, + "learning_rate": 1e-06, + "loss": 0.0021, + "num_tokens": 61228986.0, + "reward": 0.5737495183944702, + "reward_std": 0.221232670545578, + "rewards/format_reward/mean": 0.8328125, + "rewards/format_reward/std": 0.3710750341415405, + "rewards/qatch_metrics/mean": 0.5245283961296081, + "rewards/qatch_metrics/std": 0.42740072011947633, + "rewards/tag_count_reward/mean": 0.8923828125, + "rewards/tag_count_reward/std": 0.24619007110595703, + "step": 525 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00234375, + "completions/max_length": 2158.4, + "completions/max_terminated_length": 753.4, + "completions/mean_length": 201.10390625, + "completions/mean_terminated_length": 191.97602233886718, + "completions/min_length": 21.4, + "completions/min_terminated_length": 21.4, + "epoch": 0.9343323049801675, + "grad_norm": 0.8087287498541261, + "kl": 0.1182373046875, + "learning_rate": 1e-06, + "loss": -0.0014, + "num_tokens": 61969759.0, + "reward": 0.5955200791358948, + "reward_std": 0.2014760673046112, + "rewards/format_reward/mean": 0.84140625, + "rewards/format_reward/std": 0.35906914472579954, + "rewards/qatch_metrics/mean": 0.5486127734184265, + "rewards/qatch_metrics/std": 0.39445692896842954, + "rewards/tag_count_reward/mean": 0.901171875, + "rewards/tag_count_reward/std": 0.23216934502124786, + "step": 530 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0015625, + "completions/max_length": 1311.6, + "completions/max_terminated_length": 636.6, + "completions/mean_length": 200.63671875, + "completions/mean_terminated_length": 194.58396911621094, + "completions/min_length": 31.6, + "completions/min_terminated_length": 31.6, + "epoch": 0.9431467606875276, + "grad_norm": 1.0244359727988313, + "kl": 0.1111083984375, + "learning_rate": 1e-06, + "loss": -0.0009, + "num_tokens": 62673742.0, + "reward": 0.5657651543617248, + "reward_std": 0.18060422837734222, + "rewards/format_reward/mean": 0.88671875, + "rewards/format_reward/std": 0.3158248126506805, + "rewards/qatch_metrics/mean": 0.5065067887306214, + "rewards/qatch_metrics/std": 0.3941995918750763, + "rewards/tag_count_reward/mean": 0.93125, + "rewards/tag_count_reward/std": 0.1997167259454727, + "step": 535 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00234375, + "completions/max_length": 1975.2, + "completions/max_terminated_length": 528.8, + "completions/mean_length": 192.11484375, + "completions/mean_terminated_length": 182.94750061035157, + "completions/min_length": 21.8, + "completions/min_terminated_length": 21.8, + "epoch": 0.9519612163948876, + "grad_norm": 0.9450830973150219, + "kl": 0.106689453125, + "learning_rate": 1e-06, + "loss": 0.0018, + "num_tokens": 63388433.0, + "reward": 0.5951115846633911, + "reward_std": 0.19603927731513976, + "rewards/format_reward/mean": 0.859375, + "rewards/format_reward/std": 0.34651567935943606, + "rewards/qatch_metrics/mean": 0.5458343744277954, + "rewards/qatch_metrics/std": 0.4278919756412506, + "rewards/tag_count_reward/mean": 0.904296875, + "rewards/tag_count_reward/std": 0.24298664927482605, + "step": 540 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00234375, + "completions/max_length": 1540.4, + "completions/max_terminated_length": 1090.8, + "completions/mean_length": 190.159375, + "completions/mean_terminated_length": 181.03360290527343, + "completions/min_length": 35.8, + "completions/min_terminated_length": 35.8, + "epoch": 0.9607756721022477, + "grad_norm": 0.9291286887400723, + "kl": 0.1108642578125, + "learning_rate": 1e-06, + "loss": 0.0375, + "num_tokens": 64092413.0, + "reward": 0.6795460700988769, + "reward_std": 0.20811468064785005, + "rewards/format_reward/mean": 0.91328125, + "rewards/format_reward/std": 0.280303093791008, + "rewards/qatch_metrics/mean": 0.6362651109695434, + "rewards/qatch_metrics/std": 0.4117369055747986, + "rewards/tag_count_reward/mean": 0.9478515625, + "rewards/tag_count_reward/std": 0.18190329372882844, + "step": 545 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 560.4, + "completions/max_terminated_length": 560.4, + "completions/mean_length": 181.5375, + "completions/mean_terminated_length": 181.5375, + "completions/min_length": 35.6, + "completions/min_terminated_length": 35.6, + "epoch": 0.9695901278096077, + "grad_norm": 0.9490785592275803, + "kl": 0.1166015625, + "learning_rate": 1e-06, + "loss": -0.0105, + "num_tokens": 64759389.0, + "reward": 0.577846372127533, + "reward_std": 0.19823800325393676, + "rewards/format_reward/mean": 0.91328125, + "rewards/format_reward/std": 0.28074146509170533, + "rewards/qatch_metrics/mean": 0.5166414439678192, + "rewards/qatch_metrics/std": 0.4310955286026001, + "rewards/tag_count_reward/mean": 0.9474609375, + "rewards/tag_count_reward/std": 0.1789928376674652, + "step": 550 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0015625, + "completions/max_length": 2067.2, + "completions/max_terminated_length": 755.6, + "completions/mean_length": 197.115625, + "completions/mean_terminated_length": 191.02464904785157, + "completions/min_length": 39.0, + "completions/min_terminated_length": 39.0, + "epoch": 0.9784045835169678, + "grad_norm": 0.90442977548872, + "kl": 0.108544921875, + "learning_rate": 1e-06, + "loss": 0.0203, + "num_tokens": 65477873.0, + "reward": 0.5961843609809876, + "reward_std": 0.20585475862026215, + "rewards/format_reward/mean": 0.92421875, + "rewards/format_reward/std": 0.2640294134616852, + "rewards/qatch_metrics/mean": 0.5363083481788635, + "rewards/qatch_metrics/std": 0.41726168990135193, + "rewards/tag_count_reward/mean": 0.9580078125, + "rewards/tag_count_reward/std": 0.1562621772289276, + "step": 555 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.003125, + "completions/max_length": 3376.8, + "completions/max_terminated_length": 587.8, + "completions/mean_length": 195.74296875, + "completions/mean_terminated_length": 183.5094757080078, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "epoch": 0.9872190392243279, + "grad_norm": 0.9502895043232452, + "kl": 0.112939453125, + "learning_rate": 1e-06, + "loss": 0.0127, + "num_tokens": 66214488.0, + "reward": 0.6302931666374206, + "reward_std": 0.21948930323123933, + "rewards/format_reward/mean": 0.88515625, + "rewards/format_reward/std": 0.31716270446777345, + "rewards/qatch_metrics/mean": 0.5827322959899902, + "rewards/qatch_metrics/std": 0.4163429081439972, + "rewards/tag_count_reward/mean": 0.9291015625, + "rewards/tag_count_reward/std": 0.20648659765720367, + "step": 560 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.003125, + "completions/max_length": 3388.8, + "completions/max_terminated_length": 552.8, + "completions/mean_length": 203.7375, + "completions/mean_terminated_length": 191.53135375976564, + "completions/min_length": 29.8, + "completions/min_terminated_length": 29.8, + "epoch": 0.996033494931688, + "grad_norm": 1.0451866474159504, + "kl": 0.110400390625, + "learning_rate": 1e-06, + "loss": 0.0071, + "num_tokens": 66948152.0, + "reward": 0.5372519016265869, + "reward_std": 0.20013673603534698, + "rewards/format_reward/mean": 0.8640625, + "rewards/format_reward/std": 0.34239274859428404, + "rewards/qatch_metrics/mean": 0.4767416715621948, + "rewards/qatch_metrics/std": 0.3842666923999786, + "rewards/tag_count_reward/mean": 0.9123046875, + "rewards/tag_count_reward/std": 0.23215168714523315, + "step": 565 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 2463.5, + "completions/max_terminated_length": 763.0, + "completions/mean_length": 191.5, + "completions/mean_terminated_length": 183.86861419677734, + "completions/min_length": 30.5, + "completions/min_terminated_length": 30.5, + "epoch": 0.999559277214632, + "kl": 0.11181640625, + "num_tokens": 67212456.0, + "reward": 0.6837565302848816, + "reward_std": 0.19374996423721313, + "rewards/format_reward/mean": 0.8828125, + "rewards/format_reward/std": 0.32204362750053406, + "rewards/qatch_metrics/mean": 0.6466471254825592, + "rewards/qatch_metrics/std": 0.3814842849969864, + "rewards/tag_count_reward/mean": 0.91650390625, + "rewards/tag_count_reward/std": 0.2326364442706108, + "step": 567, + "total_flos": 0.0, + "train_loss": 0.0005930395028184331, + "train_runtime": 32371.3135, + "train_samples_per_second": 0.28, + "train_steps_per_second": 0.018 + } + ], + "logging_steps": 5, + "max_steps": 567, + "num_input_tokens_seen": 67212456, + "num_train_epochs": 1, + "save_steps": 5, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +}