{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.999559277214632, "eval_steps": 500, "global_step": 567, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio": 0.0, "clipped_completions_ratio": 0.0078125, "epoch": 0.0017628911414720142, "grad_norm": 3.0840760424820304, "kl": 0.0, "learning_rate": 1.7543859649122805e-08, "loss": -0.0042, "max_completion_length": 464.0, "max_terminated_completion_length": 459.75, "mean_completion_length": 120.04296875, "mean_terminated_completion_length": 118.23617553710938, "min_completion_length": 21.0, "min_terminated_completion_length": 21.0, "num_tokens": 115211.0, "reward": 0.25845247507095337, "reward_std": 0.24694325402379036, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/qatch_metrics/mean": 0.2897005267441273, "rewards/qatch_metrics/std": 0.37457581236958504, "rewards/tag_count_reward/mean": 0.244140625, "rewards/tag_count_reward/std": 0.13581550493836403, "step": 1 }, { "clip_ratio": 0.0, "clipped_completions_ratio": 0.0107421875, "epoch": 0.00881445570736007, "grad_norm": 2.803414301509709, "kl": 0.00023472309112548828, "learning_rate": 8.771929824561403e-08, "loss": 0.0299, "max_completion_length": 2148.625, "max_terminated_completion_length": 560.25, "mean_completion_length": 164.86328125, "mean_terminated_completion_length": 122.24907398223877, "min_completion_length": 23.3125, "min_terminated_completion_length": 23.3125, "num_tokens": 658751.0, "reward": 0.14076079020742327, "reward_std": 0.17357240640558302, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/qatch_metrics/mean": 0.15179980779066682, "rewards/qatch_metrics/std": 0.285268085077405, "rewards/tag_count_reward/mean": 0.234619140625, "rewards/tag_count_reward/std": 0.11686475621536374, "step": 5 }, { "clip_ratio": 0.0, "clipped_completions_ratio": 0.01015625, "epoch": 0.01762891141472014, "grad_norm": 2.555865364836083, "kl": 0.00034499168395996094, "learning_rate": 1.7543859649122805e-07, "loss": 0.0691, "max_completion_length": 1847.5, "max_terminated_completion_length": 736.65, "mean_completion_length": 154.3125, "mean_terminated_completion_length": 130.3932632446289, "min_completion_length": 21.3, "min_terminated_completion_length": 21.3, "num_tokens": 1347167.0, "reward": 0.12399922087788581, "reward_std": 0.16416746266186238, "rewards/format_reward/mean": 0.00078125, "rewards/format_reward/std": 0.00625, "rewards/qatch_metrics/mean": 0.13194531546905636, "rewards/qatch_metrics/std": 0.2816271550953388, "rewards/tag_count_reward/mean": 0.2353515625, "rewards/tag_count_reward/std": 0.11751417592167854, "step": 10 }, { "clip_ratio": 0.0, "clipped_completions_ratio": 0.01328125, "epoch": 0.026443367122080213, "grad_norm": 1.8510134641989897, "kl": 0.00035467147827148435, "learning_rate": 2.631578947368421e-07, "loss": 0.1161, "max_completion_length": 1736.25, "max_terminated_completion_length": 725.25, "mean_completion_length": 166.71015625, "mean_terminated_completion_length": 125.76015815734863, "min_completion_length": 22.4, "min_terminated_completion_length": 22.4, "num_tokens": 1999516.0, "reward": 0.13328830637037753, "reward_std": 0.18279453851282595, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/qatch_metrics/mean": 0.1429885433986783, "rewards/qatch_metrics/std": 0.2840505912899971, "rewards/tag_count_reward/mean": 0.2349609375, "rewards/tag_count_reward/std": 0.12176873050630092, "step": 15 }, { "clip_ratio": 0.0, "clipped_completions_ratio": 0.0109375, "epoch": 0.03525782282944028, "grad_norm": 2.5276083322305163, "kl": 0.0008690834045410156, "learning_rate": 3.508771929824561e-07, "loss": 0.0909, "max_completion_length": 1950.25, "max_terminated_completion_length": 694.35, "mean_completion_length": 157.1859375, "mean_terminated_completion_length": 122.76734085083008, "min_completion_length": 22.95, "min_terminated_completion_length": 22.95, "num_tokens": 2662298.0, "reward": 0.19042691607028245, "reward_std": 0.19785099737346173, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/qatch_metrics/mean": 0.20991172092035412, "rewards/qatch_metrics/std": 0.32454456612467764, "rewards/tag_count_reward/mean": 0.2400390625, "rewards/tag_count_reward/std": 0.11891286894679069, "step": 20 }, { "clip_ratio": 0.0, "clipped_completions_ratio": 0.00390625, "epoch": 0.044072278536800354, "grad_norm": 1.6421641387454096, "kl": 0.002597618103027344, "learning_rate": 4.3859649122807013e-07, "loss": 0.0598, "max_completion_length": 1248.55, "max_terminated_completion_length": 525.75, "mean_completion_length": 117.29296875, "mean_terminated_completion_length": 104.7642993927002, "min_completion_length": 21.5, "min_terminated_completion_length": 21.5, "num_tokens": 3274689.0, "reward": 0.19326679892838, "reward_std": 0.19503218345344067, "rewards/format_reward/mean": 0.00234375, "rewards/format_reward/std": 0.01875, "rewards/qatch_metrics/mean": 0.21241406723856926, "rewards/qatch_metrics/std": 0.3405905418097973, "rewards/tag_count_reward/mean": 0.249609375, "rewards/tag_count_reward/std": 0.11380729898810386, "step": 25 }, { "clip_ratio": 0.0, "clipped_completions_ratio": 0.00546875, "epoch": 0.052886734244160426, "grad_norm": 2.327163237963173, "kl": 0.004604721069335937, "learning_rate": 5.263157894736842e-07, "loss": -0.0155, "max_completion_length": 1373.3, "max_terminated_completion_length": 461.75, "mean_completion_length": 126.6109375, "mean_terminated_completion_length": 110.83255004882812, "min_completion_length": 21.45, "min_terminated_completion_length": 21.45, "num_tokens": 3903935.0, "reward": 0.235529076308012, "reward_std": 0.19208679497241973, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/qatch_metrics/mean": 0.26164037082344294, "rewards/qatch_metrics/std": 0.35552939809858797, "rewards/tag_count_reward/mean": 0.2626953125, "rewards/tag_count_reward/std": 0.12861518152058124, "step": 30 }, { "clip_ratio": 0.0, "clipped_completions_ratio": 0.003125, "epoch": 0.06170118995152049, "grad_norm": 1.9030107685120632, "kl": 0.006869125366210938, "learning_rate": 6.140350877192982e-07, "loss": -0.0291, "max_completion_length": 637.45, "max_terminated_completion_length": 475.05, "mean_completion_length": 110.1609375, "mean_terminated_completion_length": 106.66331939697265, "min_completion_length": 19.1, "min_terminated_completion_length": 19.1, "num_tokens": 4489885.0, "reward": 0.23229851759970188, "reward_std": 0.21139583457261324, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/qatch_metrics/mean": 0.2574031319469213, "rewards/qatch_metrics/std": 0.3460362754762173, "rewards/tag_count_reward/mean": 0.2701171875, "rewards/tag_count_reward/std": 0.12943687103688717, "step": 35 }, { "clip_ratio": 0.0, "clipped_completions_ratio": 0.0015625, "epoch": 0.07051564565888056, "grad_norm": 1.3133674580676309, "kl": 0.006020355224609375, "learning_rate": 7.017543859649122e-07, "loss": -0.0528, "max_completion_length": 738.35, "max_terminated_completion_length": 558.2, "mean_completion_length": 145.91328125, "mean_terminated_completion_length": 142.51878776550294, "min_completion_length": 21.1, "min_terminated_completion_length": 21.1, "num_tokens": 5151902.0, "reward": 0.22980495262891054, "reward_std": 0.18255550526082515, "rewards/format_reward/mean": 0.00078125, "rewards/format_reward/std": 0.00625, "rewards/qatch_metrics/mean": 0.25320573393255474, "rewards/qatch_metrics/std": 0.3738796763122082, "rewards/tag_count_reward/mean": 0.2900390625, "rewards/tag_count_reward/std": 0.15516266897320746, "step": 40 }, { "clip_ratio": 0.0, "clipped_completions_ratio": 0.0015625, "epoch": 0.07933010136624064, "grad_norm": 1.065001099510075, "kl": 0.005857086181640625, "learning_rate": 7.894736842105263e-07, "loss": 0.0403, "max_completion_length": 674.15, "max_terminated_completion_length": 669.1, "mean_completion_length": 172.17578125, "mean_terminated_completion_length": 171.6076644897461, "min_completion_length": 20.2, "min_terminated_completion_length": 20.2, "num_tokens": 5870847.0, "reward": 0.23959124982357025, "reward_std": 0.21271923929452896, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/qatch_metrics/mean": 0.26344375535845754, "rewards/qatch_metrics/std": 0.36381270438432695, "rewards/tag_count_reward/mean": 0.31328125, "rewards/tag_count_reward/std": 0.16481443196535112, "step": 45 }, { "clip_ratio": 0.0, "clipped_completions_ratio": 0.00390625, "epoch": 0.08814455707360071, "grad_norm": 1.4568973490349373, "kl": 0.005655670166015625, "learning_rate": 8.771929824561403e-07, "loss": -0.0555, "max_completion_length": 1026.65, "max_terminated_completion_length": 500.6, "mean_completion_length": 171.38203125, "mean_terminated_completion_length": 158.90636672973633, "min_completion_length": 20.45, "min_terminated_completion_length": 20.45, "num_tokens": 6568024.0, "reward": 0.22149190343916417, "reward_std": 0.21185415983200073, "rewards/format_reward/mean": 0.00390625, "rewards/format_reward/std": 0.027518405020236968, "rewards/qatch_metrics/mean": 0.24082917235791684, "rewards/qatch_metrics/std": 0.36300159245729446, "rewards/tag_count_reward/mean": 0.3279296875, "rewards/tag_count_reward/std": 0.16671581640839578, "step": 50 }, { "clip_ratio": 0.0, "clipped_completions_ratio": 0.00390625, "epoch": 0.09695901278096078, "grad_norm": 1.0074202050705572, "kl": 0.007281494140625, "learning_rate": 9.649122807017545e-07, "loss": 0.0364, "max_completion_length": 1240.75, "max_terminated_completion_length": 517.85, "mean_completion_length": 194.9171875, "mean_terminated_completion_length": 182.60486450195313, "min_completion_length": 23.85, "min_terminated_completion_length": 23.85, "num_tokens": 7285134.0, "reward": 0.2235423892736435, "reward_std": 0.21293668523430825, "rewards/format_reward/mean": 0.00859375, "rewards/format_reward/std": 0.053823620080947876, "rewards/qatch_metrics/mean": 0.24047266095876693, "rewards/qatch_metrics/std": 0.35300029441714287, "rewards/tag_count_reward/mean": 0.365625, "rewards/tag_count_reward/std": 0.17709428519010545, "step": 55 }, { "clip_ratio": 0.0, "clipped_completions_ratio": 0.00625, "epoch": 0.10577346848832085, "grad_norm": 0.9807961478644736, "kl": 0.01302337646484375, "learning_rate": 1e-06, "loss": -0.0139, "max_completion_length": 1103.85, "max_terminated_completion_length": 560.8, "mean_completion_length": 248.34296875, "mean_terminated_completion_length": 238.09479522705078, "min_completion_length": 38.0, "min_terminated_completion_length": 38.0, "num_tokens": 8039813.0, "reward": 0.23491878062486649, "reward_std": 0.21809776537120343, "rewards/format_reward/mean": 0.01953125, "rewards/format_reward/std": 0.11232657507061958, "rewards/qatch_metrics/mean": 0.24687135666608812, "rewards/qatch_metrics/std": 0.3717411242425442, "rewards/tag_count_reward/mean": 0.4625, "rewards/tag_count_reward/std": 0.2135901317000389, "step": 60 }, { "clip_ratio": 0.0, "clipped_completions_ratio": 0.00234375, "epoch": 0.11458792419568092, "grad_norm": 0.8603200985963705, "kl": 0.01912841796875, "learning_rate": 1e-06, "loss": 0.0333, "max_completion_length": 497.9, "max_terminated_completion_length": 495.0, "mean_completion_length": 230.2484375, "mean_terminated_completion_length": 229.92258987426757, "min_completion_length": 42.5, "min_terminated_completion_length": 42.5, "num_tokens": 8806307.0, "reward": 0.24188727661967277, "reward_std": 0.216167426854372, "rewards/format_reward/mean": 0.0640625, "rewards/format_reward/std": 0.22365741804242134, "rewards/qatch_metrics/mean": 0.24616562593728303, "rewards/qatch_metrics/std": 0.3404053032398224, "rewards/tag_count_reward/mean": 0.5248046875, "rewards/tag_count_reward/std": 0.23647152334451677, "step": 65 }, { "clip_ratio": 0.0, "clipped_completions_ratio": 0.0046875, "epoch": 0.12340237990304098, "grad_norm": 0.9335750466856336, "kl": 0.02579345703125, "learning_rate": 1e-06, "loss": 0.0811, "max_completion_length": 1368.45, "max_terminated_completion_length": 485.4, "mean_completion_length": 222.75390625, "mean_terminated_completion_length": 207.36048126220703, "min_completion_length": 34.7, "min_terminated_completion_length": 34.7, "num_tokens": 9576424.0, "reward": 0.24611930586397648, "reward_std": 0.22903760597109796, "rewards/format_reward/mean": 0.134375, "rewards/format_reward/std": 0.3337091006338596, "rewards/qatch_metrics/mean": 0.24012656770646573, "rewards/qatch_metrics/std": 0.33230473324656484, "rewards/tag_count_reward/mean": 0.571484375, "rewards/tag_count_reward/std": 0.26808963865041735, "step": 70 }, { "clip_ratio": 0.0, "clipped_completions_ratio": 0.00234375, "epoch": 0.13221683561040107, "grad_norm": 1.2284202473881727, "kl": 0.0276123046875, "learning_rate": 1e-06, "loss": 0.0495, "max_completion_length": 927.9, "max_terminated_completion_length": 564.7, "mean_completion_length": 181.3296875, "mean_terminated_completion_length": 172.18771896362304, "min_completion_length": 23.45, "min_terminated_completion_length": 23.45, "num_tokens": 10314206.0, "reward": 0.24534607045352458, "reward_std": 0.2094151984900236, "rewards/format_reward/mean": 0.23828125, "rewards/format_reward/std": 0.4178183376789093, "rewards/qatch_metrics/mean": 0.22585521470755338, "rewards/qatch_metrics/std": 0.32939945682883265, "rewards/tag_count_reward/mean": 0.5908203125, "rewards/tag_count_reward/std": 0.2986594527959824, "step": 75 }, { "clip_ratio": 0.0, "clipped_completions_ratio": 0.00390625, "epoch": 0.14103129131776113, "grad_norm": 1.142379485289819, "kl": 0.051806640625, "learning_rate": 1e-06, "loss": 0.0794, "max_completion_length": 725.95, "max_terminated_completion_length": 543.8, "mean_completion_length": 172.446875, "mean_terminated_completion_length": 165.8302963256836, "min_completion_length": 39.55, "min_terminated_completion_length": 39.55, "num_tokens": 11001674.0, "reward": 0.3384398899972439, "reward_std": 0.24649502858519554, "rewards/format_reward/mean": 0.54296875, "rewards/format_reward/std": 0.4884683877229691, "rewards/qatch_metrics/mean": 0.28772110007703305, "rewards/qatch_metrics/std": 0.38454234302043916, "rewards/tag_count_reward/mean": 0.7916015625, "rewards/tag_count_reward/std": 0.2731324777007103, "step": 80 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2780.4, "completions/max_terminated_length": 941.2, "completions/mean_length": 178.7609375, "completions/mean_terminated_length": 163.41346435546876, "completions/min_length": 26.8, "completions/min_terminated_length": 26.8, "epoch": 0.1498457470251212, "grad_norm": 2.58777855961075, "kl": 0.07294921875, "learning_rate": 1e-06, "loss": 0.1454, "num_tokens": 717486.0, "reward": 0.312135910987854, "reward_std": 0.2266964465379715, "rewards/format_reward/mean": 0.77265625, "rewards/format_reward/std": 0.4172531723976135, "rewards/qatch_metrics/mean": 0.22495078444480895, "rewards/qatch_metrics/std": 0.3556622087955475, "rewards/tag_count_reward/mean": 0.8732421875, "rewards/tag_count_reward/std": 0.25258718729019164, "step": 85 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 743.6, "completions/max_terminated_length": 743.6, "completions/mean_length": 140.06875, "completions/mean_terminated_length": 140.06875, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 0.15866020273248127, "grad_norm": 1.3938857270995895, "kl": 0.0706787109375, "learning_rate": 1e-06, "loss": 0.0265, "num_tokens": 1363254.0, "reward": 0.3290148377418518, "reward_std": 0.220520544052124, "rewards/format_reward/mean": 0.75703125, "rewards/format_reward/std": 0.4248744070529938, "rewards/qatch_metrics/mean": 0.24871459007263183, "rewards/qatch_metrics/std": 0.367422616481781, "rewards/tag_count_reward/mean": 0.8380859375, "rewards/tag_count_reward/std": 0.2936785161495209, "step": 90 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.003125, "completions/max_length": 2624.8, "completions/max_terminated_length": 1089.0, "completions/mean_length": 144.7390625, "completions/mean_terminated_length": 132.38715515136718, "completions/min_length": 38.6, "completions/min_terminated_length": 38.6, "epoch": 0.16747465843984133, "grad_norm": 1.271013018051317, "kl": 0.0874755859375, "learning_rate": 1e-06, "loss": 0.1072, "num_tokens": 1984568.0, "reward": 0.4227922260761261, "reward_std": 0.22174089550971984, "rewards/format_reward/mean": 0.9515625, "rewards/format_reward/std": 0.21198658645153046, "rewards/qatch_metrics/mean": 0.32884793281555175, "rewards/qatch_metrics/std": 0.411483907699585, "rewards/tag_count_reward/mean": 0.9623046875, "rewards/tag_count_reward/std": 0.15252943634986876, "step": 95 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0015625, "completions/max_length": 1262.2, "completions/max_terminated_length": 539.4, "completions/mean_length": 135.00625, "completions/mean_terminated_length": 128.81027221679688, "completions/min_length": 37.4, "completions/min_terminated_length": 37.4, "epoch": 0.17628911414720141, "grad_norm": 1.4288833040090947, "kl": 0.093994140625, "learning_rate": 1e-06, "loss": 0.0551, "num_tokens": 2650624.0, "reward": 0.3893065094947815, "reward_std": 0.21477862894535066, "rewards/format_reward/mean": 0.96171875, "rewards/format_reward/std": 0.1881812334060669, "rewards/qatch_metrics/mean": 0.28727005124092103, "rewards/qatch_metrics/std": 0.3896294891834259, "rewards/tag_count_reward/mean": 0.9791015625, "rewards/tag_count_reward/std": 0.11189484894275666, "step": 100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.003125, "completions/max_length": 1890.2, "completions/max_terminated_length": 462.4, "completions/mean_length": 132.55703125, "completions/mean_terminated_length": 120.11187286376953, "completions/min_length": 35.8, "completions/min_terminated_length": 35.8, "epoch": 0.18510356985456147, "grad_norm": 1.1514380769030061, "kl": 0.09775390625, "learning_rate": 1e-06, "loss": 0.0498, "num_tokens": 3266889.0, "reward": 0.3714154362678528, "reward_std": 0.1868872672319412, "rewards/format_reward/mean": 0.9484375, "rewards/format_reward/std": 0.21911896765232086, "rewards/qatch_metrics/mean": 0.2678416669368744, "rewards/qatch_metrics/std": 0.36030757427215576, "rewards/tag_count_reward/mean": 0.978125, "rewards/tag_count_reward/std": 0.10167990401387214, "step": 105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00234375, "completions/max_length": 2149.8, "completions/max_terminated_length": 642.6, "completions/mean_length": 131.7640625, "completions/mean_terminated_length": 122.44811248779297, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 0.19391802556192156, "grad_norm": 1.1648416805076183, "kl": 0.0974853515625, "learning_rate": 1e-06, "loss": 0.1006, "num_tokens": 3884011.0, "reward": 0.45167279839515684, "reward_std": 0.2551474153995514, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.2396955519914627, "rewards/qatch_metrics/mean": 0.3636867344379425, "rewards/qatch_metrics/std": 0.4195810675621033, "rewards/tag_count_reward/mean": 0.97578125, "rewards/tag_count_reward/std": 0.10661737024784088, "step": 110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00078125, "completions/max_length": 1134.6, "completions/max_terminated_length": 383.0, "completions/mean_length": 126.67265625, "completions/mean_terminated_length": 123.56720428466797, "completions/min_length": 32.8, "completions/min_terminated_length": 32.8, "epoch": 0.20273248126928162, "grad_norm": 1.112770785107892, "kl": 0.0981201171875, "learning_rate": 1e-06, "loss": 0.0582, "num_tokens": 4527864.0, "reward": 0.45048635005950927, "reward_std": 0.24212915897369386, "rewards/format_reward/mean": 0.90234375, "rewards/format_reward/std": 0.2969411134719849, "rewards/qatch_metrics/mean": 0.3671622335910797, "rewards/qatch_metrics/std": 0.4069118857383728, "rewards/tag_count_reward/mean": 0.96328125, "rewards/tag_count_reward/std": 0.12083393186330796, "step": 115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.003125, "completions/max_length": 2718.2, "completions/max_terminated_length": 666.6, "completions/mean_length": 135.815625, "completions/mean_terminated_length": 123.39766387939453, "completions/min_length": 40.6, "completions/min_terminated_length": 40.6, "epoch": 0.2115469369766417, "grad_norm": 2.8196404883813453, "kl": 0.10830078125, "learning_rate": 1e-06, "loss": 0.0955, "num_tokens": 5204604.0, "reward": 0.4444663166999817, "reward_std": 0.22749231457710267, "rewards/format_reward/mean": 0.9078125, "rewards/format_reward/std": 0.28959383964538576, "rewards/qatch_metrics/mean": 0.3591492176055908, "rewards/qatch_metrics/std": 0.4204003632068634, "rewards/tag_count_reward/mean": 0.9681640625, "rewards/tag_count_reward/std": 0.11976957470178604, "step": 120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.003125, "completions/max_length": 2639.8, "completions/max_terminated_length": 459.2, "completions/mean_length": 147.5625, "completions/mean_terminated_length": 135.18407287597657, "completions/min_length": 34.8, "completions/min_terminated_length": 34.8, "epoch": 0.22036139268400176, "grad_norm": 1.2443606977665935, "kl": 0.09527587890625, "learning_rate": 1e-06, "loss": 0.0903, "num_tokens": 5850844.0, "reward": 0.4391818165779114, "reward_std": 0.24111129343509674, "rewards/format_reward/mean": 0.88359375, "rewards/format_reward/std": 0.32070607542991636, "rewards/qatch_metrics/mean": 0.35628697872161863, "rewards/qatch_metrics/std": 0.41108678579330443, "rewards/tag_count_reward/mean": 0.9595703125, "rewards/tag_count_reward/std": 0.1286213666200638, "step": 125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00234375, "completions/max_length": 2638.2, "completions/max_terminated_length": 729.4, "completions/mean_length": 148.103125, "completions/mean_terminated_length": 138.83536682128906, "completions/min_length": 43.4, "completions/min_terminated_length": 43.4, "epoch": 0.22917584839136185, "grad_norm": 1.2078511430760708, "kl": 0.08446044921875, "learning_rate": 1e-06, "loss": 0.1052, "num_tokens": 6480960.0, "reward": 0.44752122163772584, "reward_std": 0.23120047450065612, "rewards/format_reward/mean": 0.87109375, "rewards/format_reward/std": 0.33249542117118835, "rewards/qatch_metrics/mean": 0.3680166721343994, "rewards/qatch_metrics/std": 0.4190321207046509, "rewards/tag_count_reward/mean": 0.951953125, "rewards/tag_count_reward/std": 0.14520585983991624, "step": 130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0015625, "completions/max_length": 2009.2, "completions/max_terminated_length": 904.6, "completions/mean_length": 152.75390625, "completions/mean_terminated_length": 146.58113708496094, "completions/min_length": 44.2, "completions/min_terminated_length": 44.2, "epoch": 0.2379903040987219, "grad_norm": 0.9663165749537755, "kl": 0.084228515625, "learning_rate": 1e-06, "loss": 0.0976, "num_tokens": 7141781.0, "reward": 0.4069031774997711, "reward_std": 0.24234023094177246, "rewards/format_reward/mean": 0.86015625, "rewards/format_reward/std": 0.34610814452171323, "rewards/qatch_metrics/mean": 0.3220804750919342, "rewards/qatch_metrics/std": 0.4035941183567047, "rewards/tag_count_reward/mean": 0.9423828125, "rewards/tag_count_reward/std": 0.16873225271701814, "step": 135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00078125, "completions/max_length": 1271.4, "completions/max_terminated_length": 543.8, "completions/mean_length": 142.09921875, "completions/mean_terminated_length": 139.02362060546875, "completions/min_length": 46.4, "completions/min_terminated_length": 46.4, "epoch": 0.24680475980608196, "grad_norm": 1.120567941307361, "kl": 0.0905029296875, "learning_rate": 1e-06, "loss": 0.0802, "num_tokens": 7791572.0, "reward": 0.44534188508987427, "reward_std": 0.24042359590530396, "rewards/format_reward/mean": 0.878125, "rewards/format_reward/std": 0.3254675090312958, "rewards/qatch_metrics/mean": 0.36461407542228697, "rewards/qatch_metrics/std": 0.42095342874526975, "rewards/tag_count_reward/mean": 0.9521484375, "rewards/tag_count_reward/std": 0.15115214437246322, "step": 140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00078125, "completions/max_length": 1145.2, "completions/max_terminated_length": 412.0, "completions/mean_length": 134.07890625, "completions/mean_terminated_length": 130.97618713378907, "completions/min_length": 40.8, "completions/min_terminated_length": 40.8, "epoch": 0.255619215513442, "grad_norm": 1.2960180615344776, "kl": 0.090283203125, "learning_rate": 1e-06, "loss": 0.0413, "num_tokens": 8406681.0, "reward": 0.4552301824092865, "reward_std": 0.238674333691597, "rewards/format_reward/mean": 0.9109375, "rewards/format_reward/std": 0.280667769908905, "rewards/qatch_metrics/mean": 0.3717781364917755, "rewards/qatch_metrics/std": 0.4111446261405945, "rewards/tag_count_reward/mean": 0.9625, "rewards/tag_count_reward/std": 0.13774650245904924, "step": 145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00078125, "completions/max_length": 1219.0, "completions/max_terminated_length": 542.6, "completions/mean_length": 137.4578125, "completions/mean_terminated_length": 134.36369934082032, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 0.26443367122080214, "grad_norm": 1.1814014331114655, "kl": 0.091650390625, "learning_rate": 1e-06, "loss": 0.0474, "num_tokens": 9060515.0, "reward": 0.4308152377605438, "reward_std": 0.25072828829288485, "rewards/format_reward/mean": 0.92265625, "rewards/format_reward/std": 0.2654747039079666, "rewards/qatch_metrics/mean": 0.3416989743709564, "rewards/qatch_metrics/std": 0.4145464479923248, "rewards/tag_count_reward/mean": 0.962109375, "rewards/tag_count_reward/std": 0.14527225494384766, "step": 150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.003125, "completions/max_length": 3343.2, "completions/max_terminated_length": 465.0, "completions/mean_length": 146.34140625, "completions/mean_terminated_length": 133.97686157226562, "completions/min_length": 39.6, "completions/min_terminated_length": 39.6, "epoch": 0.2732481269281622, "grad_norm": 1.119412659054034, "kl": 0.0925048828125, "learning_rate": 1e-06, "loss": 0.108, "num_tokens": 9726008.0, "reward": 0.4162511765956879, "reward_std": 0.22437838315963746, "rewards/format_reward/mean": 0.93671875, "rewards/format_reward/std": 0.2371742010116577, "rewards/qatch_metrics/mean": 0.3222440242767334, "rewards/qatch_metrics/std": 0.3945153594017029, "rewards/tag_count_reward/mean": 0.9734375, "rewards/tag_count_reward/std": 0.11057026386260986, "step": 155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 494.8, "completions/max_terminated_length": 494.8, "completions/mean_length": 125.86171875, "completions/mean_terminated_length": 125.86171875, "completions/min_length": 38.4, "completions/min_terminated_length": 38.4, "epoch": 0.28206258263552225, "grad_norm": 1.0577575352944335, "kl": 0.110693359375, "learning_rate": 1e-06, "loss": 0.0378, "num_tokens": 10365223.0, "reward": 0.49046963453292847, "reward_std": 0.22210898101329804, "rewards/format_reward/mean": 0.95234375, "rewards/format_reward/std": 0.21217795908451081, "rewards/qatch_metrics/mean": 0.40703229904174804, "rewards/qatch_metrics/std": 0.4201949179172516, "rewards/tag_count_reward/mean": 0.98515625, "rewards/tag_count_reward/std": 0.07672805488109588, "step": 160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00078125, "completions/max_length": 1195.2, "completions/max_terminated_length": 445.8, "completions/mean_length": 134.734375, "completions/mean_terminated_length": 131.63231811523437, "completions/min_length": 39.2, "completions/min_terminated_length": 39.2, "epoch": 0.2908770383428823, "grad_norm": 1.1665219069490207, "kl": 0.0982666015625, "learning_rate": 1e-06, "loss": 0.0211, "num_tokens": 10984403.0, "reward": 0.47783067226409914, "reward_std": 0.2358974426984787, "rewards/format_reward/mean": 0.96015625, "rewards/format_reward/std": 0.1951357364654541, "rewards/qatch_metrics/mean": 0.3911289095878601, "rewards/qatch_metrics/std": 0.4188136160373688, "rewards/tag_count_reward/mean": 0.987109375, "rewards/tag_count_reward/std": 0.06311970800161362, "step": 165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00078125, "completions/max_length": 1121.6, "completions/max_terminated_length": 376.6, "completions/mean_length": 148.51640625, "completions/mean_terminated_length": 145.42068176269532, "completions/min_length": 42.6, "completions/min_terminated_length": 42.6, "epoch": 0.2996914940502424, "grad_norm": 1.217339460476114, "kl": 0.0956298828125, "learning_rate": 1e-06, "loss": 0.046, "num_tokens": 11677144.0, "reward": 0.446321702003479, "reward_std": 0.23696185946464537, "rewards/format_reward/mean": 0.925, "rewards/format_reward/std": 0.2625602900981903, "rewards/qatch_metrics/mean": 0.35908021926879885, "rewards/qatch_metrics/std": 0.3949739336967468, "rewards/tag_count_reward/mean": 0.9720703125, "rewards/tag_count_reward/std": 0.1141625314950943, "step": 170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0015625, "completions/max_length": 1986.6, "completions/max_terminated_length": 548.0, "completions/mean_length": 159.27265625, "completions/mean_terminated_length": 153.1066864013672, "completions/min_length": 40.2, "completions/min_terminated_length": 40.2, "epoch": 0.3085059497576025, "grad_norm": 1.1604091154811884, "kl": 0.0948486328125, "learning_rate": 1e-06, "loss": 0.0555, "num_tokens": 12347509.0, "reward": 0.43740702271461485, "reward_std": 0.2165643662214279, "rewards/format_reward/mean": 0.8984375, "rewards/format_reward/std": 0.30079524517059325, "rewards/qatch_metrics/mean": 0.35224584937095643, "rewards/qatch_metrics/std": 0.4033379018306732, "rewards/tag_count_reward/mean": 0.9630859375, "rewards/tag_count_reward/std": 0.12287088185548782, "step": 175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00234375, "completions/max_length": 1919.6, "completions/max_terminated_length": 460.2, "completions/mean_length": 161.9671875, "completions/mean_terminated_length": 152.73724365234375, "completions/min_length": 41.4, "completions/min_terminated_length": 41.4, "epoch": 0.31732040546496254, "grad_norm": 1.016812518785413, "kl": 0.0965087890625, "learning_rate": 1e-06, "loss": 0.0535, "num_tokens": 13026443.0, "reward": 0.49172326922416687, "reward_std": 0.2285678654909134, "rewards/format_reward/mean": 0.903125, "rewards/format_reward/std": 0.2938369959592819, "rewards/qatch_metrics/mean": 0.41588308215141295, "rewards/qatch_metrics/std": 0.43461284041404724, "rewards/tag_count_reward/mean": 0.958203125, "rewards/tag_count_reward/std": 0.13759158551692963, "step": 180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00078125, "completions/max_length": 1250.8, "completions/max_terminated_length": 516.6, "completions/mean_length": 153.14375, "completions/mean_terminated_length": 150.06268920898438, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 0.3261348611723226, "grad_norm": 2.943551495922741, "kl": 0.1255859375, "learning_rate": 1e-06, "loss": 0.0417, "num_tokens": 13684611.0, "reward": 0.4662940502166748, "reward_std": 0.22654231786727905, "rewards/format_reward/mean": 0.909375, "rewards/format_reward/std": 0.2848878413438797, "rewards/qatch_metrics/mean": 0.3857020795345306, "rewards/qatch_metrics/std": 0.4162748992443085, "rewards/tag_count_reward/mean": 0.9501953125, "rewards/tag_count_reward/std": 0.1399885058403015, "step": 185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.003125, "completions/max_length": 2653.0, "completions/max_terminated_length": 786.2, "completions/mean_length": 160.4765625, "completions/mean_terminated_length": 148.1515380859375, "completions/min_length": 47.6, "completions/min_terminated_length": 47.6, "epoch": 0.33494931687968266, "grad_norm": 1.1104622942084277, "kl": 0.0940185546875, "learning_rate": 1e-06, "loss": 0.0702, "num_tokens": 14338485.0, "reward": 0.5249280750751495, "reward_std": 0.22171878814697266, "rewards/format_reward/mean": 0.89765625, "rewards/format_reward/std": 0.3031489491462708, "rewards/qatch_metrics/mean": 0.45655598640441897, "rewards/qatch_metrics/std": 0.43402122855186465, "rewards/tag_count_reward/mean": 0.941796875, "rewards/tag_count_reward/std": 0.16171995401382447, "step": 190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00078125, "completions/max_length": 1142.6, "completions/max_terminated_length": 420.2, "completions/mean_length": 142.35078125, "completions/mean_terminated_length": 139.25772094726562, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 0.34376377258704277, "grad_norm": 1.0499982391674234, "kl": 0.1005126953125, "learning_rate": 1e-06, "loss": 0.0386, "num_tokens": 14972214.0, "reward": 0.46582343578338625, "reward_std": 0.22475437819957733, "rewards/format_reward/mean": 0.93671875, "rewards/format_reward/std": 0.2403053015470505, "rewards/qatch_metrics/mean": 0.3805643320083618, "rewards/qatch_metrics/std": 0.4070888340473175, "rewards/tag_count_reward/mean": 0.9734375, "rewards/tag_count_reward/std": 0.11377856433391571, "step": 195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0015625, "completions/max_length": 2080.4, "completions/max_terminated_length": 718.8, "completions/mean_length": 147.2296875, "completions/mean_terminated_length": 141.05357666015624, "completions/min_length": 41.2, "completions/min_terminated_length": 41.2, "epoch": 0.35257822829440283, "grad_norm": 1.0220429404852622, "kl": 0.1034912109375, "learning_rate": 1e-06, "loss": 0.0575, "num_tokens": 15639820.0, "reward": 0.4750072777271271, "reward_std": 0.2135873943567276, "rewards/format_reward/mean": 0.92421875, "rewards/format_reward/std": 0.26192537546157835, "rewards/qatch_metrics/mean": 0.39318412244319917, "rewards/qatch_metrics/std": 0.39594073295593263, "rewards/tag_count_reward/mean": 0.967578125, "rewards/tag_count_reward/std": 0.1268332213163376, "step": 200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0015625, "completions/max_length": 1109.2, "completions/max_terminated_length": 358.6, "completions/mean_length": 139.92421875, "completions/mean_terminated_length": 133.72031860351564, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 0.3613926840017629, "grad_norm": 1.1782501720519973, "kl": 0.1066162109375, "learning_rate": 1e-06, "loss": 0.0398, "num_tokens": 16303995.0, "reward": 0.4705925226211548, "reward_std": 0.18503921926021577, "rewards/format_reward/mean": 0.95, "rewards/format_reward/std": 0.21313293874263764, "rewards/qatch_metrics/mean": 0.3844171941280365, "rewards/qatch_metrics/std": 0.37895620465278623, "rewards/tag_count_reward/mean": 0.9767578125, "rewards/tag_count_reward/std": 0.10622683316469192, "step": 205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00078125, "completions/max_length": 1162.8, "completions/max_terminated_length": 442.2, "completions/mean_length": 137.20234375, "completions/mean_terminated_length": 134.1083190917969, "completions/min_length": 42.8, "completions/min_terminated_length": 42.8, "epoch": 0.37020713970912295, "grad_norm": 1.08337248687343, "kl": 0.103369140625, "learning_rate": 1e-06, "loss": 0.021, "num_tokens": 16916398.0, "reward": 0.5206815063953399, "reward_std": 0.23496688902378082, "rewards/format_reward/mean": 0.9484375, "rewards/format_reward/std": 0.22106002569198607, "rewards/qatch_metrics/mean": 0.44340285658836365, "rewards/qatch_metrics/std": 0.4299581289291382, "rewards/tag_count_reward/mean": 0.97890625, "rewards/tag_count_reward/std": 0.0956751674413681, "step": 210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.003125, "completions/max_length": 2602.8, "completions/max_terminated_length": 365.2, "completions/mean_length": 152.73984375, "completions/mean_terminated_length": 140.39059753417968, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 0.37902159541648306, "grad_norm": 1.0746862969416158, "kl": 0.1025390625, "learning_rate": 1e-06, "loss": 0.0729, "num_tokens": 17589905.0, "reward": 0.4809570789337158, "reward_std": 0.2217806786298752, "rewards/format_reward/mean": 0.95, "rewards/format_reward/std": 0.2149397164583206, "rewards/qatch_metrics/mean": 0.3963695228099823, "rewards/qatch_metrics/std": 0.4306588113307953, "rewards/tag_count_reward/mean": 0.980859375, "rewards/tag_count_reward/std": 0.09641121476888656, "step": 215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0015625, "completions/max_length": 2013.2, "completions/max_terminated_length": 590.6, "completions/mean_length": 151.1234375, "completions/mean_terminated_length": 144.9534942626953, "completions/min_length": 43.6, "completions/min_terminated_length": 43.6, "epoch": 0.3878360511238431, "grad_norm": 1.1093719128866055, "kl": 0.0995849609375, "learning_rate": 1e-06, "loss": 0.0538, "num_tokens": 18226287.0, "reward": 0.5117225289344788, "reward_std": 0.23414760828018188, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.2397076427936554, "rewards/qatch_metrics/mean": 0.43437943458557127, "rewards/qatch_metrics/std": 0.4301003873348236, "rewards/tag_count_reward/mean": 0.975, "rewards/tag_count_reward/std": 0.10860425382852554, "step": 220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0015625, "completions/max_length": 1985.0, "completions/max_terminated_length": 565.6, "completions/mean_length": 158.75078125, "completions/mean_terminated_length": 152.5970947265625, "completions/min_length": 43.6, "completions/min_terminated_length": 43.6, "epoch": 0.3966505068312032, "grad_norm": 1.0904499601554711, "kl": 0.1022705078125, "learning_rate": 1e-06, "loss": 0.0853, "num_tokens": 18930000.0, "reward": 0.5386084854602814, "reward_std": 0.19289222061634065, "rewards/format_reward/mean": 0.9125, "rewards/format_reward/std": 0.2818691849708557, "rewards/qatch_metrics/mean": 0.46959454417228697, "rewards/qatch_metrics/std": 0.42910557985305786, "rewards/tag_count_reward/mean": 0.9640625, "rewards/tag_count_reward/std": 0.13219460248947143, "step": 225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0015625, "completions/max_length": 1952.6, "completions/max_terminated_length": 454.4, "completions/mean_length": 162.29296875, "completions/mean_terminated_length": 156.1261993408203, "completions/min_length": 52.4, "completions/min_terminated_length": 52.4, "epoch": 0.40546496253856323, "grad_norm": 1.0851594244493181, "kl": 0.1014404296875, "learning_rate": 1e-06, "loss": 0.0638, "num_tokens": 19599671.0, "reward": 0.5117276430130004, "reward_std": 0.24680890440940856, "rewards/format_reward/mean": 0.8921875, "rewards/format_reward/std": 0.30913242101669314, "rewards/qatch_metrics/mean": 0.4411179721355438, "rewards/qatch_metrics/std": 0.43982199430465696, "rewards/tag_count_reward/mean": 0.951171875, "rewards/tag_count_reward/std": 0.15551512241363524, "step": 230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00078125, "completions/max_length": 1092.6, "completions/max_terminated_length": 378.0, "completions/mean_length": 151.4390625, "completions/mean_terminated_length": 148.35447387695314, "completions/min_length": 42.8, "completions/min_terminated_length": 42.8, "epoch": 0.4142794182459233, "grad_norm": 0.9747077221272922, "kl": 0.1077880859375, "learning_rate": 1e-06, "loss": 0.0322, "num_tokens": 20243993.0, "reward": 0.5243084728717804, "reward_std": 0.23080018162727356, "rewards/format_reward/mean": 0.91640625, "rewards/format_reward/std": 0.2711502879858017, "rewards/qatch_metrics/mean": 0.4524148523807526, "rewards/qatch_metrics/std": 0.4158477485179901, "rewards/tag_count_reward/mean": 0.9623046875, "rewards/tag_count_reward/std": 0.1330309897661209, "step": 235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00078125, "completions/max_length": 1154.2, "completions/max_terminated_length": 407.8, "completions/mean_length": 149.76953125, "completions/mean_terminated_length": 146.6907531738281, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 0.4230938739532834, "grad_norm": 0.9985959553205879, "kl": 0.1137451171875, "learning_rate": 1e-06, "loss": 0.0347, "num_tokens": 20888562.0, "reward": 0.45308218002319334, "reward_std": 0.21805870532989502, "rewards/format_reward/mean": 0.92109375, "rewards/format_reward/std": 0.2692244678735733, "rewards/qatch_metrics/mean": 0.36815963983535765, "rewards/qatch_metrics/std": 0.4063821077346802, "rewards/tag_count_reward/mean": 0.9607421875, "rewards/tag_count_reward/std": 0.14251872897148132, "step": 240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00078125, "completions/max_length": 1154.8, "completions/max_terminated_length": 422.6, "completions/mean_length": 141.10703125, "completions/mean_terminated_length": 138.00346984863282, "completions/min_length": 42.4, "completions/min_terminated_length": 42.4, "epoch": 0.43190832966064346, "grad_norm": 1.206317234569705, "kl": 0.1264892578125, "learning_rate": 1e-06, "loss": 0.033, "num_tokens": 21529883.0, "reward": 0.5218020260334015, "reward_std": 0.21800511479377746, "rewards/format_reward/mean": 0.94296875, "rewards/format_reward/std": 0.23152050971984864, "rewards/qatch_metrics/mean": 0.44588152766227723, "rewards/qatch_metrics/std": 0.41684806942939756, "rewards/tag_count_reward/mean": 0.9701171875, "rewards/tag_count_reward/std": 0.12424642890691757, "step": 245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00234375, "completions/max_length": 1911.8, "completions/max_terminated_length": 458.6, "completions/mean_length": 154.50234375, "completions/mean_terminated_length": 145.24712524414062, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 0.4407227853680035, "grad_norm": 1.0646070893278259, "kl": 0.108837890625, "learning_rate": 1e-06, "loss": 0.0425, "num_tokens": 22202974.0, "reward": 0.5250638484954834, "reward_std": 0.23499601781368257, "rewards/format_reward/mean": 0.92578125, "rewards/format_reward/std": 0.26019937098026275, "rewards/qatch_metrics/mean": 0.4519937574863434, "rewards/qatch_metrics/std": 0.43260250687599183, "rewards/tag_count_reward/mean": 0.9658203125, "rewards/tag_count_reward/std": 0.13058245778083802, "step": 250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00078125, "completions/max_length": 1164.6, "completions/max_terminated_length": 436.8, "completions/mean_length": 149.890625, "completions/mean_terminated_length": 146.80228271484376, "completions/min_length": 50.8, "completions/min_terminated_length": 50.8, "epoch": 0.4495372410753636, "grad_norm": 1.0628232924013388, "kl": 0.1126220703125, "learning_rate": 1e-06, "loss": 0.0354, "num_tokens": 22890018.0, "reward": 0.49608793258666994, "reward_std": 0.24011301696300508, "rewards/format_reward/mean": 0.91640625, "rewards/format_reward/std": 0.2767932593822479, "rewards/qatch_metrics/mean": 0.4192716181278229, "rewards/qatch_metrics/std": 0.4066218316555023, "rewards/tag_count_reward/mean": 0.961328125, "rewards/tag_count_reward/std": 0.1395171895623207, "step": 255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 716.8, "completions/max_terminated_length": 716.8, "completions/mean_length": 156.6890625, "completions/mean_terminated_length": 156.6890625, "completions/min_length": 47.4, "completions/min_terminated_length": 47.4, "epoch": 0.4583516967827237, "grad_norm": 1.0620716307535578, "kl": 0.1146728515625, "learning_rate": 1e-06, "loss": 0.0473, "num_tokens": 23537540.0, "reward": 0.46006324887275696, "reward_std": 0.20722155570983886, "rewards/format_reward/mean": 0.9140625, "rewards/format_reward/std": 0.2800079345703125, "rewards/qatch_metrics/mean": 0.3774526119232178, "rewards/qatch_metrics/std": 0.40044850707054136, "rewards/tag_count_reward/mean": 0.9564453125, "rewards/tag_count_reward/std": 0.15245147049427032, "step": 260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 556.2, "completions/max_terminated_length": 556.2, "completions/mean_length": 143.2, "completions/mean_terminated_length": 143.2, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 0.46716615249008375, "grad_norm": 1.0417871765342246, "kl": 0.120166015625, "learning_rate": 1e-06, "loss": 0.0284, "num_tokens": 24185028.0, "reward": 0.6036619067192077, "reward_std": 0.20974452793598175, "rewards/format_reward/mean": 0.9546875, "rewards/format_reward/std": 0.20255258679389954, "rewards/qatch_metrics/mean": 0.5404294610023499, "rewards/qatch_metrics/std": 0.4255226194858551, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.11209065765142441, "step": 265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00078125, "completions/max_length": 1263.4, "completions/max_terminated_length": 545.8, "completions/mean_length": 153.36640625, "completions/mean_terminated_length": 150.29201049804686, "completions/min_length": 48.2, "completions/min_terminated_length": 48.2, "epoch": 0.4759806081974438, "grad_norm": 1.2230142515821079, "kl": 0.12080078125, "learning_rate": 1e-06, "loss": 0.0454, "num_tokens": 24848617.0, "reward": 0.5000587105751038, "reward_std": 0.19942412078380584, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.2082734227180481, "rewards/qatch_metrics/mean": 0.4188075542449951, "rewards/qatch_metrics/std": 0.4028447926044464, "rewards/tag_count_reward/mean": 0.9751953125, "rewards/tag_count_reward/std": 0.11493908390402793, "step": 270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 545.0, "completions/max_terminated_length": 545.0, "completions/mean_length": 146.3046875, "completions/mean_terminated_length": 146.3046875, "completions/min_length": 45.6, "completions/min_terminated_length": 45.6, "epoch": 0.48479506390480387, "grad_norm": 1.11053365840032, "kl": 0.1185302734375, "learning_rate": 1e-06, "loss": 0.0403, "num_tokens": 25530191.0, "reward": 0.5597240447998046, "reward_std": 0.22672632932662964, "rewards/format_reward/mean": 0.96328125, "rewards/format_reward/std": 0.18625771403312683, "rewards/qatch_metrics/mean": 0.4874395847320557, "rewards/qatch_metrics/std": 0.41059340834617614, "rewards/tag_count_reward/mean": 0.9814453125, "rewards/tag_count_reward/std": 0.09651189893484116, "step": 275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00078125, "completions/max_length": 1201.8, "completions/max_terminated_length": 482.4, "completions/mean_length": 142.7421875, "completions/mean_terminated_length": 139.64728698730468, "completions/min_length": 45.2, "completions/min_terminated_length": 45.2, "epoch": 0.4936095196121639, "grad_norm": 1.0811085458763714, "kl": 0.126318359375, "learning_rate": 1e-06, "loss": 0.0566, "num_tokens": 26189061.0, "reward": 0.4923192024230957, "reward_std": 0.197740375995636, "rewards/format_reward/mean": 0.9578125, "rewards/format_reward/std": 0.1986761748790741, "rewards/qatch_metrics/mean": 0.40876016914844515, "rewards/qatch_metrics/std": 0.3987727761268616, "rewards/tag_count_reward/mean": 0.9818359375, "rewards/tag_count_reward/std": 0.08791020289063453, "step": 280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1150.6, "completions/max_terminated_length": 1150.6, "completions/mean_length": 149.32734375, "completions/mean_terminated_length": 149.32734375, "completions/min_length": 43.6, "completions/min_terminated_length": 43.6, "epoch": 0.502423975319524, "grad_norm": 1.061030164323382, "kl": 0.1281982421875, "learning_rate": 1e-06, "loss": 0.0294, "num_tokens": 26873144.0, "reward": 0.47796512842178346, "reward_std": 0.21353891789913176, "rewards/format_reward/mean": 0.95, "rewards/format_reward/std": 0.21780532896518706, "rewards/qatch_metrics/mean": 0.392907041311264, "rewards/qatch_metrics/std": 0.4109325408935547, "rewards/tag_count_reward/mean": 0.9798828125, "rewards/tag_count_reward/std": 0.09564688950777053, "step": 285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0015625, "completions/max_length": 1897.2, "completions/max_terminated_length": 495.2, "completions/mean_length": 149.51796875, "completions/mean_terminated_length": 143.3479248046875, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 0.511238431026884, "grad_norm": 1.0941932001692303, "kl": 0.1298095703125, "learning_rate": 1e-06, "loss": 0.0626, "num_tokens": 27539071.0, "reward": 0.4637163817882538, "reward_std": 0.18355560302734375, "rewards/format_reward/mean": 0.92109375, "rewards/format_reward/std": 0.2646732360124588, "rewards/qatch_metrics/mean": 0.38052110075950624, "rewards/qatch_metrics/std": 0.3895488500595093, "rewards/tag_count_reward/mean": 0.96328125, "rewards/tag_count_reward/std": 0.1275038242340088, "step": 290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 840.2, "completions/max_terminated_length": 840.2, "completions/mean_length": 146.7, "completions/mean_terminated_length": 146.7, "completions/min_length": 44.2, "completions/min_terminated_length": 44.2, "epoch": 0.5200528867342442, "grad_norm": 1.156974504094534, "kl": 0.127880859375, "learning_rate": 1e-06, "loss": 0.0401, "num_tokens": 28189663.0, "reward": 0.4947424054145813, "reward_std": 0.2121095508337021, "rewards/format_reward/mean": 0.890625, "rewards/format_reward/std": 0.308673033118248, "rewards/qatch_metrics/mean": 0.42144557237625124, "rewards/qatch_metrics/std": 0.4137202322483063, "rewards/tag_count_reward/mean": 0.9490234375, "rewards/tag_count_reward/std": 0.15444399118423463, "step": 295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00078125, "completions/max_length": 1255.6, "completions/max_terminated_length": 607.6, "completions/mean_length": 160.95703125, "completions/mean_terminated_length": 157.87724609375, "completions/min_length": 46.4, "completions/min_terminated_length": 46.4, "epoch": 0.5288673424416043, "grad_norm": 1.0193761202428142, "kl": 0.1197265625, "learning_rate": 1e-06, "loss": 0.045, "num_tokens": 28883656.0, "reward": 0.5260525703430176, "reward_std": 0.21179039478302003, "rewards/format_reward/mean": 0.9359375, "rewards/format_reward/std": 0.24322082698345185, "rewards/qatch_metrics/mean": 0.4514336109161377, "rewards/qatch_metrics/std": 0.42365469336509703, "rewards/tag_count_reward/mean": 0.9748046875, "rewards/tag_count_reward/std": 0.10406171679496765, "step": 300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0015625, "completions/max_length": 2045.6, "completions/max_terminated_length": 651.8, "completions/mean_length": 167.04921875, "completions/mean_terminated_length": 160.9025909423828, "completions/min_length": 47.4, "completions/min_terminated_length": 47.4, "epoch": 0.5376817981489643, "grad_norm": 1.0505969033833242, "kl": 0.1183837890625, "learning_rate": 1e-06, "loss": 0.074, "num_tokens": 29572327.0, "reward": 0.5555553436279297, "reward_std": 0.23490612506866454, "rewards/format_reward/mean": 0.9421875, "rewards/format_reward/std": 0.23279777467250823, "rewards/qatch_metrics/mean": 0.48544191718101504, "rewards/qatch_metrics/std": 0.4043150365352631, "rewards/tag_count_reward/mean": 0.97421875, "rewards/tag_count_reward/std": 0.11451640278100968, "step": 305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00234375, "completions/max_length": 1991.8, "completions/max_terminated_length": 547.4, "completions/mean_length": 163.9890625, "completions/mean_terminated_length": 154.75635986328126, "completions/min_length": 48.6, "completions/min_terminated_length": 48.6, "epoch": 0.5464962538563244, "grad_norm": 1.0184747818610653, "kl": 0.1130615234375, "learning_rate": 1e-06, "loss": 0.0564, "num_tokens": 30237305.0, "reward": 0.5232127249240875, "reward_std": 0.22367032766342163, "rewards/format_reward/mean": 0.92890625, "rewards/format_reward/std": 0.25724474191665647, "rewards/qatch_metrics/mean": 0.4492989718914032, "rewards/qatch_metrics/std": 0.4237508654594421, "rewards/tag_count_reward/mean": 0.968359375, "rewards/tag_count_reward/std": 0.12655377388000488, "step": 310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00078125, "completions/max_length": 1175.4, "completions/max_terminated_length": 489.4, "completions/mean_length": 165.95546875, "completions/mean_terminated_length": 162.88382873535156, "completions/min_length": 52.8, "completions/min_terminated_length": 52.8, "epoch": 0.5553107095636844, "grad_norm": 0.926469518807395, "kl": 0.116796875, "learning_rate": 1e-06, "loss": 0.034, "num_tokens": 30935440.0, "reward": 0.5955564260482789, "reward_std": 0.22866220772266388, "rewards/format_reward/mean": 0.91328125, "rewards/format_reward/std": 0.28102830052375793, "rewards/qatch_metrics/mean": 0.5368104040622711, "rewards/qatch_metrics/std": 0.4246533751487732, "rewards/tag_count_reward/mean": 0.9587890625, "rewards/tag_count_reward/std": 0.1424473986029625, "step": 315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 832.0, "completions/max_terminated_length": 832.0, "completions/mean_length": 173.453125, "completions/mean_terminated_length": 173.453125, "completions/min_length": 38.4, "completions/min_terminated_length": 38.4, "epoch": 0.5641251652710445, "grad_norm": 1.013203860912236, "kl": 0.1189208984375, "learning_rate": 1e-06, "loss": 0.0382, "num_tokens": 31617748.0, "reward": 0.5389631450176239, "reward_std": 0.21291258931159973, "rewards/format_reward/mean": 0.90859375, "rewards/format_reward/std": 0.28675017356872556, "rewards/qatch_metrics/mean": 0.47106875777244567, "rewards/qatch_metrics/std": 0.42714625000953677, "rewards/tag_count_reward/mean": 0.95390625, "rewards/tag_count_reward/std": 0.16191803216934203, "step": 320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.003125, "completions/max_length": 2694.6, "completions/max_terminated_length": 553.8, "completions/mean_length": 184.3796875, "completions/mean_terminated_length": 172.1176513671875, "completions/min_length": 44.8, "completions/min_terminated_length": 44.8, "epoch": 0.5729396209784046, "grad_norm": 1.0308637085090815, "kl": 0.1224853515625, "learning_rate": 1e-06, "loss": 0.0671, "num_tokens": 32294954.0, "reward": 0.5857669234275817, "reward_std": 0.215561243891716, "rewards/format_reward/mean": 0.9359375, "rewards/format_reward/std": 0.24411277770996093, "rewards/qatch_metrics/mean": 0.5224210977554321, "rewards/qatch_metrics/std": 0.4328895270824432, "rewards/tag_count_reward/mean": 0.9623046875, "rewards/tag_count_reward/std": 0.15089936107397078, "step": 325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 496.4, "completions/max_terminated_length": 496.4, "completions/mean_length": 158.73125, "completions/mean_terminated_length": 158.73125, "completions/min_length": 32.0, "completions/min_terminated_length": 32.0, "epoch": 0.5817540766857646, "grad_norm": 13.29499090198915, "kl": 0.1685302734375, "learning_rate": 1e-06, "loss": 0.0302, "num_tokens": 32972034.0, "reward": 0.5693387031555176, "reward_std": 0.20702467262744903, "rewards/format_reward/mean": 0.91640625, "rewards/format_reward/std": 0.2766654253005981, "rewards/qatch_metrics/mean": 0.5060119867324829, "rewards/qatch_metrics/std": 0.4102466404438019, "rewards/tag_count_reward/mean": 0.9517578125, "rewards/tag_count_reward/std": 0.16461062729358672, "step": 330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0015625, "completions/max_length": 1304.4, "completions/max_terminated_length": 579.8, "completions/mean_length": 160.171875, "completions/mean_terminated_length": 154.0051055908203, "completions/min_length": 44.2, "completions/min_terminated_length": 44.2, "epoch": 0.5905685323931247, "grad_norm": 0.935442580551547, "kl": 0.1378662109375, "learning_rate": 1e-06, "loss": 0.0212, "num_tokens": 33644622.0, "reward": 0.5727396726608276, "reward_std": 0.21638197600841522, "rewards/format_reward/mean": 0.8984375, "rewards/format_reward/std": 0.3004028916358948, "rewards/qatch_metrics/mean": 0.5123223960399628, "rewards/qatch_metrics/std": 0.43056052923202515, "rewards/tag_count_reward/mean": 0.9484375, "rewards/tag_count_reward/std": 0.15865270793437958, "step": 335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0015625, "completions/max_length": 1336.4, "completions/max_terminated_length": 600.0, "completions/mean_length": 169.71640625, "completions/mean_terminated_length": 163.57119750976562, "completions/min_length": 29.2, "completions/min_terminated_length": 29.2, "epoch": 0.5993829881004848, "grad_norm": 1.4959634071669228, "kl": 0.1254638671875, "learning_rate": 1e-06, "loss": 0.0391, "num_tokens": 34325763.0, "reward": 0.5609373271465301, "reward_std": 0.22974819540977479, "rewards/format_reward/mean": 0.88046875, "rewards/format_reward/std": 0.3236552834510803, "rewards/qatch_metrics/mean": 0.5021367311477661, "rewards/qatch_metrics/std": 0.4220038175582886, "rewards/tag_count_reward/mean": 0.921484375, "rewards/tag_count_reward/std": 0.21414475739002228, "step": 340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2653.0, "completions/max_terminated_length": 500.6, "completions/mean_length": 174.47734375, "completions/mean_terminated_length": 159.10546264648437, "completions/min_length": 33.6, "completions/min_terminated_length": 33.6, "epoch": 0.6081974438078449, "grad_norm": 0.8936813027459303, "kl": 0.1334228515625, "learning_rate": 1e-06, "loss": 0.068, "num_tokens": 35041510.0, "reward": 0.5479878842830658, "reward_std": 0.24380851686000823, "rewards/format_reward/mean": 0.915625, "rewards/format_reward/std": 0.2778396010398865, "rewards/qatch_metrics/mean": 0.4811346590518951, "rewards/qatch_metrics/std": 0.43007351756095885, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.17109024226665498, "step": 345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 997.4, "completions/max_terminated_length": 997.4, "completions/mean_length": 170.0734375, "completions/mean_terminated_length": 170.0734375, "completions/min_length": 35.0, "completions/min_terminated_length": 35.0, "epoch": 0.617011899515205, "grad_norm": 0.8929486085452816, "kl": 0.119775390625, "learning_rate": 1e-06, "loss": 0.0198, "num_tokens": 35733284.0, "reward": 0.5295659184455872, "reward_std": 0.2090097412467003, "rewards/format_reward/mean": 0.909375, "rewards/format_reward/std": 0.2840398609638214, "rewards/qatch_metrics/mean": 0.46024298667907715, "rewards/qatch_metrics/std": 0.4201698362827301, "rewards/tag_count_reward/mean": 0.9484375, "rewards/tag_count_reward/std": 0.171261465549469, "step": 350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0015625, "completions/max_length": 2099.4, "completions/max_terminated_length": 687.6, "completions/mean_length": 177.484375, "completions/mean_terminated_length": 171.34949951171876, "completions/min_length": 25.4, "completions/min_terminated_length": 25.4, "epoch": 0.625826355222565, "grad_norm": 0.971309519624497, "kl": 0.11083984375, "learning_rate": 1e-06, "loss": 0.0143, "num_tokens": 36425760.0, "reward": 0.5487818002700806, "reward_std": 0.23004478216171265, "rewards/format_reward/mean": 0.83671875, "rewards/format_reward/std": 0.3683928668498993, "rewards/qatch_metrics/mean": 0.49449974298477173, "rewards/qatch_metrics/std": 0.4279952645301819, "rewards/tag_count_reward/mean": 0.895703125, "rewards/tag_count_reward/std": 0.2428739696741104, "step": 355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0015625, "completions/max_length": 1983.4, "completions/max_terminated_length": 626.4, "completions/mean_length": 189.1375, "completions/mean_terminated_length": 183.03312072753906, "completions/min_length": 46.6, "completions/min_terminated_length": 46.6, "epoch": 0.6346408109299251, "grad_norm": 1.085714923968051, "kl": 0.111279296875, "learning_rate": 1e-06, "loss": 0.0412, "num_tokens": 37146560.0, "reward": 0.5465562880039215, "reward_std": 0.2106493055820465, "rewards/format_reward/mean": 0.9015625, "rewards/format_reward/std": 0.2943433105945587, "rewards/qatch_metrics/mean": 0.4813575744628906, "rewards/qatch_metrics/std": 0.41973625421524047, "rewards/tag_count_reward/mean": 0.944921875, "rewards/tag_count_reward/std": 0.1730790615081787, "step": 360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00078125, "completions/max_length": 1243.6, "completions/max_terminated_length": 550.0, "completions/mean_length": 177.896875, "completions/mean_terminated_length": 174.8417541503906, "completions/min_length": 26.8, "completions/min_terminated_length": 26.8, "epoch": 0.6434552666372851, "grad_norm": 0.9896824938718284, "kl": 0.115478515625, "learning_rate": 1e-06, "loss": 0.041, "num_tokens": 37832268.0, "reward": 0.5586158275604248, "reward_std": 0.2251075476408005, "rewards/format_reward/mean": 0.87421875, "rewards/format_reward/std": 0.32932343482971194, "rewards/qatch_metrics/mean": 0.49995704293251036, "rewards/qatch_metrics/std": 0.4248849630355835, "rewards/tag_count_reward/mean": 0.924609375, "rewards/tag_count_reward/std": 0.20914700627326965, "step": 365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00078125, "completions/max_length": 1282.6, "completions/max_terminated_length": 555.8, "completions/mean_length": 170.38203125, "completions/mean_terminated_length": 167.313427734375, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.6522697223446452, "grad_norm": 0.9278166717820118, "kl": 0.1187255859375, "learning_rate": 1e-06, "loss": 0.0193, "num_tokens": 38555925.0, "reward": 0.5743588328361511, "reward_std": 0.20262247920036316, "rewards/format_reward/mean": 0.8546875, "rewards/format_reward/std": 0.3492723762989044, "rewards/qatch_metrics/mean": 0.5217640638351441, "rewards/qatch_metrics/std": 0.4110603451728821, "rewards/tag_count_reward/mean": 0.9078125, "rewards/tag_count_reward/std": 0.22944335341453553, "step": 370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 458.4, "completions/max_terminated_length": 458.4, "completions/mean_length": 171.5578125, "completions/mean_terminated_length": 171.5578125, "completions/min_length": 33.8, "completions/min_terminated_length": 33.8, "epoch": 0.6610841780520053, "grad_norm": 0.9296512691094065, "kl": 0.1155029296875, "learning_rate": 1e-06, "loss": 0.0006, "num_tokens": 39234879.0, "reward": 0.6218206763267518, "reward_std": 0.20180206298828124, "rewards/format_reward/mean": 0.90703125, "rewards/format_reward/std": 0.29024410247802734, "rewards/qatch_metrics/mean": 0.5695247530937195, "rewards/qatch_metrics/std": 0.43560155630111697, "rewards/tag_count_reward/mean": 0.9404296875, "rewards/tag_count_reward/std": 0.1924948960542679, "step": 375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00078125, "completions/max_length": 1532.6, "completions/max_terminated_length": 808.2, "completions/mean_length": 177.89375, "completions/mean_terminated_length": 174.8284454345703, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.6698986337593653, "grad_norm": 0.9179048520524131, "kl": 0.1192138671875, "learning_rate": 1e-06, "loss": -0.0033, "num_tokens": 39936711.0, "reward": 0.548683899641037, "reward_std": 0.20842809975147247, "rewards/format_reward/mean": 0.8875, "rewards/format_reward/std": 0.3159508228302002, "rewards/qatch_metrics/mean": 0.48670990467071534, "rewards/qatch_metrics/std": 0.42913843393325807, "rewards/tag_count_reward/mean": 0.924609375, "rewards/tag_count_reward/std": 0.2165108621120453, "step": 380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00078125, "completions/max_length": 1210.4, "completions/max_terminated_length": 476.8, "completions/mean_length": 185.3, "completions/mean_terminated_length": 182.22890930175782, "completions/min_length": 37.8, "completions/min_terminated_length": 37.8, "epoch": 0.6787130894667255, "grad_norm": 1.0609912938864583, "kl": 0.1187255859375, "learning_rate": 1e-06, "loss": 0.0192, "num_tokens": 40628007.0, "reward": 0.5760585784912109, "reward_std": 0.22054702043533325, "rewards/format_reward/mean": 0.88984375, "rewards/format_reward/std": 0.3102767616510391, "rewards/qatch_metrics/mean": 0.5183065176010132, "rewards/qatch_metrics/std": 0.4284651458263397, "rewards/tag_count_reward/mean": 0.9302734375, "rewards/tag_count_reward/std": 0.20196012556552886, "step": 385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00078125, "completions/max_length": 1221.8, "completions/max_terminated_length": 604.6, "completions/mean_length": 170.103125, "completions/mean_terminated_length": 167.0412139892578, "completions/min_length": 28.4, "completions/min_terminated_length": 28.4, "epoch": 0.6875275451740855, "grad_norm": 1.0349341835531938, "kl": 0.1177978515625, "learning_rate": 1e-06, "loss": 0.0142, "num_tokens": 41336699.0, "reward": 0.5688169717788696, "reward_std": 0.21753813624382018, "rewards/format_reward/mean": 0.8546875, "rewards/format_reward/std": 0.35243783593177797, "rewards/qatch_metrics/mean": 0.5149684965610504, "rewards/qatch_metrics/std": 0.4279025971889496, "rewards/tag_count_reward/mean": 0.9125, "rewards/tag_count_reward/std": 0.22199150621891023, "step": 390 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0015625, "completions/max_length": 1952.2, "completions/max_terminated_length": 1016.6, "completions/mean_length": 194.11171875, "completions/mean_terminated_length": 188.0023986816406, "completions/min_length": 36.4, "completions/min_terminated_length": 36.4, "epoch": 0.6963420008814456, "grad_norm": 1.1198091080655637, "kl": 0.1160400390625, "learning_rate": 1e-06, "loss": 0.0349, "num_tokens": 42057866.0, "reward": 0.5434407353401184, "reward_std": 0.2424723982810974, "rewards/format_reward/mean": 0.8625, "rewards/format_reward/std": 0.3433255970478058, "rewards/qatch_metrics/mean": 0.48393073081970217, "rewards/qatch_metrics/std": 0.4233227550983429, "rewards/tag_count_reward/mean": 0.9169921875, "rewards/tag_count_reward/std": 0.21575720310211183, "step": 395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00078125, "completions/max_length": 1565.8, "completions/max_terminated_length": 859.6, "completions/mean_length": 195.30625, "completions/mean_terminated_length": 192.24614868164062, "completions/min_length": 36.6, "completions/min_terminated_length": 36.6, "epoch": 0.7051564565888057, "grad_norm": 0.9364233280028263, "kl": 0.109765625, "learning_rate": 1e-06, "loss": -0.009, "num_tokens": 42802098.0, "reward": 0.532480639219284, "reward_std": 0.20860818028450012, "rewards/format_reward/mean": 0.8578125, "rewards/format_reward/std": 0.3485052168369293, "rewards/qatch_metrics/mean": 0.4721968710422516, "rewards/qatch_metrics/std": 0.4088200509548187, "rewards/tag_count_reward/mean": 0.906640625, "rewards/tag_count_reward/std": 0.23517801761627197, "step": 400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00078125, "completions/max_length": 1322.0, "completions/max_terminated_length": 674.6, "completions/mean_length": 191.78125, "completions/mean_terminated_length": 188.7188934326172, "completions/min_length": 31.8, "completions/min_terminated_length": 31.8, "epoch": 0.7139709122961657, "grad_norm": 0.9184749770599845, "kl": 0.109228515625, "learning_rate": 1e-06, "loss": 0.0074, "num_tokens": 43511370.0, "reward": 0.6411563873291015, "reward_std": 0.206281441450119, "rewards/format_reward/mean": 0.89609375, "rewards/format_reward/std": 0.30500052571296693, "rewards/qatch_metrics/mean": 0.5939271092414856, "rewards/qatch_metrics/std": 0.4107288718223572, "rewards/tag_count_reward/mean": 0.9341796875, "rewards/tag_count_reward/std": 0.2010919064283371, "step": 405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0015625, "completions/max_length": 2370.0, "completions/max_terminated_length": 1083.0, "completions/mean_length": 203.7921875, "completions/mean_terminated_length": 197.71029968261718, "completions/min_length": 41.8, "completions/min_terminated_length": 41.8, "epoch": 0.7227853680035258, "grad_norm": 0.9498324772801405, "kl": 0.112841796875, "learning_rate": 1e-06, "loss": 0.023, "num_tokens": 44257808.0, "reward": 0.6159097194671631, "reward_std": 0.18089311718940734, "rewards/format_reward/mean": 0.88828125, "rewards/format_reward/std": 0.31010690331459045, "rewards/qatch_metrics/mean": 0.5651557564735412, "rewards/qatch_metrics/std": 0.407144832611084, "rewards/tag_count_reward/mean": 0.933984375, "rewards/tag_count_reward/std": 0.19542383253574372, "step": 410 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.003125, "completions/max_length": 1977.8, "completions/max_terminated_length": 646.4, "completions/mean_length": 204.78828125, "completions/mean_terminated_length": 192.57474365234376, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 0.7315998237108858, "grad_norm": 0.8990363515461627, "kl": 0.108154296875, "learning_rate": 1e-06, "loss": 0.0018, "num_tokens": 44986913.0, "reward": 0.5512337327003479, "reward_std": 0.20042451322078705, "rewards/format_reward/mean": 0.84296875, "rewards/format_reward/std": 0.3632165014743805, "rewards/qatch_metrics/mean": 0.4962239682674408, "rewards/qatch_metrics/std": 0.4146161139011383, "rewards/tag_count_reward/mean": 0.9029296875, "rewards/tag_count_reward/std": 0.23338495790958405, "step": 415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00078125, "completions/max_length": 1267.8, "completions/max_terminated_length": 545.2, "completions/mean_length": 207.953125, "completions/mean_terminated_length": 204.91087341308594, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 0.7404142794182459, "grad_norm": 0.8949487847379094, "kl": 0.1084228515625, "learning_rate": 1e-06, "loss": 0.0089, "num_tokens": 45704165.0, "reward": 0.5654355883598328, "reward_std": 0.21594917476177217, "rewards/format_reward/mean": 0.85703125, "rewards/format_reward/std": 0.35001330375671386, "rewards/qatch_metrics/mean": 0.5105997562408447, "rewards/qatch_metrics/std": 0.42142562866210936, "rewards/tag_count_reward/mean": 0.914453125, "rewards/tag_count_reward/std": 0.22119783163070678, "step": 420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0015625, "completions/max_length": 1989.4, "completions/max_terminated_length": 602.8, "completions/mean_length": 194.7515625, "completions/mean_terminated_length": 188.6415222167969, "completions/min_length": 25.6, "completions/min_terminated_length": 25.6, "epoch": 0.749228735125606, "grad_norm": 0.8443386346612493, "kl": 0.11494140625, "learning_rate": 1e-06, "loss": 0.0033, "num_tokens": 46430055.0, "reward": 0.5958282589912415, "reward_std": 0.19520920515060425, "rewards/format_reward/mean": 0.87109375, "rewards/format_reward/std": 0.3312843978404999, "rewards/qatch_metrics/mean": 0.544264841079712, "rewards/qatch_metrics/std": 0.4191899299621582, "rewards/tag_count_reward/mean": 0.921875, "rewards/tag_count_reward/std": 0.21222967505455018, "step": 425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.003125, "completions/max_length": 2666.0, "completions/max_terminated_length": 567.2, "completions/mean_length": 208.08203125, "completions/mean_terminated_length": 195.87457275390625, "completions/min_length": 32.6, "completions/min_terminated_length": 32.6, "epoch": 0.7580431908329661, "grad_norm": 0.9046123034832724, "kl": 0.1042724609375, "learning_rate": 1e-06, "loss": 0.0387, "num_tokens": 47156176.0, "reward": 0.6103429317474365, "reward_std": 0.22614607214927673, "rewards/format_reward/mean": 0.86640625, "rewards/format_reward/std": 0.3397656261920929, "rewards/qatch_metrics/mean": 0.5620072841644287, "rewards/qatch_metrics/std": 0.4072328984737396, "rewards/tag_count_reward/mean": 0.919921875, "rewards/tag_count_reward/std": 0.21127038300037385, "step": 430 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00078125, "completions/max_length": 1371.4, "completions/max_terminated_length": 650.8, "completions/mean_length": 200.70390625, "completions/mean_terminated_length": 197.6367401123047, "completions/min_length": 30.8, "completions/min_terminated_length": 30.8, "epoch": 0.7668576465403262, "grad_norm": 0.8934328290702316, "kl": 0.11044921875, "learning_rate": 1e-06, "loss": -0.0073, "num_tokens": 47893749.0, "reward": 0.526023668050766, "reward_std": 0.22173346281051637, "rewards/format_reward/mean": 0.89765625, "rewards/format_reward/std": 0.3027026534080505, "rewards/qatch_metrics/mean": 0.457994270324707, "rewards/qatch_metrics/std": 0.4191239416599274, "rewards/tag_count_reward/mean": 0.9392578125, "rewards/tag_count_reward/std": 0.19010738730430604, "step": 435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0015625, "completions/max_length": 1307.0, "completions/max_terminated_length": 619.6, "completions/mean_length": 213.5671875, "completions/mean_terminated_length": 207.5018341064453, "completions/min_length": 33.6, "completions/min_terminated_length": 33.6, "epoch": 0.7756721022476862, "grad_norm": 0.7709350601632311, "kl": 0.1083984375, "learning_rate": 1e-06, "loss": 0.0201, "num_tokens": 48637627.0, "reward": 0.5134056210517883, "reward_std": 0.19459065198898315, "rewards/format_reward/mean": 0.8875, "rewards/format_reward/std": 0.31534498929977417, "rewards/qatch_metrics/mean": 0.44468907117843626, "rewards/qatch_metrics/std": 0.3993754625320435, "rewards/tag_count_reward/mean": 0.9333984375, "rewards/tag_count_reward/std": 0.19436517655849456, "step": 440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00078125, "completions/max_length": 1255.0, "completions/max_terminated_length": 573.8, "completions/mean_length": 213.83359375, "completions/mean_terminated_length": 210.8156005859375, "completions/min_length": 29.8, "completions/min_terminated_length": 29.8, "epoch": 0.7844865579550463, "grad_norm": 0.9730549470078724, "kl": 0.11259765625, "learning_rate": 1e-06, "loss": 0.0194, "num_tokens": 49400534.0, "reward": 0.5392698287963867, "reward_std": 0.21343457698822021, "rewards/format_reward/mean": 0.89375, "rewards/format_reward/std": 0.30757365822792054, "rewards/qatch_metrics/mean": 0.4741064965724945, "rewards/qatch_metrics/std": 0.42416965365409853, "rewards/tag_count_reward/mean": 0.9380859375, "rewards/tag_count_reward/std": 0.18792852461338044, "step": 445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 657.8, "completions/max_terminated_length": 657.8, "completions/mean_length": 191.94375, "completions/mean_terminated_length": 191.94375, "completions/min_length": 28.4, "completions/min_terminated_length": 28.4, "epoch": 0.7933010136624064, "grad_norm": 0.8068324955270447, "kl": 0.12177734375, "learning_rate": 1e-06, "loss": -0.007, "num_tokens": 50124014.0, "reward": 0.5979775786399841, "reward_std": 0.18400471210479735, "rewards/format_reward/mean": 0.88125, "rewards/format_reward/std": 0.32406928539276125, "rewards/qatch_metrics/mean": 0.545139092206955, "rewards/qatch_metrics/std": 0.40265028476715087, "rewards/tag_count_reward/mean": 0.9296875, "rewards/tag_count_reward/std": 0.19919731020927428, "step": 450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00078125, "completions/max_length": 1665.4, "completions/max_terminated_length": 989.6, "completions/mean_length": 191.44140625, "completions/mean_terminated_length": 188.39669494628907, "completions/min_length": 26.8, "completions/min_terminated_length": 26.8, "epoch": 0.8021154693697664, "grad_norm": 0.9661639043459677, "kl": 0.120654296875, "learning_rate": 1e-06, "loss": 0.0026, "num_tokens": 50856579.0, "reward": 0.5794390618801117, "reward_std": 0.2076917439699173, "rewards/format_reward/mean": 0.91953125, "rewards/format_reward/std": 0.27112471759319307, "rewards/qatch_metrics/mean": 0.5176190257072448, "rewards/qatch_metrics/std": 0.41045997142791746, "rewards/tag_count_reward/mean": 0.9501953125, "rewards/tag_count_reward/std": 0.17092148661613465, "step": 455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 578.6, "completions/max_terminated_length": 578.6, "completions/mean_length": 185.61171875, "completions/mean_terminated_length": 185.61171875, "completions/min_length": 31.8, "completions/min_terminated_length": 31.8, "epoch": 0.8109299250771265, "grad_norm": 0.859237972466753, "kl": 0.1249755859375, "learning_rate": 1e-06, "loss": -0.0045, "num_tokens": 51568034.0, "reward": 0.5465836644172668, "reward_std": 0.17585654258728028, "rewards/format_reward/mean": 0.93515625, "rewards/format_reward/std": 0.24650255739688873, "rewards/qatch_metrics/mean": 0.4766333520412445, "rewards/qatch_metrics/std": 0.3998740196228027, "rewards/tag_count_reward/mean": 0.95859375, "rewards/tag_count_reward/std": 0.15964243412017823, "step": 460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00078125, "completions/max_length": 1242.8, "completions/max_terminated_length": 579.6, "completions/mean_length": 193.2140625, "completions/mean_terminated_length": 190.17308959960937, "completions/min_length": 37.8, "completions/min_terminated_length": 37.8, "epoch": 0.8197443807844865, "grad_norm": 0.7384329915764564, "kl": 0.112744140625, "learning_rate": 1e-06, "loss": -0.0108, "num_tokens": 52261716.0, "reward": 0.6089231491088867, "reward_std": 0.18263671100139617, "rewards/format_reward/mean": 0.92421875, "rewards/format_reward/std": 0.26459681391716006, "rewards/qatch_metrics/mean": 0.5516513049602508, "rewards/qatch_metrics/std": 0.4096936106681824, "rewards/tag_count_reward/mean": 0.951953125, "rewards/tag_count_reward/std": 0.17155620753765105, "step": 465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 591.8, "completions/max_terminated_length": 591.8, "completions/mean_length": 196.45390625, "completions/mean_terminated_length": 196.45390625, "completions/min_length": 31.2, "completions/min_terminated_length": 31.2, "epoch": 0.8285588364918466, "grad_norm": 1.0025973293270143, "kl": 0.1135009765625, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 52953113.0, "reward": 0.6507824778556823, "reward_std": 0.20129505693912506, "rewards/format_reward/mean": 0.91484375, "rewards/format_reward/std": 0.27930967807769774, "rewards/qatch_metrics/mean": 0.6021958470344544, "rewards/qatch_metrics/std": 0.3972749710083008, "rewards/tag_count_reward/mean": 0.9486328125, "rewards/tag_count_reward/std": 0.1716614156961441, "step": 470 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 638.0, "completions/max_terminated_length": 638.0, "completions/mean_length": 213.11171875, "completions/mean_terminated_length": 213.11171875, "completions/min_length": 34.6, "completions/min_terminated_length": 34.6, "epoch": 0.8373732921992068, "grad_norm": 0.8516678257406487, "kl": 0.108935546875, "learning_rate": 1e-06, "loss": 0.0161, "num_tokens": 53659128.0, "reward": 0.5703884243965149, "reward_std": 0.20974204540252686, "rewards/format_reward/mean": 0.9171875, "rewards/format_reward/std": 0.27448596358299254, "rewards/qatch_metrics/mean": 0.5072354257106781, "rewards/qatch_metrics/std": 0.42126131653785703, "rewards/tag_count_reward/mean": 0.950390625, "rewards/tag_count_reward/std": 0.16825708746910095, "step": 475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0015625, "completions/max_length": 1347.6, "completions/max_terminated_length": 668.8, "completions/mean_length": 221.3375, "completions/mean_terminated_length": 215.31790161132812, "completions/min_length": 35.8, "completions/min_terminated_length": 35.8, "epoch": 0.8461877479065668, "grad_norm": 0.8676387330449279, "kl": 0.1122314453125, "learning_rate": 1e-06, "loss": -0.0145, "num_tokens": 54425080.0, "reward": 0.5916900038719177, "reward_std": 0.20206353664398194, "rewards/format_reward/mean": 0.88203125, "rewards/format_reward/std": 0.3205987274646759, "rewards/qatch_metrics/mean": 0.5378453254699707, "rewards/qatch_metrics/std": 0.41082814931869505, "rewards/tag_count_reward/mean": 0.9263671875, "rewards/tag_count_reward/std": 0.20377787947654724, "step": 480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00078125, "completions/max_length": 1610.6, "completions/max_terminated_length": 923.2, "completions/mean_length": 224.6875, "completions/mean_terminated_length": 221.65450744628907, "completions/min_length": 27.4, "completions/min_terminated_length": 27.4, "epoch": 0.8550022036139269, "grad_norm": 0.7904555962961125, "kl": 0.109765625, "learning_rate": 1e-06, "loss": 0.0017, "num_tokens": 55227528.0, "reward": 0.5810864806175232, "reward_std": 0.2380138784646988, "rewards/format_reward/mean": 0.853125, "rewards/format_reward/std": 0.3541332304477692, "rewards/qatch_metrics/mean": 0.5300695478916169, "rewards/qatch_metrics/std": 0.4341892719268799, "rewards/tag_count_reward/mean": 0.904296875, "rewards/tag_count_reward/std": 0.23573453426361085, "step": 485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00234375, "completions/max_length": 1990.4, "completions/max_terminated_length": 623.2, "completions/mean_length": 218.89296875, "completions/mean_terminated_length": 209.80899353027343, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.8638166593212869, "grad_norm": 0.8342126282923776, "kl": 0.106396484375, "learning_rate": 1e-06, "loss": 0.0082, "num_tokens": 55974703.0, "reward": 0.5714076519012451, "reward_std": 0.20491171181201934, "rewards/format_reward/mean": 0.8515625, "rewards/format_reward/std": 0.35569257140159605, "rewards/qatch_metrics/mean": 0.5185333371162415, "rewards/qatch_metrics/std": 0.41171206831932067, "rewards/tag_count_reward/mean": 0.9099609375, "rewards/tag_count_reward/std": 0.22593727111816406, "step": 490 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0015625, "completions/max_length": 1373.0, "completions/max_terminated_length": 707.4, "completions/mean_length": 209.70390625, "completions/mean_terminated_length": 203.64548950195314, "completions/min_length": 37.2, "completions/min_terminated_length": 37.2, "epoch": 0.872631115028647, "grad_norm": 0.8070043867292849, "kl": 0.1049560546875, "learning_rate": 1e-06, "loss": -0.0094, "num_tokens": 56709412.0, "reward": 0.6168586254119873, "reward_std": 0.20435989499092103, "rewards/format_reward/mean": 0.85546875, "rewards/format_reward/std": 0.34868985414505005, "rewards/qatch_metrics/mean": 0.5714651107788086, "rewards/qatch_metrics/std": 0.42900125980377196, "rewards/tag_count_reward/mean": 0.911328125, "rewards/tag_count_reward/std": 0.22259356081485748, "step": 495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00078125, "completions/max_length": 1373.4, "completions/max_terminated_length": 661.4, "completions/mean_length": 218.89609375, "completions/mean_terminated_length": 215.85523681640626, "completions/min_length": 38.6, "completions/min_terminated_length": 38.6, "epoch": 0.881445570736007, "grad_norm": 0.8578105605653167, "kl": 0.1009765625, "learning_rate": 1e-06, "loss": -0.0007, "num_tokens": 57464591.0, "reward": 0.5597202479839325, "reward_std": 0.22529322803020477, "rewards/format_reward/mean": 0.865625, "rewards/format_reward/std": 0.3411052882671356, "rewards/qatch_metrics/mean": 0.5026809990406036, "rewards/qatch_metrics/std": 0.4211664915084839, "rewards/tag_count_reward/mean": 0.917578125, "rewards/tag_count_reward/std": 0.21666719317436217, "step": 500 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0015625, "completions/max_length": 1334.0, "completions/max_terminated_length": 611.0, "completions/mean_length": 209.959375, "completions/mean_terminated_length": 203.84853515625, "completions/min_length": 26.2, "completions/min_terminated_length": 26.2, "epoch": 0.8902600264433671, "grad_norm": 0.8021142913277886, "kl": 0.108203125, "learning_rate": 1e-06, "loss": -0.0275, "num_tokens": 58213131.0, "reward": 0.5699510633945465, "reward_std": 0.2101448118686676, "rewards/format_reward/mean": 0.82578125, "rewards/format_reward/std": 0.3783671915531158, "rewards/qatch_metrics/mean": 0.5208638191223145, "rewards/qatch_metrics/std": 0.41592952609062195, "rewards/tag_count_reward/mean": 0.8927734375, "rewards/tag_count_reward/std": 0.2440448522567749, "step": 505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 2336.6, "completions/max_terminated_length": 925.2, "completions/mean_length": 245.92890625, "completions/mean_terminated_length": 230.83008422851563, "completions/min_length": 29.8, "completions/min_terminated_length": 29.8, "epoch": 0.8990744821507272, "grad_norm": 0.7883812832224967, "kl": 0.100732421875, "learning_rate": 1e-06, "loss": 0.0226, "num_tokens": 59018080.0, "reward": 0.5421915054321289, "reward_std": 0.21591668128967284, "rewards/format_reward/mean": 0.80546875, "rewards/format_reward/std": 0.3959254801273346, "rewards/qatch_metrics/mean": 0.49160627126693723, "rewards/qatch_metrics/std": 0.41863099932670594, "rewards/tag_count_reward/mean": 0.8755859375, "rewards/tag_count_reward/std": 0.2644981533288956, "step": 510 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 618.4, "completions/max_terminated_length": 618.4, "completions/mean_length": 208.48984375, "completions/mean_terminated_length": 208.48984375, "completions/min_length": 34.8, "completions/min_terminated_length": 34.8, "epoch": 0.9078889378580872, "grad_norm": 0.903154309567232, "kl": 0.111083984375, "learning_rate": 1e-06, "loss": -0.0072, "num_tokens": 59739139.0, "reward": 0.632229495048523, "reward_std": 0.19765791296958923, "rewards/format_reward/mean": 0.83125, "rewards/format_reward/std": 0.368955659866333, "rewards/qatch_metrics/mean": 0.593443238735199, "rewards/qatch_metrics/std": 0.4310678899288177, "rewards/tag_count_reward/mean": 0.8935546875, "rewards/tag_count_reward/std": 0.23964128494262696, "step": 515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00078125, "completions/max_length": 1252.4, "completions/max_terminated_length": 611.6, "completions/mean_length": 212.459375, "completions/mean_terminated_length": 209.4299072265625, "completions/min_length": 33.4, "completions/min_terminated_length": 33.4, "epoch": 0.9167033935654474, "grad_norm": 0.8346843956683994, "kl": 0.1077392578125, "learning_rate": 1e-06, "loss": 0.0163, "num_tokens": 60466559.0, "reward": 0.5584656774997712, "reward_std": 0.23274661898612975, "rewards/format_reward/mean": 0.8796875, "rewards/format_reward/std": 0.32524962425231935, "rewards/qatch_metrics/mean": 0.4991369664669037, "rewards/qatch_metrics/std": 0.41381397247314455, "rewards/tag_count_reward/mean": 0.924609375, "rewards/tag_count_reward/std": 0.21089179813861847, "step": 520 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00078125, "completions/max_length": 1248.4, "completions/max_terminated_length": 638.4, "completions/mean_length": 210.87109375, "completions/mean_terminated_length": 207.83172912597655, "completions/min_length": 28.4, "completions/min_terminated_length": 28.4, "epoch": 0.9255178492728074, "grad_norm": 0.9740281006839744, "kl": 0.1169189453125, "learning_rate": 1e-06, "loss": 0.0021, "num_tokens": 61228986.0, "reward": 0.5737495183944702, "reward_std": 0.221232670545578, "rewards/format_reward/mean": 0.8328125, "rewards/format_reward/std": 0.3710750341415405, "rewards/qatch_metrics/mean": 0.5245283961296081, "rewards/qatch_metrics/std": 0.42740072011947633, "rewards/tag_count_reward/mean": 0.8923828125, "rewards/tag_count_reward/std": 0.24619007110595703, "step": 525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00234375, "completions/max_length": 2158.4, "completions/max_terminated_length": 753.4, "completions/mean_length": 201.10390625, "completions/mean_terminated_length": 191.97602233886718, "completions/min_length": 21.4, "completions/min_terminated_length": 21.4, "epoch": 0.9343323049801675, "grad_norm": 0.8087287498541261, "kl": 0.1182373046875, "learning_rate": 1e-06, "loss": -0.0014, "num_tokens": 61969759.0, "reward": 0.5955200791358948, "reward_std": 0.2014760673046112, "rewards/format_reward/mean": 0.84140625, "rewards/format_reward/std": 0.35906914472579954, "rewards/qatch_metrics/mean": 0.5486127734184265, "rewards/qatch_metrics/std": 0.39445692896842954, "rewards/tag_count_reward/mean": 0.901171875, "rewards/tag_count_reward/std": 0.23216934502124786, "step": 530 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0015625, "completions/max_length": 1311.6, "completions/max_terminated_length": 636.6, "completions/mean_length": 200.63671875, "completions/mean_terminated_length": 194.58396911621094, "completions/min_length": 31.6, "completions/min_terminated_length": 31.6, "epoch": 0.9431467606875276, "grad_norm": 1.0244359727988313, "kl": 0.1111083984375, "learning_rate": 1e-06, "loss": -0.0009, "num_tokens": 62673742.0, "reward": 0.5657651543617248, "reward_std": 0.18060422837734222, "rewards/format_reward/mean": 0.88671875, "rewards/format_reward/std": 0.3158248126506805, "rewards/qatch_metrics/mean": 0.5065067887306214, "rewards/qatch_metrics/std": 0.3941995918750763, "rewards/tag_count_reward/mean": 0.93125, "rewards/tag_count_reward/std": 0.1997167259454727, "step": 535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00234375, "completions/max_length": 1975.2, "completions/max_terminated_length": 528.8, "completions/mean_length": 192.11484375, "completions/mean_terminated_length": 182.94750061035157, "completions/min_length": 21.8, "completions/min_terminated_length": 21.8, "epoch": 0.9519612163948876, "grad_norm": 0.9450830973150219, "kl": 0.106689453125, "learning_rate": 1e-06, "loss": 0.0018, "num_tokens": 63388433.0, "reward": 0.5951115846633911, "reward_std": 0.19603927731513976, "rewards/format_reward/mean": 0.859375, "rewards/format_reward/std": 0.34651567935943606, "rewards/qatch_metrics/mean": 0.5458343744277954, "rewards/qatch_metrics/std": 0.4278919756412506, "rewards/tag_count_reward/mean": 0.904296875, "rewards/tag_count_reward/std": 0.24298664927482605, "step": 540 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00234375, "completions/max_length": 1540.4, "completions/max_terminated_length": 1090.8, "completions/mean_length": 190.159375, "completions/mean_terminated_length": 181.03360290527343, "completions/min_length": 35.8, "completions/min_terminated_length": 35.8, "epoch": 0.9607756721022477, "grad_norm": 0.9291286887400723, "kl": 0.1108642578125, "learning_rate": 1e-06, "loss": 0.0375, "num_tokens": 64092413.0, "reward": 0.6795460700988769, "reward_std": 0.20811468064785005, "rewards/format_reward/mean": 0.91328125, "rewards/format_reward/std": 0.280303093791008, "rewards/qatch_metrics/mean": 0.6362651109695434, "rewards/qatch_metrics/std": 0.4117369055747986, "rewards/tag_count_reward/mean": 0.9478515625, "rewards/tag_count_reward/std": 0.18190329372882844, "step": 545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 560.4, "completions/max_terminated_length": 560.4, "completions/mean_length": 181.5375, "completions/mean_terminated_length": 181.5375, "completions/min_length": 35.6, "completions/min_terminated_length": 35.6, "epoch": 0.9695901278096077, "grad_norm": 0.9490785592275803, "kl": 0.1166015625, "learning_rate": 1e-06, "loss": -0.0105, "num_tokens": 64759389.0, "reward": 0.577846372127533, "reward_std": 0.19823800325393676, "rewards/format_reward/mean": 0.91328125, "rewards/format_reward/std": 0.28074146509170533, "rewards/qatch_metrics/mean": 0.5166414439678192, "rewards/qatch_metrics/std": 0.4310955286026001, "rewards/tag_count_reward/mean": 0.9474609375, "rewards/tag_count_reward/std": 0.1789928376674652, "step": 550 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0015625, "completions/max_length": 2067.2, "completions/max_terminated_length": 755.6, "completions/mean_length": 197.115625, "completions/mean_terminated_length": 191.02464904785157, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 0.9784045835169678, "grad_norm": 0.90442977548872, "kl": 0.108544921875, "learning_rate": 1e-06, "loss": 0.0203, "num_tokens": 65477873.0, "reward": 0.5961843609809876, "reward_std": 0.20585475862026215, "rewards/format_reward/mean": 0.92421875, "rewards/format_reward/std": 0.2640294134616852, "rewards/qatch_metrics/mean": 0.5363083481788635, "rewards/qatch_metrics/std": 0.41726168990135193, "rewards/tag_count_reward/mean": 0.9580078125, "rewards/tag_count_reward/std": 0.1562621772289276, "step": 555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.003125, "completions/max_length": 3376.8, "completions/max_terminated_length": 587.8, "completions/mean_length": 195.74296875, "completions/mean_terminated_length": 183.5094757080078, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "epoch": 0.9872190392243279, "grad_norm": 0.9502895043232452, "kl": 0.112939453125, "learning_rate": 1e-06, "loss": 0.0127, "num_tokens": 66214488.0, "reward": 0.6302931666374206, "reward_std": 0.21948930323123933, "rewards/format_reward/mean": 0.88515625, "rewards/format_reward/std": 0.31716270446777345, "rewards/qatch_metrics/mean": 0.5827322959899902, "rewards/qatch_metrics/std": 0.4163429081439972, "rewards/tag_count_reward/mean": 0.9291015625, "rewards/tag_count_reward/std": 0.20648659765720367, "step": 560 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.003125, "completions/max_length": 3388.8, "completions/max_terminated_length": 552.8, "completions/mean_length": 203.7375, "completions/mean_terminated_length": 191.53135375976564, "completions/min_length": 29.8, "completions/min_terminated_length": 29.8, "epoch": 0.996033494931688, "grad_norm": 1.0451866474159504, "kl": 0.110400390625, "learning_rate": 1e-06, "loss": 0.0071, "num_tokens": 66948152.0, "reward": 0.5372519016265869, "reward_std": 0.20013673603534698, "rewards/format_reward/mean": 0.8640625, "rewards/format_reward/std": 0.34239274859428404, "rewards/qatch_metrics/mean": 0.4767416715621948, "rewards/qatch_metrics/std": 0.3842666923999786, "rewards/tag_count_reward/mean": 0.9123046875, "rewards/tag_count_reward/std": 0.23215168714523315, "step": 565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 2463.5, "completions/max_terminated_length": 763.0, "completions/mean_length": 191.5, "completions/mean_terminated_length": 183.86861419677734, "completions/min_length": 30.5, "completions/min_terminated_length": 30.5, "epoch": 0.999559277214632, "kl": 0.11181640625, "num_tokens": 67212456.0, "reward": 0.6837565302848816, "reward_std": 0.19374996423721313, "rewards/format_reward/mean": 0.8828125, "rewards/format_reward/std": 0.32204362750053406, "rewards/qatch_metrics/mean": 0.6466471254825592, "rewards/qatch_metrics/std": 0.3814842849969864, "rewards/tag_count_reward/mean": 0.91650390625, "rewards/tag_count_reward/std": 0.2326364442706108, "step": 567, "total_flos": 0.0, "train_loss": 0.0005930395028184331, "train_runtime": 32371.3135, "train_samples_per_second": 0.28, "train_steps_per_second": 0.018 } ], "logging_steps": 5, "max_steps": 567, "num_input_tokens_seen": 67212456, "num_train_epochs": 1, "save_steps": 5, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }