{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.23337222870478413, "eval_steps": 500, "global_step": 2200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1719.0, "completions/mean_length": 680.875, "completions/mean_terminated_length": 427.7037048339844, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "epoch": 0.00010607828577490188, "frac_reward_zero_std": 0.0, "grad_norm": 1.453125, "kl": 0.0, "learning_rate": 8e-06, "loss": 0.3843, "num_tokens": 79324.0, "reward": 1.4974994659423828, "reward_std": 0.8458996415138245, "rewards/reward_fn/mean": 1.4974994659423828, "rewards/reward_fn/std": 0.8458995819091797, "step": 1 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1127.0, "completions/max_terminated_length": 1127.0, "completions/mean_length": 215.25, "completions/mean_terminated_length": 215.25, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.00021215657154980376, "frac_reward_zero_std": 0.0, "grad_norm": 2.453125, "kl": 0.008525475102942437, "learning_rate": 7.9996e-06, "loss": 0.0045, "num_tokens": 116452.0, "reward": 2.7323050498962402, "reward_std": 0.185869961977005, "rewards/reward_fn/mean": 2.7323050498962402, "rewards/reward_fn/std": 0.1858699470758438, "step": 2 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 332.0, "completions/max_terminated_length": 332.0, "completions/mean_length": 81.03125, "completions/mean_terminated_length": 81.03125, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 0.0003182348573247056, "frac_reward_zero_std": 1.0, "grad_norm": 0.2158203125, "kl": 0.01689625042490661, "learning_rate": 7.9992e-06, "loss": 0.0007, "num_tokens": 155109.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 3 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1005.0, "completions/max_terminated_length": 1005.0, "completions/mean_length": 332.375, "completions/mean_terminated_length": 332.375, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 0.0004243131430996075, "frac_reward_zero_std": 0.0, "grad_norm": 1.3203125, "kl": 0.007169858436100185, "learning_rate": 7.9988e-06, "loss": 0.0173, "num_tokens": 206385.0, "reward": 2.862459897994995, "reward_std": 0.067531056702137, "rewards/reward_fn/mean": 2.862459897994995, "rewards/reward_fn/std": 0.067531056702137, "step": 4 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 397.0, "completions/max_terminated_length": 397.0, "completions/mean_length": 239.96875, "completions/mean_terminated_length": 239.96875, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 0.0005303914288745094, "frac_reward_zero_std": 1.0, "grad_norm": 0.08251953125, "kl": 0.012080345884896815, "learning_rate": 7.9984e-06, "loss": 0.0005, "num_tokens": 246896.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 5 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 528.0, "completions/max_terminated_length": 528.0, "completions/mean_length": 268.71875, "completions/mean_terminated_length": 268.71875, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 0.0006364697146494112, "frac_reward_zero_std": 0.0, "grad_norm": 2.203125, "kl": 0.011115878820419312, "learning_rate": 7.998e-06, "loss": 0.0878, "num_tokens": 284999.0, "reward": 3.22149658203125, "reward_std": 0.9693878889083862, "rewards/reward_fn/mean": 3.22149658203125, "rewards/reward_fn/std": 0.969387948513031, "step": 6 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1717.0, "completions/max_terminated_length": 1717.0, "completions/mean_length": 400.0, "completions/mean_terminated_length": 400.0, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 0.0007425480004243131, "frac_reward_zero_std": 0.0, "grad_norm": 1.0703125, "kl": 0.007328710809815675, "learning_rate": 7.9976e-06, "loss": -0.119, "num_tokens": 331847.0, "reward": 2.9880638122558594, "reward_std": 0.7341222763061523, "rewards/reward_fn/mean": 2.9880638122558594, "rewards/reward_fn/std": 0.7341222763061523, "step": 7 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1423.0, "completions/mean_length": 340.5625, "completions/mean_terminated_length": 285.4838562011719, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.000848626286199215, "frac_reward_zero_std": 0.0, "grad_norm": 2.09375, "kl": 0.00942237873096019, "learning_rate": 7.9972e-06, "loss": 0.3223, "num_tokens": 384729.0, "reward": 3.8373069763183594, "reward_std": 0.731940746307373, "rewards/reward_fn/mean": 3.8373069763183594, "rewards/reward_fn/std": 0.731940746307373, "step": 8 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 429.0, "completions/max_terminated_length": 429.0, "completions/mean_length": 165.0625, "completions/mean_terminated_length": 165.0625, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "epoch": 0.0009547045719741168, "frac_reward_zero_std": 1.0, "grad_norm": 0.08935546875, "kl": 0.013073404785245657, "learning_rate": 7.9968e-06, "loss": 0.0005, "num_tokens": 422107.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 9 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 946.0, "completions/max_terminated_length": 946.0, "completions/mean_length": 356.03125, "completions/mean_terminated_length": 356.03125, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "epoch": 0.0010607828577490189, "frac_reward_zero_std": 0.0, "grad_norm": 1.4609375, "kl": 0.008752175432164222, "learning_rate": 7.9964e-06, "loss": 0.0989, "num_tokens": 488924.0, "reward": 3.7715067863464355, "reward_std": 0.5657153129577637, "rewards/reward_fn/mean": 3.7715067863464355, "rewards/reward_fn/std": 0.5657153725624084, "step": 10 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 458.0, "completions/max_terminated_length": 458.0, "completions/mean_length": 132.59375, "completions/mean_terminated_length": 132.59375, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "epoch": 0.0011668611435239206, "frac_reward_zero_std": 0.0, "grad_norm": 2.796875, "kl": 0.013385511934757233, "learning_rate": 7.996e-06, "loss": -0.0505, "num_tokens": 529903.0, "reward": 3.9310073852539062, "reward_std": 0.2188190072774887, "rewards/reward_fn/mean": 3.9310073852539062, "rewards/reward_fn/std": 0.2188190221786499, "step": 11 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 412.0, "completions/max_terminated_length": 412.0, "completions/mean_length": 254.96875, "completions/mean_terminated_length": 254.96875, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.0012729394292988225, "frac_reward_zero_std": 1.0, "grad_norm": 0.07568359375, "kl": 0.011238898383453488, "learning_rate": 7.995599999999998e-06, "loss": 0.0004, "num_tokens": 570670.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 12 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1664.0, "completions/max_terminated_length": 1664.0, "completions/mean_length": 629.1875, "completions/mean_terminated_length": 629.1875, "completions/min_length": 346.0, "completions/min_terminated_length": 346.0, "epoch": 0.0013790177150737244, "frac_reward_zero_std": 0.0, "grad_norm": 1.25, "kl": 0.007070357620250434, "learning_rate": 7.9952e-06, "loss": 0.0796, "num_tokens": 638228.0, "reward": 3.1080098152160645, "reward_std": 0.934657096862793, "rewards/reward_fn/mean": 3.1080098152160645, "rewards/reward_fn/std": 0.934657096862793, "step": 13 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1005.0, "completions/mean_length": 623.21875, "completions/mean_terminated_length": 475.82757568359375, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "epoch": 0.0014850960008486263, "frac_reward_zero_std": 0.0, "grad_norm": 0.984375, "kl": 0.007602842408232391, "learning_rate": 7.9948e-06, "loss": 0.0701, "num_tokens": 698267.0, "reward": 2.447237014770508, "reward_std": 0.8652064204216003, "rewards/reward_fn/mean": 2.447237014770508, "rewards/reward_fn/std": 0.8652064204216003, "step": 14 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 363.0, "completions/max_terminated_length": 363.0, "completions/mean_length": 247.125, "completions/mean_terminated_length": 247.125, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.0015911742866235282, "frac_reward_zero_std": 0.0, "grad_norm": 1.9609375, "kl": 0.012013918720185757, "learning_rate": 7.9944e-06, "loss": -0.0071, "num_tokens": 745183.0, "reward": 3.933253765106201, "reward_std": 0.2631880044937134, "rewards/reward_fn/mean": 3.933253765106201, "rewards/reward_fn/std": 0.263187974691391, "step": 15 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1180.0, "completions/max_terminated_length": 1180.0, "completions/mean_length": 416.46875, "completions/mean_terminated_length": 416.46875, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "epoch": 0.00169725257239843, "frac_reward_zero_std": 0.0, "grad_norm": 1.4921875, "kl": 0.01027172978501767, "learning_rate": 7.994e-06, "loss": 0.0592, "num_tokens": 789742.0, "reward": 2.780426502227783, "reward_std": 0.21798433363437653, "rewards/reward_fn/mean": 2.780426502227783, "rewards/reward_fn/std": 0.21798425912857056, "step": 16 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 463.0, "completions/max_terminated_length": 463.0, "completions/mean_length": 223.59375, "completions/mean_terminated_length": 223.59375, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.001803330858173332, "frac_reward_zero_std": 0.0, "grad_norm": 1.65625, "kl": 0.013605716056190431, "learning_rate": 7.9936e-06, "loss": 0.012, "num_tokens": 835201.0, "reward": 2.994292736053467, "reward_std": 0.4485239088535309, "rewards/reward_fn/mean": 2.994292736053467, "rewards/reward_fn/std": 0.4485238790512085, "step": 17 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 963.0, "completions/max_terminated_length": 963.0, "completions/mean_length": 200.9375, "completions/mean_terminated_length": 200.9375, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 0.0019094091439482337, "frac_reward_zero_std": 0.0, "grad_norm": 2.59375, "kl": 0.011645498918369412, "learning_rate": 7.9932e-06, "loss": 0.0843, "num_tokens": 862143.0, "reward": 3.747105121612549, "reward_std": 0.5640392899513245, "rewards/reward_fn/mean": 3.747105121612549, "rewards/reward_fn/std": 0.5640392899513245, "step": 18 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 639.0, "completions/max_terminated_length": 639.0, "completions/mean_length": 387.59375, "completions/mean_terminated_length": 387.59375, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "epoch": 0.002015487429723136, "frac_reward_zero_std": 0.0, "grad_norm": 1.265625, "kl": 0.007694089494179934, "learning_rate": 7.992799999999999e-06, "loss": 0.0621, "num_tokens": 910226.0, "reward": 3.5295071601867676, "reward_std": 0.6173213124275208, "rewards/reward_fn/mean": 3.5295071601867676, "rewards/reward_fn/std": 0.617321252822876, "step": 19 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 453.0, "completions/max_terminated_length": 453.0, "completions/mean_length": 208.96875, "completions/mean_terminated_length": 208.96875, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.0021215657154980377, "frac_reward_zero_std": 0.0, "grad_norm": 2.09375, "kl": 0.015368042862974107, "learning_rate": 7.9924e-06, "loss": 0.0099, "num_tokens": 940337.0, "reward": 3.5482964515686035, "reward_std": 0.8371409177780151, "rewards/reward_fn/mean": 3.5482964515686035, "rewards/reward_fn/std": 0.8371408581733704, "step": 20 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 605.0, "completions/max_terminated_length": 605.0, "completions/mean_length": 173.0, "completions/mean_terminated_length": 173.0, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "epoch": 0.0022276440012729396, "frac_reward_zero_std": 0.0, "grad_norm": 2.140625, "kl": 0.014460677281022072, "learning_rate": 7.991999999999999e-06, "loss": -0.0116, "num_tokens": 960817.0, "reward": 3.8916022777557373, "reward_std": 0.45123252272605896, "rewards/reward_fn/mean": 3.8916022777557373, "rewards/reward_fn/std": 0.4512324333190918, "step": 21 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 575.0, "completions/max_terminated_length": 575.0, "completions/mean_length": 380.53125, "completions/mean_terminated_length": 380.53125, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "epoch": 0.002333722287047841, "frac_reward_zero_std": 0.0, "grad_norm": 1.09375, "kl": 0.010017184424214065, "learning_rate": 7.9916e-06, "loss": 0.0591, "num_tokens": 1025506.0, "reward": 2.726224184036255, "reward_std": 0.2123180776834488, "rewards/reward_fn/mean": 2.726224184036255, "rewards/reward_fn/std": 0.2123180478811264, "step": 22 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1530.0, "completions/max_terminated_length": 1530.0, "completions/mean_length": 537.15625, "completions/mean_terminated_length": 537.15625, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "epoch": 0.002439800572822743, "frac_reward_zero_std": 0.0, "grad_norm": 1.078125, "kl": 0.008808196464087814, "learning_rate": 7.991199999999999e-06, "loss": -0.0658, "num_tokens": 1081447.0, "reward": 2.7414865493774414, "reward_std": 0.947241485118866, "rewards/reward_fn/mean": 2.7414865493774414, "rewards/reward_fn/std": 0.9472415447235107, "step": 23 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 751.0, "completions/max_terminated_length": 751.0, "completions/mean_length": 412.5625, "completions/mean_terminated_length": 412.5625, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "epoch": 0.002545878858597645, "frac_reward_zero_std": 0.0, "grad_norm": 1.03125, "kl": 0.009269667672924697, "learning_rate": 7.9908e-06, "loss": -0.0071, "num_tokens": 1141273.0, "reward": 2.676821231842041, "reward_std": 0.5632266998291016, "rewards/reward_fn/mean": 2.676821231842041, "rewards/reward_fn/std": 0.5632267594337463, "step": 24 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 439.0, "completions/max_terminated_length": 439.0, "completions/mean_length": 322.75, "completions/mean_terminated_length": 322.75, "completions/min_length": 224.0, "completions/min_terminated_length": 224.0, "epoch": 0.002651957144372547, "frac_reward_zero_std": 0.0, "grad_norm": 1.3671875, "kl": 0.011170931567903608, "learning_rate": 7.9904e-06, "loss": 0.0352, "num_tokens": 1196913.0, "reward": 3.0973877906799316, "reward_std": 0.7358191609382629, "rewards/reward_fn/mean": 3.0973877906799316, "rewards/reward_fn/std": 0.7358191609382629, "step": 25 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 928.0, "completions/max_terminated_length": 928.0, "completions/mean_length": 245.03125, "completions/mean_terminated_length": 245.03125, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 0.0027580354301474487, "frac_reward_zero_std": 0.0, "grad_norm": 2.15625, "kl": 0.01611144037451595, "learning_rate": 7.99e-06, "loss": 0.1088, "num_tokens": 1239122.0, "reward": 2.7557084560394287, "reward_std": 0.3039965033531189, "rewards/reward_fn/mean": 2.7557084560394287, "rewards/reward_fn/std": 0.3039964735507965, "step": 26 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 336.0, "completions/max_terminated_length": 336.0, "completions/mean_length": 174.46875, "completions/mean_terminated_length": 174.46875, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.0028641137159223506, "frac_reward_zero_std": 0.0, "grad_norm": 2.53125, "kl": 0.016508221509866416, "learning_rate": 7.9896e-06, "loss": 0.0277, "num_tokens": 1273281.0, "reward": 3.8924479484558105, "reward_std": 0.4453147053718567, "rewards/reward_fn/mean": 3.8924479484558105, "rewards/reward_fn/std": 0.4453147053718567, "step": 27 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 559.0, "completions/max_terminated_length": 559.0, "completions/mean_length": 229.53125, "completions/mean_terminated_length": 229.53125, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.0029701920016972526, "frac_reward_zero_std": 0.0, "grad_norm": 2.21875, "kl": 0.01914752251468599, "learning_rate": 7.9892e-06, "loss": -0.0023, "num_tokens": 1329714.0, "reward": 3.8488998413085938, "reward_std": 0.40621453523635864, "rewards/reward_fn/mean": 3.8488998413085938, "rewards/reward_fn/std": 0.40621453523635864, "step": 28 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1542.0, "completions/mean_length": 632.59375, "completions/mean_terminated_length": 538.2333374023438, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "epoch": 0.0030762702874721545, "frac_reward_zero_std": 0.0, "grad_norm": 1.515625, "kl": 0.01128252933267504, "learning_rate": 7.9888e-06, "loss": 0.2658, "num_tokens": 1391141.0, "reward": 2.293217658996582, "reward_std": 0.7491805553436279, "rewards/reward_fn/mean": 2.293217658996582, "rewards/reward_fn/std": 0.7491805553436279, "step": 29 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 855.0, "completions/max_terminated_length": 855.0, "completions/mean_length": 457.8125, "completions/mean_terminated_length": 457.8125, "completions/min_length": 249.0, "completions/min_terminated_length": 249.0, "epoch": 0.0031823485732470564, "frac_reward_zero_std": 0.0, "grad_norm": 1.28125, "kl": 0.00988059863448143, "learning_rate": 7.9884e-06, "loss": -0.0261, "num_tokens": 1421887.0, "reward": 1.8365099430084229, "reward_std": 0.3920939862728119, "rewards/reward_fn/mean": 1.8365099430084229, "rewards/reward_fn/std": 0.3920939564704895, "step": 30 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 622.0, "completions/max_terminated_length": 622.0, "completions/mean_length": 234.03125, "completions/mean_terminated_length": 234.03125, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 0.0032884268590219583, "frac_reward_zero_std": 0.0, "grad_norm": 2.453125, "kl": 0.016112850280478597, "learning_rate": 7.988e-06, "loss": 0.0618, "num_tokens": 1464800.0, "reward": 3.961313009262085, "reward_std": 0.21884705126285553, "rewards/reward_fn/mean": 3.961313009262085, "rewards/reward_fn/std": 0.21884708106517792, "step": 31 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1348.0, "completions/max_terminated_length": 1348.0, "completions/mean_length": 319.59375, "completions/mean_terminated_length": 319.59375, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "epoch": 0.00339450514479686, "frac_reward_zero_std": 0.0, "grad_norm": 1.9453125, "kl": 0.015100088901817799, "learning_rate": 7.9876e-06, "loss": 0.1985, "num_tokens": 1511891.0, "reward": 2.7325994968414307, "reward_std": 0.19626843929290771, "rewards/reward_fn/mean": 2.7325994968414307, "rewards/reward_fn/std": 0.19626840949058533, "step": 32 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 343.0, "completions/max_terminated_length": 343.0, "completions/mean_length": 215.96875, "completions/mean_terminated_length": 215.96875, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 0.003500583430571762, "frac_reward_zero_std": 0.0, "grad_norm": 2.203125, "kl": 0.013935832539573312, "learning_rate": 7.987199999999999e-06, "loss": 0.076, "num_tokens": 1553202.0, "reward": 3.828774929046631, "reward_std": 0.564949631690979, "rewards/reward_fn/mean": 3.828774929046631, "rewards/reward_fn/std": 0.564949631690979, "step": 33 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 409.0, "completions/max_terminated_length": 409.0, "completions/mean_length": 220.25, "completions/mean_terminated_length": 220.25, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.003606661716346664, "frac_reward_zero_std": 0.0, "grad_norm": 2.015625, "kl": 0.021082005463540554, "learning_rate": 7.9868e-06, "loss": 0.0509, "num_tokens": 1598394.0, "reward": 3.139282464981079, "reward_std": 0.714324951171875, "rewards/reward_fn/mean": 3.139282464981079, "rewards/reward_fn/std": 0.7143248915672302, "step": 34 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 619.0, "completions/mean_length": 389.8125, "completions/mean_terminated_length": 336.32257080078125, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 0.003712740002121566, "frac_reward_zero_std": 0.0, "grad_norm": 1.5703125, "kl": 0.014661112450994551, "learning_rate": 7.986399999999999e-06, "loss": 0.1028, "num_tokens": 1643572.0, "reward": 2.1648662090301514, "reward_std": 0.8039337992668152, "rewards/reward_fn/mean": 2.1648662090301514, "rewards/reward_fn/std": 0.8039337396621704, "step": 35 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 458.0, "completions/max_terminated_length": 458.0, "completions/mean_length": 271.8125, "completions/mean_terminated_length": 271.8125, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "epoch": 0.0038188182878964674, "frac_reward_zero_std": 0.0, "grad_norm": 1.8046875, "kl": 0.017005216563120484, "learning_rate": 7.986e-06, "loss": -0.0162, "num_tokens": 1687534.0, "reward": 2.9149179458618164, "reward_std": 0.5480148196220398, "rewards/reward_fn/mean": 2.9149179458618164, "rewards/reward_fn/std": 0.548014760017395, "step": 36 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 584.0, "completions/max_terminated_length": 584.0, "completions/mean_length": 226.96875, "completions/mean_terminated_length": 226.96875, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 0.00392489657367137, "frac_reward_zero_std": 0.0, "grad_norm": 1.515625, "kl": 0.019651910522952676, "learning_rate": 7.9856e-06, "loss": -0.0033, "num_tokens": 1750637.0, "reward": 3.9629361629486084, "reward_std": 0.20966489613056183, "rewards/reward_fn/mean": 3.9629361629486084, "rewards/reward_fn/std": 0.2096649408340454, "step": 37 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 324.0, "completions/max_terminated_length": 324.0, "completions/mean_length": 197.71875, "completions/mean_terminated_length": 197.71875, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 0.004030974859446272, "frac_reward_zero_std": 1.0, "grad_norm": 0.07080078125, "kl": 0.011529933894053102, "learning_rate": 7.9852e-06, "loss": 0.0005, "num_tokens": 1800452.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 38 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 464.0, "completions/max_terminated_length": 464.0, "completions/mean_length": 299.71875, "completions/mean_terminated_length": 299.71875, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "epoch": 0.0041370531452211735, "frac_reward_zero_std": 0.0, "grad_norm": 1.9296875, "kl": 0.01613133493810892, "learning_rate": 7.9848e-06, "loss": 0.0467, "num_tokens": 1848923.0, "reward": 3.5235185623168945, "reward_std": 0.5581008195877075, "rewards/reward_fn/mean": 3.5235185623168945, "rewards/reward_fn/std": 0.5581007599830627, "step": 39 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 531.0, "completions/max_terminated_length": 531.0, "completions/mean_length": 186.75, "completions/mean_terminated_length": 186.75, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 0.0042431314309960754, "frac_reward_zero_std": 1.0, "grad_norm": 0.09716796875, "kl": 0.01886003592517227, "learning_rate": 7.9844e-06, "loss": 0.0008, "num_tokens": 1891123.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 40 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/max_terminated_length": 351.0, "completions/mean_length": 200.5, "completions/mean_terminated_length": 200.5, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 0.004349209716770977, "frac_reward_zero_std": 1.0, "grad_norm": 0.09033203125, "kl": 0.018136992235668004, "learning_rate": 7.984e-06, "loss": 0.0007, "num_tokens": 1931971.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 41 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1789.0, "completions/mean_length": 967.84375, "completions/mean_terminated_length": 856.1034545898438, "completions/min_length": 413.0, "completions/min_terminated_length": 413.0, "epoch": 0.004455288002545879, "frac_reward_zero_std": 0.0, "grad_norm": 1.15625, "kl": 0.010472592199221253, "learning_rate": 7.9836e-06, "loss": 0.2691, "num_tokens": 1998782.0, "reward": 1.5388916730880737, "reward_std": 0.5033364295959473, "rewards/reward_fn/mean": 1.5388916730880737, "rewards/reward_fn/std": 0.5033363699913025, "step": 42 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 688.0, "completions/mean_length": 342.9375, "completions/mean_terminated_length": 287.93548583984375, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "epoch": 0.00456136628832078, "frac_reward_zero_std": 0.0, "grad_norm": 1.4453125, "kl": 0.013179327361285686, "learning_rate": 7.9832e-06, "loss": 0.1886, "num_tokens": 2051996.0, "reward": 3.7020716667175293, "reward_std": 0.7320355772972107, "rewards/reward_fn/mean": 3.7020716667175293, "rewards/reward_fn/std": 0.7320355772972107, "step": 43 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 566.0, "completions/max_terminated_length": 566.0, "completions/mean_length": 215.46875, "completions/mean_terminated_length": 215.46875, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 0.004667444574095682, "frac_reward_zero_std": 0.0, "grad_norm": 2.03125, "kl": 0.020010275882668793, "learning_rate": 7.9828e-06, "loss": 0.0128, "num_tokens": 2097803.0, "reward": 2.742847204208374, "reward_std": 0.29240480065345764, "rewards/reward_fn/mean": 2.742847204208374, "rewards/reward_fn/std": 0.2924048602581024, "step": 44 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 415.0, "completions/max_terminated_length": 415.0, "completions/mean_length": 160.34375, "completions/mean_terminated_length": 160.34375, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 0.004773522859870584, "frac_reward_zero_std": 0.0, "grad_norm": 2.359375, "kl": 0.014989322167821229, "learning_rate": 7.9824e-06, "loss": 0.1524, "num_tokens": 2121654.0, "reward": 3.8570432662963867, "reward_std": 0.3882334530353546, "rewards/reward_fn/mean": 3.8570432662963867, "rewards/reward_fn/std": 0.3882334530353546, "step": 45 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 162.0, "completions/max_terminated_length": 162.0, "completions/mean_length": 79.71875, "completions/mean_terminated_length": 79.71875, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "epoch": 0.004879601145645486, "frac_reward_zero_std": 1.0, "grad_norm": 0.28515625, "kl": 0.026190617179963738, "learning_rate": 7.981999999999999e-06, "loss": 0.001, "num_tokens": 2166573.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 46 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1247.0, "completions/max_terminated_length": 1247.0, "completions/mean_length": 449.5, "completions/mean_terminated_length": 449.5, "completions/min_length": 244.0, "completions/min_terminated_length": 244.0, "epoch": 0.004985679431420388, "frac_reward_zero_std": 0.0, "grad_norm": 1.3828125, "kl": 0.012275198707357049, "learning_rate": 7.9816e-06, "loss": 0.0033, "num_tokens": 2217437.0, "reward": 2.856170654296875, "reward_std": 0.6345119476318359, "rewards/reward_fn/mean": 2.856170654296875, "rewards/reward_fn/std": 0.6345118880271912, "step": 47 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 711.0, "completions/max_terminated_length": 711.0, "completions/mean_length": 306.9375, "completions/mean_terminated_length": 306.9375, "completions/min_length": 228.0, "completions/min_terminated_length": 228.0, "epoch": 0.00509175771719529, "frac_reward_zero_std": 0.0, "grad_norm": 1.4140625, "kl": 0.01920297823380679, "learning_rate": 7.9812e-06, "loss": -0.0138, "num_tokens": 2275419.0, "reward": 3.279310703277588, "reward_std": 1.086318016052246, "rewards/reward_fn/mean": 3.279310703277588, "rewards/reward_fn/std": 1.086318016052246, "step": 48 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1867.0, "completions/max_terminated_length": 1867.0, "completions/mean_length": 594.375, "completions/mean_terminated_length": 594.375, "completions/min_length": 251.0, "completions/min_terminated_length": 251.0, "epoch": 0.005197836002970192, "frac_reward_zero_std": 0.0, "grad_norm": 1.203125, "kl": 0.012267176411114633, "learning_rate": 7.9808e-06, "loss": -0.0321, "num_tokens": 2330535.0, "reward": 2.3844780921936035, "reward_std": 0.69173663854599, "rewards/reward_fn/mean": 2.3844780921936035, "rewards/reward_fn/std": 0.6917366981506348, "step": 49 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 385.0, "completions/max_terminated_length": 385.0, "completions/mean_length": 249.34375, "completions/mean_terminated_length": 249.34375, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.005303914288745094, "frac_reward_zero_std": 0.0, "grad_norm": 1.84375, "kl": 0.01791852479800582, "learning_rate": 7.9804e-06, "loss": 0.0302, "num_tokens": 2371474.0, "reward": 3.0003445148468018, "reward_std": 0.6457379460334778, "rewards/reward_fn/mean": 3.0003445148468018, "rewards/reward_fn/std": 0.6457379460334778, "step": 50 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 905.0, "completions/mean_length": 387.53125, "completions/mean_terminated_length": 333.9677429199219, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "epoch": 0.0054099925745199956, "frac_reward_zero_std": 0.0, "grad_norm": 1.484375, "kl": 0.013383313198573887, "learning_rate": 7.98e-06, "loss": 0.1146, "num_tokens": 2425219.0, "reward": 2.647240161895752, "reward_std": 0.7234665155410767, "rewards/reward_fn/mean": 2.647240161895752, "rewards/reward_fn/std": 0.7234665155410767, "step": 51 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 895.0, "completions/max_terminated_length": 895.0, "completions/mean_length": 232.125, "completions/mean_terminated_length": 232.125, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "epoch": 0.0055160708602948975, "frac_reward_zero_std": 0.0, "grad_norm": 1.578125, "kl": 0.022899470990523696, "learning_rate": 7.979599999999999e-06, "loss": -0.0405, "num_tokens": 2461511.0, "reward": 3.963773488998413, "reward_std": 0.20492826402187347, "rewards/reward_fn/mean": 3.963773488998413, "rewards/reward_fn/std": 0.20492829382419586, "step": 52 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/max_terminated_length": 361.0, "completions/mean_length": 172.5625, "completions/mean_terminated_length": 172.5625, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 0.005622149146069799, "frac_reward_zero_std": 1.0, "grad_norm": 0.1015625, "kl": 0.01780761929694563, "learning_rate": 7.9792e-06, "loss": 0.0007, "num_tokens": 2512409.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 53 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1693.0, "completions/max_terminated_length": 1693.0, "completions/mean_length": 513.375, "completions/mean_terminated_length": 513.375, "completions/min_length": 224.0, "completions/min_terminated_length": 224.0, "epoch": 0.005728227431844701, "frac_reward_zero_std": 0.0, "grad_norm": 1.3671875, "kl": 0.014814406633377075, "learning_rate": 7.978799999999999e-06, "loss": 0.0129, "num_tokens": 2564133.0, "reward": 2.456674575805664, "reward_std": 0.6039776802062988, "rewards/reward_fn/mean": 2.456674575805664, "rewards/reward_fn/std": 0.603977620601654, "step": 54 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1566.0, "completions/mean_length": 824.21875, "completions/mean_terminated_length": 649.3928833007812, "completions/min_length": 240.0, "completions/min_terminated_length": 240.0, "epoch": 0.005834305717619603, "frac_reward_zero_std": 0.0, "grad_norm": 1.296875, "kl": 0.01389779313467443, "learning_rate": 7.9784e-06, "loss": 0.3627, "num_tokens": 2627980.0, "reward": 2.1529626846313477, "reward_std": 0.9413007497787476, "rewards/reward_fn/mean": 2.1529626846313477, "rewards/reward_fn/std": 0.9413006901741028, "step": 55 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1513.0, "completions/mean_length": 473.125, "completions/mean_terminated_length": 422.32257080078125, "completions/min_length": 224.0, "completions/min_terminated_length": 224.0, "epoch": 0.005940384003394505, "frac_reward_zero_std": 0.0, "grad_norm": 1.5, "kl": 0.013784538023173809, "learning_rate": 7.977999999999999e-06, "loss": 0.1598, "num_tokens": 2673968.0, "reward": 3.6430978775024414, "reward_std": 0.9269182682037354, "rewards/reward_fn/mean": 3.6430978775024414, "rewards/reward_fn/std": 0.9269182682037354, "step": 56 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 195.0, "completions/max_terminated_length": 195.0, "completions/mean_length": 157.90625, "completions/mean_terminated_length": 157.90625, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 0.006046462289169407, "frac_reward_zero_std": 0.0, "grad_norm": 1.9140625, "kl": 0.013763216906227171, "learning_rate": 7.9776e-06, "loss": 0.0307, "num_tokens": 2732653.0, "reward": 3.931234836578369, "reward_std": 0.3889950215816498, "rewards/reward_fn/mean": 3.931234836578369, "rewards/reward_fn/std": 0.3889950215816498, "step": 57 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 670.0, "completions/max_terminated_length": 670.0, "completions/mean_length": 297.25, "completions/mean_terminated_length": 297.25, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.006152540574944309, "frac_reward_zero_std": 0.0, "grad_norm": 1.8671875, "kl": 0.011821337277069688, "learning_rate": 7.977199999999999e-06, "loss": 0.1431, "num_tokens": 2788469.0, "reward": 3.0665931701660156, "reward_std": 0.08242907375097275, "rewards/reward_fn/mean": 3.0665931701660156, "rewards/reward_fn/std": 0.08242906630039215, "step": 58 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 579.0, "completions/max_terminated_length": 579.0, "completions/mean_length": 193.65625, "completions/mean_terminated_length": 193.65625, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 0.006258618860719211, "frac_reward_zero_std": 0.0, "grad_norm": 1.890625, "kl": 0.021005364251323044, "learning_rate": 7.9768e-06, "loss": 0.0154, "num_tokens": 2829578.0, "reward": 2.730905532836914, "reward_std": 0.29996997117996216, "rewards/reward_fn/mean": 2.730905532836914, "rewards/reward_fn/std": 0.29996997117996216, "step": 59 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1575.0, "completions/max_terminated_length": 1575.0, "completions/mean_length": 298.59375, "completions/mean_terminated_length": 298.59375, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.006364697146494113, "frac_reward_zero_std": 0.0, "grad_norm": 1.75, "kl": 0.018938414519652724, "learning_rate": 7.9764e-06, "loss": -0.063, "num_tokens": 2892893.0, "reward": 3.3888978958129883, "reward_std": 0.7617525458335876, "rewards/reward_fn/mean": 3.3888978958129883, "rewards/reward_fn/std": 0.7617525458335876, "step": 60 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 816.0, "completions/max_terminated_length": 816.0, "completions/mean_length": 214.6875, "completions/mean_terminated_length": 214.6875, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.006470775432269015, "frac_reward_zero_std": 0.0, "grad_norm": 1.28125, "kl": 0.020086537464521825, "learning_rate": 7.976e-06, "loss": -0.0565, "num_tokens": 2946067.0, "reward": 3.9270145893096924, "reward_std": 0.4128677546977997, "rewards/reward_fn/mean": 3.9270145893096924, "rewards/reward_fn/std": 0.4128677546977997, "step": 61 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1099.0, "completions/max_terminated_length": 1099.0, "completions/mean_length": 351.71875, "completions/mean_terminated_length": 351.71875, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 0.0065768537180439166, "frac_reward_zero_std": 0.0, "grad_norm": 1.6328125, "kl": 0.015736156376078725, "learning_rate": 7.9756e-06, "loss": 0.0296, "num_tokens": 2986666.0, "reward": 2.2112362384796143, "reward_std": 0.4881065785884857, "rewards/reward_fn/mean": 2.2112362384796143, "rewards/reward_fn/std": 0.4881065785884857, "step": 62 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1397.0, "completions/max_terminated_length": 1397.0, "completions/mean_length": 321.15625, "completions/mean_terminated_length": 321.15625, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.0066829320038188185, "frac_reward_zero_std": 0.0, "grad_norm": 1.734375, "kl": 0.020156427985057235, "learning_rate": 7.9752e-06, "loss": -0.0779, "num_tokens": 3028399.0, "reward": 3.290408134460449, "reward_std": 0.9479041695594788, "rewards/reward_fn/mean": 3.290408134460449, "rewards/reward_fn/std": 0.9479042291641235, "step": 63 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 723.0, "completions/max_terminated_length": 723.0, "completions/mean_length": 186.84375, "completions/mean_terminated_length": 186.84375, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.00678901028959372, "frac_reward_zero_std": 1.0, "grad_norm": 0.12451171875, "kl": 0.020627434947527945, "learning_rate": 7.9748e-06, "loss": 0.0008, "num_tokens": 3068138.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 64 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 849.0, "completions/max_terminated_length": 849.0, "completions/mean_length": 366.90625, "completions/mean_terminated_length": 366.90625, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "epoch": 0.006895088575368622, "frac_reward_zero_std": 0.0, "grad_norm": 1.40625, "kl": 0.016033415216952562, "learning_rate": 7.9744e-06, "loss": -0.0287, "num_tokens": 3117607.0, "reward": 3.5806355476379395, "reward_std": 0.6227185130119324, "rewards/reward_fn/mean": 3.5806355476379395, "rewards/reward_fn/std": 0.6227185130119324, "step": 65 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1295.0, "completions/max_terminated_length": 1295.0, "completions/mean_length": 466.875, "completions/mean_terminated_length": 466.875, "completions/min_length": 241.0, "completions/min_terminated_length": 241.0, "epoch": 0.007001166861143524, "frac_reward_zero_std": 0.0, "grad_norm": 1.3671875, "kl": 0.014832770335488021, "learning_rate": 7.974e-06, "loss": -0.007, "num_tokens": 3199139.0, "reward": 3.7774386405944824, "reward_std": 0.7030869126319885, "rewards/reward_fn/mean": 3.7774386405944824, "rewards/reward_fn/std": 0.7030869126319885, "step": 66 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1649.0, "completions/max_terminated_length": 1649.0, "completions/mean_length": 424.65625, "completions/mean_terminated_length": 424.65625, "completions/min_length": 240.0, "completions/min_terminated_length": 240.0, "epoch": 0.007107245146918426, "frac_reward_zero_std": 0.0, "grad_norm": 1.2265625, "kl": 0.014696288155391812, "learning_rate": 7.9736e-06, "loss": -0.0192, "num_tokens": 3249912.0, "reward": 3.6228115558624268, "reward_std": 0.7114648818969727, "rewards/reward_fn/mean": 3.6228115558624268, "rewards/reward_fn/std": 0.7114648818969727, "step": 67 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1291.0, "completions/max_terminated_length": 1291.0, "completions/mean_length": 293.25, "completions/mean_terminated_length": 293.25, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "epoch": 0.007213323432693328, "frac_reward_zero_std": 0.0, "grad_norm": 1.59375, "kl": 0.02517524897120893, "learning_rate": 7.9732e-06, "loss": -0.0745, "num_tokens": 3296032.0, "reward": 2.7536492347717285, "reward_std": 0.27405858039855957, "rewards/reward_fn/mean": 2.7536492347717285, "rewards/reward_fn/std": 0.27405858039855957, "step": 68 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 210.0, "completions/max_terminated_length": 210.0, "completions/mean_length": 149.15625, "completions/mean_terminated_length": 149.15625, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 0.00731940171846823, "frac_reward_zero_std": 1.0, "grad_norm": 0.158203125, "kl": 0.02772362041287124, "learning_rate": 7.9728e-06, "loss": 0.0011, "num_tokens": 3344869.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 69 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 498.0, "completions/max_terminated_length": 498.0, "completions/mean_length": 158.4375, "completions/mean_terminated_length": 158.4375, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.007425480004243132, "frac_reward_zero_std": 1.0, "grad_norm": 0.1513671875, "kl": 0.03019587113521993, "learning_rate": 7.9724e-06, "loss": 0.0012, "num_tokens": 3386675.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 70 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1326.0, "completions/max_terminated_length": 1326.0, "completions/mean_length": 496.65625, "completions/mean_terminated_length": 496.65625, "completions/min_length": 370.0, "completions/min_terminated_length": 370.0, "epoch": 0.007531558290018034, "frac_reward_zero_std": 0.0, "grad_norm": 1.0859375, "kl": 0.014372375677339733, "learning_rate": 7.972e-06, "loss": -0.0032, "num_tokens": 3450248.0, "reward": 3.8883559703826904, "reward_std": 0.46508049964904785, "rewards/reward_fn/mean": 3.8883559703826904, "rewards/reward_fn/std": 0.46508049964904785, "step": 71 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1761.0, "completions/max_terminated_length": 1761.0, "completions/mean_length": 621.75, "completions/mean_terminated_length": 621.75, "completions/min_length": 246.0, "completions/min_terminated_length": 246.0, "epoch": 0.007637636575792935, "frac_reward_zero_std": 0.0, "grad_norm": 1.1328125, "kl": 0.011013327515684068, "learning_rate": 7.9716e-06, "loss": -0.0858, "num_tokens": 3501984.0, "reward": 2.3477916717529297, "reward_std": 0.5000441074371338, "rewards/reward_fn/mean": 2.3477916717529297, "rewards/reward_fn/std": 0.5000441074371338, "step": 72 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 334.0, "completions/max_terminated_length": 334.0, "completions/mean_length": 145.28125, "completions/mean_terminated_length": 145.28125, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.007743714861567837, "frac_reward_zero_std": 1.0, "grad_norm": 0.10546875, "kl": 0.015902561601251364, "learning_rate": 7.9712e-06, "loss": 0.0006, "num_tokens": 3547529.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 73 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 922.0, "completions/max_terminated_length": 922.0, "completions/mean_length": 291.71875, "completions/mean_terminated_length": 291.71875, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "epoch": 0.00784979314734274, "frac_reward_zero_std": 0.0, "grad_norm": 2.109375, "kl": 0.018224395578727126, "learning_rate": 7.9708e-06, "loss": 0.0441, "num_tokens": 3604480.0, "reward": 3.9649553298950195, "reward_std": 0.19824209809303284, "rewards/reward_fn/mean": 3.9649553298950195, "rewards/reward_fn/std": 0.19824212789535522, "step": 74 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1167.0, "completions/max_terminated_length": 1167.0, "completions/mean_length": 360.15625, "completions/mean_terminated_length": 360.15625, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "epoch": 0.00795587143311764, "frac_reward_zero_std": 0.0, "grad_norm": 1.5859375, "kl": 0.017522861482575536, "learning_rate": 7.970399999999999e-06, "loss": -0.0247, "num_tokens": 3672165.0, "reward": 3.7407755851745605, "reward_std": 0.573284924030304, "rewards/reward_fn/mean": 3.7407755851745605, "rewards/reward_fn/std": 0.573284924030304, "step": 75 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 440.0, "completions/max_terminated_length": 440.0, "completions/mean_length": 99.71875, "completions/mean_terminated_length": 99.71875, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "epoch": 0.008061949718892543, "frac_reward_zero_std": 1.0, "grad_norm": 0.25, "kl": 0.03133802697993815, "learning_rate": 7.97e-06, "loss": 0.0013, "num_tokens": 3725628.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 76 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 884.0, "completions/max_terminated_length": 884.0, "completions/mean_length": 389.125, "completions/mean_terminated_length": 389.125, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "epoch": 0.008168028004667444, "frac_reward_zero_std": 0.0, "grad_norm": 1.703125, "kl": 0.014990110415965319, "learning_rate": 7.969599999999999e-06, "loss": 0.2218, "num_tokens": 3776256.0, "reward": 3.630524158477783, "reward_std": 0.8723556399345398, "rewards/reward_fn/mean": 3.630524158477783, "rewards/reward_fn/std": 0.8723556399345398, "step": 77 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 323.0, "completions/max_terminated_length": 323.0, "completions/mean_length": 94.4375, "completions/mean_terminated_length": 94.4375, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "epoch": 0.008274106290442347, "frac_reward_zero_std": 1.0, "grad_norm": 0.169921875, "kl": 0.022003972087986767, "learning_rate": 7.9692e-06, "loss": 0.0009, "num_tokens": 3821742.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 78 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1045.0, "completions/max_terminated_length": 1045.0, "completions/mean_length": 323.96875, "completions/mean_terminated_length": 323.96875, "completions/min_length": 237.0, "completions/min_terminated_length": 237.0, "epoch": 0.008380184576217248, "frac_reward_zero_std": 0.0, "grad_norm": 1.2109375, "kl": 0.014573617372661829, "learning_rate": 7.968799999999999e-06, "loss": -0.0192, "num_tokens": 3903693.0, "reward": 3.9266436100006104, "reward_std": 0.4149664342403412, "rewards/reward_fn/mean": 3.9266436100006104, "rewards/reward_fn/std": 0.414966344833374, "step": 79 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1355.0, "completions/max_terminated_length": 1355.0, "completions/mean_length": 343.5, "completions/mean_terminated_length": 343.5, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "epoch": 0.008486262861992151, "frac_reward_zero_std": 0.0, "grad_norm": 1.3828125, "kl": 0.020087406621314585, "learning_rate": 7.9684e-06, "loss": -0.0134, "num_tokens": 3961789.0, "reward": 2.6687002182006836, "reward_std": 0.5339718461036682, "rewards/reward_fn/mean": 2.6687002182006836, "rewards/reward_fn/std": 0.5339718461036682, "step": 80 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 804.0, "completions/max_terminated_length": 804.0, "completions/mean_length": 216.4375, "completions/mean_terminated_length": 216.4375, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "epoch": 0.008592341147767052, "frac_reward_zero_std": 1.0, "grad_norm": 0.0849609375, "kl": 0.020208495436236262, "learning_rate": 7.967999999999999e-06, "loss": 0.0008, "num_tokens": 3997675.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 81 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 481.0, "completions/mean_length": 260.25, "completions/mean_terminated_length": 202.5806427001953, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 0.008698419433541955, "frac_reward_zero_std": 0.0, "grad_norm": 2.1875, "kl": 0.020875858957879245, "learning_rate": 7.9676e-06, "loss": 0.179, "num_tokens": 4048243.0, "reward": 3.6548068523406982, "reward_std": 0.8150468468666077, "rewards/reward_fn/mean": 3.6548068523406982, "rewards/reward_fn/std": 0.8150468468666077, "step": 82 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1148.0, "completions/max_terminated_length": 1148.0, "completions/mean_length": 388.71875, "completions/mean_terminated_length": 388.71875, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "epoch": 0.008804497719316856, "frac_reward_zero_std": 0.0, "grad_norm": 1.484375, "kl": 0.01721180323511362, "learning_rate": 7.967199999999999e-06, "loss": 0.0637, "num_tokens": 4095434.0, "reward": 3.598665714263916, "reward_std": 0.7542276978492737, "rewards/reward_fn/mean": 3.598665714263916, "rewards/reward_fn/std": 0.7542277574539185, "step": 83 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 964.0, "completions/max_terminated_length": 964.0, "completions/mean_length": 254.9375, "completions/mean_terminated_length": 254.9375, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "epoch": 0.008910576005091759, "frac_reward_zero_std": 1.0, "grad_norm": 0.1298828125, "kl": 0.02156881894916296, "learning_rate": 7.9668e-06, "loss": 0.0009, "num_tokens": 4131016.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 84 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2013.0, "completions/mean_length": 667.65625, "completions/mean_terminated_length": 575.6333618164062, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "epoch": 0.00901665429086666, "frac_reward_zero_std": 0.0, "grad_norm": 1.25, "kl": 0.014661238761618733, "learning_rate": 7.9664e-06, "loss": 0.1817, "num_tokens": 4186685.0, "reward": 3.415998935699463, "reward_std": 1.1627610921859741, "rewards/reward_fn/mean": 3.415998935699463, "rewards/reward_fn/std": 1.1627610921859741, "step": 85 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 941.0, "completions/max_terminated_length": 941.0, "completions/mean_length": 220.0625, "completions/mean_terminated_length": 220.0625, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.00912273257664156, "frac_reward_zero_std": 1.0, "grad_norm": 0.109375, "kl": 0.025990084279328585, "learning_rate": 7.966e-06, "loss": 0.001, "num_tokens": 4221695.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 86 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 301.0, "completions/max_terminated_length": 301.0, "completions/mean_length": 187.3125, "completions/mean_terminated_length": 187.3125, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 0.009228810862416463, "frac_reward_zero_std": 0.0, "grad_norm": 2.015625, "kl": 0.025184703757986426, "learning_rate": 7.9656e-06, "loss": -0.0135, "num_tokens": 4261897.0, "reward": 3.9673728942871094, "reward_std": 0.18456710875034332, "rewards/reward_fn/mean": 3.9673728942871094, "rewards/reward_fn/std": 0.18456712365150452, "step": 87 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 575.0, "completions/max_terminated_length": 575.0, "completions/mean_length": 194.1875, "completions/mean_terminated_length": 194.1875, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.009334889148191364, "frac_reward_zero_std": 0.0, "grad_norm": 2.15625, "kl": 0.022489528637379408, "learning_rate": 7.9652e-06, "loss": -0.0187, "num_tokens": 4299119.0, "reward": 3.961916446685791, "reward_std": 0.21543395519256592, "rewards/reward_fn/mean": 3.961916446685791, "rewards/reward_fn/std": 0.2154339849948883, "step": 88 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1002.0, "completions/max_terminated_length": 1002.0, "completions/mean_length": 261.6875, "completions/mean_terminated_length": 261.6875, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.009440967433966267, "frac_reward_zero_std": 0.0, "grad_norm": 2.5625, "kl": 0.022605838836170733, "learning_rate": 7.9648e-06, "loss": 0.0996, "num_tokens": 4336069.0, "reward": 3.928297281265259, "reward_std": 0.4056117832660675, "rewards/reward_fn/mean": 3.928297281265259, "rewards/reward_fn/std": 0.4056117534637451, "step": 89 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1217.0, "completions/max_terminated_length": 1217.0, "completions/mean_length": 338.84375, "completions/mean_terminated_length": 338.84375, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 0.009547045719741168, "frac_reward_zero_std": 0.0, "grad_norm": 1.5859375, "kl": 0.01582270860671997, "learning_rate": 7.9644e-06, "loss": 0.0102, "num_tokens": 4382624.0, "reward": 2.761922597885132, "reward_std": 0.05517864227294922, "rewards/reward_fn/mean": 2.761922597885132, "rewards/reward_fn/std": 0.0551786907017231, "step": 90 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1949.0, "completions/mean_length": 521.09375, "completions/mean_terminated_length": 471.83868408203125, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 0.009653124005516071, "frac_reward_zero_std": 0.0, "grad_norm": 1.5859375, "kl": 0.017371521913446486, "learning_rate": 7.964e-06, "loss": 0.1239, "num_tokens": 4444611.0, "reward": 2.9266998767852783, "reward_std": 0.650079607963562, "rewards/reward_fn/mean": 2.9266998767852783, "rewards/reward_fn/std": 0.6500796675682068, "step": 91 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1358.0, "completions/mean_length": 721.9375, "completions/mean_terminated_length": 584.7586059570312, "completions/min_length": 302.0, "completions/min_terminated_length": 302.0, "epoch": 0.009759202291290972, "frac_reward_zero_std": 0.0, "grad_norm": 1.4296875, "kl": 0.014051761128939688, "learning_rate": 7.9636e-06, "loss": 0.2817, "num_tokens": 4517121.0, "reward": 2.6343460083007812, "reward_std": 1.013836145401001, "rewards/reward_fn/mean": 2.6343460083007812, "rewards/reward_fn/std": 1.013836145401001, "step": 92 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1004.0, "completions/max_terminated_length": 1004.0, "completions/mean_length": 215.28125, "completions/mean_terminated_length": 215.28125, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.009865280577065875, "frac_reward_zero_std": 0.0, "grad_norm": 2.25, "kl": 0.02322612050920725, "learning_rate": 7.963199999999999e-06, "loss": 0.2232, "num_tokens": 4553546.0, "reward": 3.925776481628418, "reward_std": 0.41987186670303345, "rewards/reward_fn/mean": 3.925776481628418, "rewards/reward_fn/std": 0.41987186670303345, "step": 93 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1399.0, "completions/mean_length": 771.09375, "completions/mean_terminated_length": 639.0, "completions/min_length": 438.0, "completions/min_terminated_length": 438.0, "epoch": 0.009971358862840776, "frac_reward_zero_std": 0.0, "grad_norm": 1.359375, "kl": 0.013376953662373126, "learning_rate": 7.9628e-06, "loss": 0.2941, "num_tokens": 4608653.0, "reward": 2.248654365539551, "reward_std": 0.8817570805549622, "rewards/reward_fn/mean": 2.248654365539551, "rewards/reward_fn/std": 0.8817570805549622, "step": 94 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1296.0, "completions/max_terminated_length": 1296.0, "completions/mean_length": 284.0625, "completions/mean_terminated_length": 284.0625, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 0.010077437148615679, "frac_reward_zero_std": 0.0, "grad_norm": 2.0625, "kl": 0.023674980737268925, "learning_rate": 7.962399999999999e-06, "loss": 0.1735, "num_tokens": 4647151.0, "reward": 3.8580751419067383, "reward_std": 0.5584725737571716, "rewards/reward_fn/mean": 3.8580751419067383, "rewards/reward_fn/std": 0.5584725737571716, "step": 95 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 1389.5625, "completions/mean_terminated_length": 1205.199951171875, "completions/min_length": 511.0, "completions/min_terminated_length": 511.0, "epoch": 0.01018351543439058, "frac_reward_zero_std": 0.0, "grad_norm": 0.6640625, "kl": 0.007312611152883619, "learning_rate": 7.962e-06, "loss": 0.1159, "num_tokens": 4724609.0, "reward": 1.842115879058838, "reward_std": 0.8588997721672058, "rewards/reward_fn/mean": 1.842115879058838, "rewards/reward_fn/std": 0.8588997721672058, "step": 96 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 535.0, "completions/max_terminated_length": 535.0, "completions/mean_length": 259.625, "completions/mean_terminated_length": 259.625, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 0.010289593720165482, "frac_reward_zero_std": 0.0, "grad_norm": 1.703125, "kl": 0.024402482667937875, "learning_rate": 7.9616e-06, "loss": -0.0928, "num_tokens": 4767957.0, "reward": 3.3132870197296143, "reward_std": 0.9708902835845947, "rewards/reward_fn/mean": 3.3132870197296143, "rewards/reward_fn/std": 0.9708903431892395, "step": 97 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1990.0, "completions/mean_length": 894.5, "completions/mean_terminated_length": 817.6000366210938, "completions/min_length": 413.0, "completions/min_terminated_length": 413.0, "epoch": 0.010395672005940384, "frac_reward_zero_std": 0.0, "grad_norm": 0.91015625, "kl": 0.00991345732472837, "learning_rate": 7.9612e-06, "loss": 0.1967, "num_tokens": 4841413.0, "reward": 2.7717971801757812, "reward_std": 1.1707860231399536, "rewards/reward_fn/mean": 2.7717971801757812, "rewards/reward_fn/std": 1.1707861423492432, "step": 98 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1921.0, "completions/mean_length": 496.0, "completions/mean_terminated_length": 445.93548583984375, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "epoch": 0.010501750291715286, "frac_reward_zero_std": 0.0, "grad_norm": 1.8125, "kl": 0.016304848017171025, "learning_rate": 7.9608e-06, "loss": 0.2749, "num_tokens": 4894405.0, "reward": 2.716887950897217, "reward_std": 0.5000027418136597, "rewards/reward_fn/mean": 2.716887950897217, "rewards/reward_fn/std": 0.5000027418136597, "step": 99 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 444.0, "completions/max_terminated_length": 444.0, "completions/mean_length": 207.90625, "completions/mean_terminated_length": 207.90625, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 0.010607828577490187, "frac_reward_zero_std": 0.0, "grad_norm": 1.8125, "kl": 0.02409674203954637, "learning_rate": 7.9604e-06, "loss": -0.0608, "num_tokens": 4929506.0, "reward": 3.7197184562683105, "reward_std": 0.753462016582489, "rewards/reward_fn/mean": 3.7197184562683105, "rewards/reward_fn/std": 0.753462016582489, "step": 100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1949.0, "completions/max_terminated_length": 1949.0, "completions/mean_length": 374.3125, "completions/mean_terminated_length": 374.3125, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 0.01071390686326509, "frac_reward_zero_std": 1.0, "grad_norm": 0.062255859375, "kl": 0.015362917329184711, "learning_rate": 7.96e-06, "loss": 0.0006, "num_tokens": 4983948.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1179.0, "completions/max_terminated_length": 1179.0, "completions/mean_length": 286.6875, "completions/mean_terminated_length": 286.6875, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 0.010819985149039991, "frac_reward_zero_std": 0.0, "grad_norm": 1.5703125, "kl": 0.024223619606345892, "learning_rate": 7.959599999999999e-06, "loss": -0.098, "num_tokens": 5024674.0, "reward": 3.859476089477539, "reward_std": 0.5529555678367615, "rewards/reward_fn/mean": 3.859476089477539, "rewards/reward_fn/std": 0.5529556274414062, "step": 102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 381.0, "completions/max_terminated_length": 381.0, "completions/mean_length": 120.125, "completions/mean_terminated_length": 120.125, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "epoch": 0.010926063434814894, "frac_reward_zero_std": 1.0, "grad_norm": 0.134765625, "kl": 0.017856702557764947, "learning_rate": 7.9592e-06, "loss": 0.0007, "num_tokens": 5048390.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1339.0, "completions/max_terminated_length": 1339.0, "completions/mean_length": 322.15625, "completions/mean_terminated_length": 322.15625, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.011032141720589795, "frac_reward_zero_std": 0.0, "grad_norm": 2.109375, "kl": 0.019808009383268654, "learning_rate": 7.958799999999999e-06, "loss": 0.0437, "num_tokens": 5113899.0, "reward": 2.8031535148620605, "reward_std": 0.21395450830459595, "rewards/reward_fn/mean": 2.8031535148620605, "rewards/reward_fn/std": 0.21395452320575714, "step": 104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 707.0, "completions/max_terminated_length": 707.0, "completions/mean_length": 196.0, "completions/mean_terminated_length": 196.0, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.011138220006364698, "frac_reward_zero_std": 0.0, "grad_norm": 2.3125, "kl": 0.02260214788839221, "learning_rate": 7.9584e-06, "loss": -0.1233, "num_tokens": 5136651.0, "reward": 3.3392744064331055, "reward_std": 0.32503390312194824, "rewards/reward_fn/mean": 3.3392744064331055, "rewards/reward_fn/std": 0.32503387331962585, "step": 105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 440.0, "completions/mean_length": 286.78125, "completions/mean_terminated_length": 229.9677276611328, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.011244298292139599, "frac_reward_zero_std": 0.0, "grad_norm": 1.96875, "kl": 0.022181879496201873, "learning_rate": 7.957999999999999e-06, "loss": 0.1931, "num_tokens": 5181188.0, "reward": 3.0173768997192383, "reward_std": 0.9120379686355591, "rewards/reward_fn/mean": 3.0173768997192383, "rewards/reward_fn/std": 0.9120379686355591, "step": 106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/max_terminated_length": 361.0, "completions/mean_length": 236.4375, "completions/mean_terminated_length": 236.4375, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "epoch": 0.011350376577914502, "frac_reward_zero_std": 0.0, "grad_norm": 1.625, "kl": 0.01876313250977546, "learning_rate": 7.9576e-06, "loss": 0.0321, "num_tokens": 5225106.0, "reward": 3.034740924835205, "reward_std": 0.38066956400871277, "rewards/reward_fn/mean": 3.034740924835205, "rewards/reward_fn/std": 0.38066956400871277, "step": 107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 105.0, "completions/max_terminated_length": 105.0, "completions/mean_length": 72.40625, "completions/mean_terminated_length": 72.40625, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.011456454863689403, "frac_reward_zero_std": 1.0, "grad_norm": 0.318359375, "kl": 0.02579229767434299, "learning_rate": 7.9572e-06, "loss": 0.001, "num_tokens": 5262239.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1813.0, "completions/mean_length": 1036.21875, "completions/mean_terminated_length": 891.6785888671875, "completions/min_length": 449.0, "completions/min_terminated_length": 449.0, "epoch": 0.011562533149464305, "frac_reward_zero_std": 0.0, "grad_norm": 1.1015625, "kl": 0.008434178889729083, "learning_rate": 7.9568e-06, "loss": 0.215, "num_tokens": 5321798.0, "reward": 1.9139066934585571, "reward_std": 0.8915742635726929, "rewards/reward_fn/mean": 1.9139066934585571, "rewards/reward_fn/std": 0.8915743231773376, "step": 109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 780.0, "completions/max_terminated_length": 780.0, "completions/mean_length": 330.4375, "completions/mean_terminated_length": 330.4375, "completions/min_length": 231.0, "completions/min_terminated_length": 231.0, "epoch": 0.011668611435239206, "frac_reward_zero_std": 1.0, "grad_norm": 0.07470703125, "kl": 0.013277154648676515, "learning_rate": 7.9564e-06, "loss": 0.0005, "num_tokens": 5368276.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1607.0, "completions/mean_length": 427.03125, "completions/mean_terminated_length": 374.7419128417969, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "epoch": 0.01177468972101411, "frac_reward_zero_std": 0.0, "grad_norm": 1.5859375, "kl": 0.01534812489990145, "learning_rate": 7.956e-06, "loss": 0.0793, "num_tokens": 5413205.0, "reward": 3.095240592956543, "reward_std": 1.05535888671875, "rewards/reward_fn/mean": 3.095240592956543, "rewards/reward_fn/std": 1.05535888671875, "step": 111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 673.0, "completions/max_terminated_length": 673.0, "completions/mean_length": 293.96875, "completions/mean_terminated_length": 293.96875, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 0.01188076800678901, "frac_reward_zero_std": 0.0, "grad_norm": 1.8671875, "kl": 0.017015320365317166, "learning_rate": 7.955599999999999e-06, "loss": 0.0992, "num_tokens": 5466292.0, "reward": 2.8189077377319336, "reward_std": 0.3456776738166809, "rewards/reward_fn/mean": 2.8189077377319336, "rewards/reward_fn/std": 0.3456777334213257, "step": 112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1115.0, "completions/mean_length": 502.71875, "completions/mean_terminated_length": 452.8709411621094, "completions/min_length": 233.0, "completions/min_terminated_length": 233.0, "epoch": 0.011986846292563913, "frac_reward_zero_std": 0.0, "grad_norm": 1.578125, "kl": 0.012012695078738034, "learning_rate": 7.9552e-06, "loss": 0.2114, "num_tokens": 5520395.0, "reward": 3.8103599548339844, "reward_std": 0.5348771214485168, "rewards/reward_fn/mean": 3.8103599548339844, "rewards/reward_fn/std": 0.5348771810531616, "step": 113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 400.0, "completions/max_terminated_length": 400.0, "completions/mean_length": 134.46875, "completions/mean_terminated_length": 134.46875, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 0.012092924578338814, "frac_reward_zero_std": 0.0, "grad_norm": 2.3125, "kl": 0.018360009300522506, "learning_rate": 7.954799999999999e-06, "loss": 0.0659, "num_tokens": 5561658.0, "reward": 2.972559690475464, "reward_std": 0.11098479479551315, "rewards/reward_fn/mean": 2.972559690475464, "rewards/reward_fn/std": 0.11098476499319077, "step": 114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1066.0, "completions/max_terminated_length": 1066.0, "completions/mean_length": 480.03125, "completions/mean_terminated_length": 480.03125, "completions/min_length": 252.0, "completions/min_terminated_length": 252.0, "epoch": 0.012199002864113715, "frac_reward_zero_std": 0.0, "grad_norm": 1.375, "kl": 0.01176215277519077, "learning_rate": 7.9544e-06, "loss": 0.0591, "num_tokens": 5620795.0, "reward": 2.763178825378418, "reward_std": 0.8988648653030396, "rewards/reward_fn/mean": 2.763178825378418, "rewards/reward_fn/std": 0.8988648653030396, "step": 115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1965.0, "completions/max_terminated_length": 1965.0, "completions/mean_length": 537.3125, "completions/mean_terminated_length": 537.3125, "completions/min_length": 233.0, "completions/min_terminated_length": 233.0, "epoch": 0.012305081149888618, "frac_reward_zero_std": 0.0, "grad_norm": 1.1796875, "kl": 0.011784568894654512, "learning_rate": 7.953999999999999e-06, "loss": 0.0149, "num_tokens": 5675109.0, "reward": 2.534766674041748, "reward_std": 0.42342138290405273, "rewards/reward_fn/mean": 2.534766674041748, "rewards/reward_fn/std": 0.42342138290405273, "step": 116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 727.0, "completions/mean_length": 521.15625, "completions/mean_terminated_length": 471.9031982421875, "completions/min_length": 273.0, "completions/min_terminated_length": 273.0, "epoch": 0.012411159435663519, "frac_reward_zero_std": 0.0, "grad_norm": 1.390625, "kl": 0.012846595840528607, "learning_rate": 7.9536e-06, "loss": 0.1451, "num_tokens": 5734314.0, "reward": 2.8016586303710938, "reward_std": 0.7866551280021667, "rewards/reward_fn/mean": 2.8016586303710938, "rewards/reward_fn/std": 0.786655068397522, "step": 117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 553.0, "completions/max_terminated_length": 553.0, "completions/mean_length": 257.0625, "completions/mean_terminated_length": 257.0625, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 0.012517237721438422, "frac_reward_zero_std": 0.0, "grad_norm": 1.6640625, "kl": 0.015249886200763285, "learning_rate": 7.953199999999999e-06, "loss": -0.0325, "num_tokens": 5776108.0, "reward": 2.9798991680145264, "reward_std": 0.4480231702327728, "rewards/reward_fn/mean": 2.9798991680145264, "rewards/reward_fn/std": 0.44802314043045044, "step": 118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1529.0, "completions/max_terminated_length": 1529.0, "completions/mean_length": 494.28125, "completions/mean_terminated_length": 494.28125, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "epoch": 0.012623316007213323, "frac_reward_zero_std": 0.0, "grad_norm": 1.96875, "kl": 0.0132115458836779, "learning_rate": 7.9528e-06, "loss": 0.1698, "num_tokens": 5824181.0, "reward": 2.8119354248046875, "reward_std": 0.21195653080940247, "rewards/reward_fn/mean": 2.8119354248046875, "rewards/reward_fn/std": 0.21195654571056366, "step": 119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 460.0, "completions/max_terminated_length": 460.0, "completions/mean_length": 195.0625, "completions/mean_terminated_length": 195.0625, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.012729394292988225, "frac_reward_zero_std": 1.0, "grad_norm": 0.0830078125, "kl": 0.017341266619041562, "learning_rate": 7.9524e-06, "loss": 0.0007, "num_tokens": 5861367.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 641.0, "completions/mean_length": 320.6875, "completions/mean_terminated_length": 264.9677429199219, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.012835472578763127, "frac_reward_zero_std": 0.0, "grad_norm": 2.015625, "kl": 0.0184562795329839, "learning_rate": 7.952e-06, "loss": 0.3139, "num_tokens": 5906125.0, "reward": 3.835097312927246, "reward_std": 0.7352915406227112, "rewards/reward_fn/mean": 3.835097312927246, "rewards/reward_fn/std": 0.7352915406227112, "step": 121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 598.0, "completions/max_terminated_length": 598.0, "completions/mean_length": 272.3125, "completions/mean_terminated_length": 272.3125, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.01294155086453803, "frac_reward_zero_std": 1.0, "grad_norm": 0.06982421875, "kl": 0.015007280395366251, "learning_rate": 7.9516e-06, "loss": 0.0006, "num_tokens": 5931671.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 320.0, "completions/max_terminated_length": 320.0, "completions/mean_length": 192.625, "completions/mean_terminated_length": 192.625, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.01304762915031293, "frac_reward_zero_std": 1.0, "grad_norm": 0.08544921875, "kl": 0.017203714582137764, "learning_rate": 7.9512e-06, "loss": 0.0007, "num_tokens": 5973867.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 868.0, "completions/max_terminated_length": 868.0, "completions/mean_length": 295.5625, "completions/mean_terminated_length": 295.5625, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 0.013153707436087833, "frac_reward_zero_std": 0.0, "grad_norm": 1.71875, "kl": 0.015582025167532265, "learning_rate": 7.9508e-06, "loss": 0.0483, "num_tokens": 6008669.0, "reward": 3.4704294204711914, "reward_std": 0.8120108246803284, "rewards/reward_fn/mean": 3.4704294204711914, "rewards/reward_fn/std": 0.8120108246803284, "step": 124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 878.0, "completions/max_terminated_length": 878.0, "completions/mean_length": 371.4375, "completions/mean_terminated_length": 371.4375, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "epoch": 0.013259785721862734, "frac_reward_zero_std": 0.0, "grad_norm": 2.1875, "kl": 0.01567537139635533, "learning_rate": 7.9504e-06, "loss": 0.275, "num_tokens": 6067755.0, "reward": 3.2702126502990723, "reward_std": 0.5275013446807861, "rewards/reward_fn/mean": 3.2702126502990723, "rewards/reward_fn/std": 0.5275014042854309, "step": 125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1262.0, "completions/max_terminated_length": 1262.0, "completions/mean_length": 288.4375, "completions/mean_terminated_length": 288.4375, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.013365864007637637, "frac_reward_zero_std": 0.0, "grad_norm": 2.203125, "kl": 0.015237941057421267, "learning_rate": 7.95e-06, "loss": -0.0294, "num_tokens": 6137497.0, "reward": 2.747013568878174, "reward_std": 0.3507872223854065, "rewards/reward_fn/mean": 2.747013568878174, "rewards/reward_fn/std": 0.3507872223854065, "step": 126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 729.0, "completions/max_terminated_length": 729.0, "completions/mean_length": 136.0, "completions/mean_terminated_length": 136.0, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "epoch": 0.013471942293412538, "frac_reward_zero_std": 1.0, "grad_norm": 0.095703125, "kl": 0.017273012548685074, "learning_rate": 7.9496e-06, "loss": 0.0007, "num_tokens": 6171641.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1208.0, "completions/mean_length": 364.5, "completions/mean_terminated_length": 310.19354248046875, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 0.01357802057918744, "frac_reward_zero_std": 0.0, "grad_norm": 2.03125, "kl": 0.016130903968587518, "learning_rate": 7.9492e-06, "loss": 0.2309, "num_tokens": 6221065.0, "reward": 3.2581684589385986, "reward_std": 0.7389606833457947, "rewards/reward_fn/mean": 3.2581684589385986, "rewards/reward_fn/std": 0.7389606833457947, "step": 128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 467.0, "completions/max_terminated_length": 467.0, "completions/mean_length": 158.875, "completions/mean_terminated_length": 158.875, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.013684098864962342, "frac_reward_zero_std": 1.0, "grad_norm": 0.10595703125, "kl": 0.020700134336948395, "learning_rate": 7.9488e-06, "loss": 0.0008, "num_tokens": 6257413.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 259.0, "completions/max_terminated_length": 259.0, "completions/mean_length": 178.09375, "completions/mean_terminated_length": 178.09375, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "epoch": 0.013790177150737245, "frac_reward_zero_std": 0.0, "grad_norm": 1.765625, "kl": 0.015618887031450868, "learning_rate": 7.9484e-06, "loss": 0.044, "num_tokens": 6303144.0, "reward": 3.203434944152832, "reward_std": 0.39366769790649414, "rewards/reward_fn/mean": 3.203434944152832, "rewards/reward_fn/std": 0.39366772770881653, "step": 130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1560.0, "completions/max_terminated_length": 1560.0, "completions/mean_length": 206.5625, "completions/mean_terminated_length": 206.5625, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "epoch": 0.013896255436512146, "frac_reward_zero_std": 0.0, "grad_norm": 2.140625, "kl": 0.020754429628141224, "learning_rate": 7.948e-06, "loss": 0.0881, "num_tokens": 6349914.0, "reward": 3.9251515865325928, "reward_std": 0.42340630292892456, "rewards/reward_fn/mean": 3.9251515865325928, "rewards/reward_fn/std": 0.42340633273124695, "step": 131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 951.0, "completions/mean_length": 586.65625, "completions/mean_terminated_length": 539.51611328125, "completions/min_length": 231.0, "completions/min_terminated_length": 231.0, "epoch": 0.014002333722287048, "frac_reward_zero_std": 0.0, "grad_norm": 1.390625, "kl": 0.016077990061603487, "learning_rate": 7.9476e-06, "loss": 0.1381, "num_tokens": 6405231.0, "reward": 2.754066228866577, "reward_std": 0.8537517189979553, "rewards/reward_fn/mean": 2.754066228866577, "rewards/reward_fn/std": 0.8537517189979553, "step": 132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 824.0, "completions/max_terminated_length": 824.0, "completions/mean_length": 368.90625, "completions/mean_terminated_length": 368.90625, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "epoch": 0.01410841200806195, "frac_reward_zero_std": 0.0, "grad_norm": 1.6640625, "kl": 0.021284001879394054, "learning_rate": 7.947199999999999e-06, "loss": -0.0086, "num_tokens": 6432524.0, "reward": 2.1652965545654297, "reward_std": 0.8815619945526123, "rewards/reward_fn/mean": 2.1652965545654297, "rewards/reward_fn/std": 0.8815619945526123, "step": 133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 192.0, "completions/max_terminated_length": 192.0, "completions/mean_length": 113.1875, "completions/mean_terminated_length": 113.1875, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 0.014214490293836852, "frac_reward_zero_std": 0.0, "grad_norm": 2.75, "kl": 0.020680560497567058, "learning_rate": 7.9468e-06, "loss": 0.0047, "num_tokens": 6471026.0, "reward": 3.8417961597442627, "reward_std": 0.37386056780815125, "rewards/reward_fn/mean": 3.8417961597442627, "rewards/reward_fn/std": 0.37386056780815125, "step": 134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 744.0, "completions/max_terminated_length": 744.0, "completions/mean_length": 262.75, "completions/mean_terminated_length": 262.75, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 0.014320568579611753, "frac_reward_zero_std": 0.0, "grad_norm": 1.7109375, "kl": 0.023233756190165877, "learning_rate": 7.946399999999999e-06, "loss": 0.0785, "num_tokens": 6510890.0, "reward": 2.3463985919952393, "reward_std": 0.588642418384552, "rewards/reward_fn/mean": 2.3463985919952393, "rewards/reward_fn/std": 0.5886423587799072, "step": 135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 567.0, "completions/max_terminated_length": 567.0, "completions/mean_length": 202.375, "completions/mean_terminated_length": 202.375, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 0.014426646865386656, "frac_reward_zero_std": 1.0, "grad_norm": 0.11474609375, "kl": 0.02503801044076681, "learning_rate": 7.946e-06, "loss": 0.001, "num_tokens": 6554230.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 301.0, "completions/max_terminated_length": 301.0, "completions/mean_length": 234.71875, "completions/mean_terminated_length": 234.71875, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.014532725151161557, "frac_reward_zero_std": 0.0, "grad_norm": 1.46875, "kl": 0.024327925639227033, "learning_rate": 7.945599999999999e-06, "loss": 0.0013, "num_tokens": 6596909.0, "reward": 3.922415256500244, "reward_std": 0.3053688704967499, "rewards/reward_fn/mean": 3.922415256500244, "rewards/reward_fn/std": 0.3053688704967499, "step": 137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 707.0, "completions/max_terminated_length": 707.0, "completions/mean_length": 459.625, "completions/mean_terminated_length": 459.625, "completions/min_length": 248.0, "completions/min_terminated_length": 248.0, "epoch": 0.01463880343693646, "frac_reward_zero_std": 0.0, "grad_norm": 1.3359375, "kl": 0.018823521910235286, "learning_rate": 7.9452e-06, "loss": -0.0468, "num_tokens": 6642881.0, "reward": 2.581930637359619, "reward_std": 0.4790569543838501, "rewards/reward_fn/mean": 2.581930637359619, "rewards/reward_fn/std": 0.4790569543838501, "step": 138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1106.0, "completions/max_terminated_length": 1106.0, "completions/mean_length": 382.875, "completions/mean_terminated_length": 382.875, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "epoch": 0.01474488172271136, "frac_reward_zero_std": 0.0, "grad_norm": 1.2421875, "kl": 0.018572077504359186, "learning_rate": 7.944799999999999e-06, "loss": 0.0251, "num_tokens": 6690941.0, "reward": 2.7523913383483887, "reward_std": 0.3522571325302124, "rewards/reward_fn/mean": 2.7523913383483887, "rewards/reward_fn/std": 0.3522571325302124, "step": 139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 456.0, "completions/max_terminated_length": 456.0, "completions/mean_length": 182.21875, "completions/mean_terminated_length": 182.21875, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.014850960008486264, "frac_reward_zero_std": 0.0, "grad_norm": 2.65625, "kl": 0.030267908703535795, "learning_rate": 7.9444e-06, "loss": 0.0821, "num_tokens": 6738724.0, "reward": 3.928792953491211, "reward_std": 0.28060102462768555, "rewards/reward_fn/mean": 3.928792953491211, "rewards/reward_fn/std": 0.28060105443000793, "step": 140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 267.0, "completions/max_terminated_length": 267.0, "completions/mean_length": 132.21875, "completions/mean_terminated_length": 132.21875, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "epoch": 0.014957038294261165, "frac_reward_zero_std": 0.0, "grad_norm": 2.515625, "kl": 0.024672939674928784, "learning_rate": 7.943999999999999e-06, "loss": 0.0751, "num_tokens": 6791531.0, "reward": 3.088392734527588, "reward_std": 0.07315707951784134, "rewards/reward_fn/mean": 3.088392734527588, "rewards/reward_fn/std": 0.07315707206726074, "step": 141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1805.0, "completions/mean_length": 516.71875, "completions/mean_terminated_length": 467.32257080078125, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "epoch": 0.015063116580036067, "frac_reward_zero_std": 0.0, "grad_norm": 1.453125, "kl": 0.02176033239811659, "learning_rate": 7.9436e-06, "loss": 0.1231, "num_tokens": 6842882.0, "reward": 2.3468077182769775, "reward_std": 0.5988110303878784, "rewards/reward_fn/mean": 2.3468077182769775, "rewards/reward_fn/std": 0.5988109707832336, "step": 142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1190.0, "completions/max_terminated_length": 1190.0, "completions/mean_length": 387.9375, "completions/mean_terminated_length": 387.9375, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "epoch": 0.015169194865810968, "frac_reward_zero_std": 0.0, "grad_norm": 1.3125, "kl": 0.019691342720761895, "learning_rate": 7.9432e-06, "loss": -0.0128, "num_tokens": 6889664.0, "reward": 2.8758623600006104, "reward_std": 0.4385010898113251, "rewards/reward_fn/mean": 2.8758623600006104, "rewards/reward_fn/std": 0.4385010302066803, "step": 143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 273.0, "completions/max_terminated_length": 273.0, "completions/mean_length": 178.21875, "completions/mean_terminated_length": 178.21875, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 0.01527527315158587, "frac_reward_zero_std": 0.0, "grad_norm": 2.40625, "kl": 0.02775087859481573, "learning_rate": 7.9428e-06, "loss": -0.0002, "num_tokens": 6924935.0, "reward": 3.926584482192993, "reward_std": 0.4153006076812744, "rewards/reward_fn/mean": 3.926584482192993, "rewards/reward_fn/std": 0.4153006076812744, "step": 144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 491.0, "completions/max_terminated_length": 491.0, "completions/mean_length": 266.15625, "completions/mean_terminated_length": 266.15625, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "epoch": 0.015381351437360772, "frac_reward_zero_std": 0.0, "grad_norm": 1.578125, "kl": 0.018933446379378438, "learning_rate": 7.9424e-06, "loss": 0.0199, "num_tokens": 6971724.0, "reward": 3.3563008308410645, "reward_std": 0.718804121017456, "rewards/reward_fn/mean": 3.3563008308410645, "rewards/reward_fn/std": 0.7188041806221008, "step": 145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 384.0, "completions/max_terminated_length": 384.0, "completions/mean_length": 217.125, "completions/mean_terminated_length": 217.125, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "epoch": 0.015487429723135673, "frac_reward_zero_std": 0.0, "grad_norm": 1.9375, "kl": 0.022704745642840862, "learning_rate": 7.942e-06, "loss": -0.0131, "num_tokens": 7016752.0, "reward": 3.190145492553711, "reward_std": 0.38979148864746094, "rewards/reward_fn/mean": 3.190145492553711, "rewards/reward_fn/std": 0.3897915184497833, "step": 146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 975.0, "completions/max_terminated_length": 975.0, "completions/mean_length": 236.46875, "completions/mean_terminated_length": 236.46875, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 0.015593508008910576, "frac_reward_zero_std": 0.0, "grad_norm": 1.6796875, "kl": 0.02000652300193906, "learning_rate": 7.9416e-06, "loss": 0.0635, "num_tokens": 7061183.0, "reward": 2.778085947036743, "reward_std": 0.029853839427232742, "rewards/reward_fn/mean": 2.778085947036743, "rewards/reward_fn/std": 0.029853837564587593, "step": 147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 807.0, "completions/mean_length": 550.25, "completions/mean_terminated_length": 501.9354553222656, "completions/min_length": 253.0, "completions/min_terminated_length": 253.0, "epoch": 0.01569958629468548, "frac_reward_zero_std": 0.0, "grad_norm": 1.3984375, "kl": 0.01610812882427126, "learning_rate": 7.9412e-06, "loss": 0.21, "num_tokens": 7120551.0, "reward": 2.2802534103393555, "reward_std": 0.6225919127464294, "rewards/reward_fn/mean": 2.2802534103393555, "rewards/reward_fn/std": 0.6225919127464294, "step": 148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 415.0, "completions/max_terminated_length": 415.0, "completions/mean_length": 143.875, "completions/mean_terminated_length": 143.875, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "epoch": 0.015805664580460378, "frac_reward_zero_std": 1.0, "grad_norm": 0.2138671875, "kl": 0.03147526946850121, "learning_rate": 7.9408e-06, "loss": 0.0013, "num_tokens": 7158307.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 276.0, "completions/max_terminated_length": 276.0, "completions/mean_length": 154.375, "completions/mean_terminated_length": 154.375, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.01591174286623528, "frac_reward_zero_std": 1.0, "grad_norm": 0.134765625, "kl": 0.0251955462154001, "learning_rate": 7.9404e-06, "loss": 0.001, "num_tokens": 7191407.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 681.0, "completions/max_terminated_length": 681.0, "completions/mean_length": 313.90625, "completions/mean_terminated_length": 313.90625, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "epoch": 0.016017821152010184, "frac_reward_zero_std": 0.0, "grad_norm": 1.8125, "kl": 0.022513336036354303, "learning_rate": 7.94e-06, "loss": 0.091, "num_tokens": 7234572.0, "reward": 3.6232008934020996, "reward_std": 0.6518265008926392, "rewards/reward_fn/mean": 3.6232008934020996, "rewards/reward_fn/std": 0.6518264412879944, "step": 151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1814.0, "completions/mean_length": 531.8125, "completions/mean_terminated_length": 482.9031982421875, "completions/min_length": 252.0, "completions/min_terminated_length": 252.0, "epoch": 0.016123899437785087, "frac_reward_zero_std": 0.0, "grad_norm": 1.515625, "kl": 0.020188245456665754, "learning_rate": 7.9396e-06, "loss": 0.2049, "num_tokens": 7288454.0, "reward": 2.797072410583496, "reward_std": 0.6213882565498352, "rewards/reward_fn/mean": 2.797072410583496, "rewards/reward_fn/std": 0.6213882565498352, "step": 152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 952.0, "completions/max_terminated_length": 952.0, "completions/mean_length": 268.0, "completions/mean_terminated_length": 268.0, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.016229977723559986, "frac_reward_zero_std": 0.0, "grad_norm": 2.125, "kl": 0.021411948837339878, "learning_rate": 7.939199999999998e-06, "loss": 0.2013, "num_tokens": 7330694.0, "reward": 3.8916516304016113, "reward_std": 0.44769445061683655, "rewards/reward_fn/mean": 3.8916516304016113, "rewards/reward_fn/std": 0.44769442081451416, "step": 153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 388.0, "completions/max_terminated_length": 388.0, "completions/mean_length": 202.1875, "completions/mean_terminated_length": 202.1875, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "epoch": 0.01633605600933489, "frac_reward_zero_std": 0.0, "grad_norm": 2.109375, "kl": 0.02316540782339871, "learning_rate": 7.9388e-06, "loss": 0.0752, "num_tokens": 7365740.0, "reward": 2.775660753250122, "reward_std": 0.2788248360157013, "rewards/reward_fn/mean": 2.775660753250122, "rewards/reward_fn/std": 0.2788248360157013, "step": 154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 949.0, "completions/max_terminated_length": 949.0, "completions/mean_length": 270.6875, "completions/mean_terminated_length": 270.6875, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 0.01644213429510979, "frac_reward_zero_std": 0.0, "grad_norm": 1.8515625, "kl": 0.023652152274735272, "learning_rate": 7.9384e-06, "loss": 0.1038, "num_tokens": 7415170.0, "reward": 3.3111374378204346, "reward_std": 0.525631308555603, "rewards/reward_fn/mean": 3.3111374378204346, "rewards/reward_fn/std": 0.525631308555603, "step": 155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 467.0, "completions/mean_length": 357.09375, "completions/mean_terminated_length": 302.5483703613281, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "epoch": 0.016548212580884694, "frac_reward_zero_std": 0.0, "grad_norm": 1.4453125, "kl": 0.01592431077733636, "learning_rate": 7.938e-06, "loss": 0.2131, "num_tokens": 7480837.0, "reward": 3.85792875289917, "reward_std": 0.5590457320213318, "rewards/reward_fn/mean": 3.85792875289917, "rewards/reward_fn/std": 0.559045672416687, "step": 156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 458.0, "completions/max_terminated_length": 458.0, "completions/mean_length": 130.0625, "completions/mean_terminated_length": 130.0625, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "epoch": 0.016654290866659593, "frac_reward_zero_std": 1.0, "grad_norm": 0.11962890625, "kl": 0.018957374268211424, "learning_rate": 7.9376e-06, "loss": 0.0008, "num_tokens": 7512743.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 245.0, "completions/max_terminated_length": 245.0, "completions/mean_length": 118.5625, "completions/mean_terminated_length": 118.5625, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 0.016760369152434496, "frac_reward_zero_std": 1.0, "grad_norm": 0.1455078125, "kl": 0.02136626502033323, "learning_rate": 7.9372e-06, "loss": 0.0009, "num_tokens": 7547385.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 424.0, "completions/max_terminated_length": 424.0, "completions/mean_length": 287.78125, "completions/mean_terminated_length": 287.78125, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "epoch": 0.0168664474382094, "frac_reward_zero_std": 0.0, "grad_norm": 1.5, "kl": 0.022316091926768422, "learning_rate": 7.9368e-06, "loss": 0.0682, "num_tokens": 7590034.0, "reward": 2.9934346675872803, "reward_std": 0.05143573135137558, "rewards/reward_fn/mean": 2.9934346675872803, "rewards/reward_fn/std": 0.05143576115369797, "step": 159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 973.0, "completions/max_terminated_length": 973.0, "completions/mean_length": 363.21875, "completions/mean_terminated_length": 363.21875, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "epoch": 0.016972525723984302, "frac_reward_zero_std": 0.0, "grad_norm": 1.5, "kl": 0.019301145221106708, "learning_rate": 7.936399999999999e-06, "loss": 0.0424, "num_tokens": 7631033.0, "reward": 2.5955896377563477, "reward_std": 0.4019843637943268, "rewards/reward_fn/mean": 2.5955896377563477, "rewards/reward_fn/std": 0.40198445320129395, "step": 160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 892.0, "completions/max_terminated_length": 892.0, "completions/mean_length": 253.625, "completions/mean_terminated_length": 253.625, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.0170786040097592, "frac_reward_zero_std": 0.0, "grad_norm": 2.296875, "kl": 0.023140242788940668, "learning_rate": 7.936e-06, "loss": -0.2224, "num_tokens": 7667917.0, "reward": 1.8273133039474487, "reward_std": 0.1830357313156128, "rewards/reward_fn/mean": 1.8273133039474487, "rewards/reward_fn/std": 0.1830357015132904, "step": 161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1130.0, "completions/max_terminated_length": 1130.0, "completions/mean_length": 290.78125, "completions/mean_terminated_length": 290.78125, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 0.017184682295534104, "frac_reward_zero_std": 0.0, "grad_norm": 1.703125, "kl": 0.018885845551267266, "learning_rate": 7.935599999999999e-06, "loss": -0.072, "num_tokens": 7708902.0, "reward": 2.6470470428466797, "reward_std": 0.04726897180080414, "rewards/reward_fn/mean": 2.6470470428466797, "rewards/reward_fn/std": 0.04726899042725563, "step": 162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 208.0, "completions/max_terminated_length": 208.0, "completions/mean_length": 121.34375, "completions/mean_terminated_length": 121.34375, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.017290760581309007, "frac_reward_zero_std": 1.0, "grad_norm": 0.0849609375, "kl": 0.014050235971808434, "learning_rate": 7.9352e-06, "loss": 0.0006, "num_tokens": 7754129.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 609.0, "completions/max_terminated_length": 609.0, "completions/mean_length": 308.71875, "completions/mean_terminated_length": 308.71875, "completions/min_length": 241.0, "completions/min_terminated_length": 241.0, "epoch": 0.01739683886708391, "frac_reward_zero_std": 0.0, "grad_norm": 1.421875, "kl": 0.015304000582545996, "learning_rate": 7.934799999999999e-06, "loss": -0.0026, "num_tokens": 7799112.0, "reward": 3.7717292308807373, "reward_std": 0.6454416513442993, "rewards/reward_fn/mean": 3.7717292308807373, "rewards/reward_fn/std": 0.6454416513442993, "step": 164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 459.0, "completions/max_terminated_length": 459.0, "completions/mean_length": 252.4375, "completions/mean_terminated_length": 252.4375, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 0.01750291715285881, "frac_reward_zero_std": 0.0, "grad_norm": 1.796875, "kl": 0.02483147452585399, "learning_rate": 7.9344e-06, "loss": -0.0213, "num_tokens": 7819094.0, "reward": 3.130520820617676, "reward_std": 0.9921421408653259, "rewards/reward_fn/mean": 3.130520820617676, "rewards/reward_fn/std": 0.9921420812606812, "step": 165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 998.0, "completions/max_terminated_length": 998.0, "completions/mean_length": 252.5, "completions/mean_terminated_length": 252.5, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "epoch": 0.01760899543863371, "frac_reward_zero_std": 0.0, "grad_norm": 1.609375, "kl": 0.022684934083372355, "learning_rate": 7.934e-06, "loss": 0.1033, "num_tokens": 7873542.0, "reward": 2.9395103454589844, "reward_std": 0.25097665190696716, "rewards/reward_fn/mean": 2.9395103454589844, "rewards/reward_fn/std": 0.2509766221046448, "step": 166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 301.0, "completions/max_terminated_length": 301.0, "completions/mean_length": 96.8125, "completions/mean_terminated_length": 96.8125, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "epoch": 0.017715073724408614, "frac_reward_zero_std": 1.0, "grad_norm": 0.142578125, "kl": 0.020881250500679016, "learning_rate": 7.9336e-06, "loss": 0.0008, "num_tokens": 7918592.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 942.0, "completions/max_terminated_length": 942.0, "completions/mean_length": 353.75, "completions/mean_terminated_length": 353.75, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 0.017821152010183517, "frac_reward_zero_std": 0.0, "grad_norm": 1.6875, "kl": 0.01776517287362367, "learning_rate": 7.9332e-06, "loss": 0.2052, "num_tokens": 7965880.0, "reward": 2.8560633659362793, "reward_std": 0.4875142276287079, "rewards/reward_fn/mean": 2.8560633659362793, "rewards/reward_fn/std": 0.4875142276287079, "step": 168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 744.0, "completions/max_terminated_length": 744.0, "completions/mean_length": 256.875, "completions/mean_terminated_length": 256.875, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "epoch": 0.017927230295958416, "frac_reward_zero_std": 0.0, "grad_norm": 1.5859375, "kl": 0.022131944191642106, "learning_rate": 7.9328e-06, "loss": -0.0649, "num_tokens": 8018580.0, "reward": 3.7245075702667236, "reward_std": 0.8427301645278931, "rewards/reward_fn/mean": 3.7245075702667236, "rewards/reward_fn/std": 0.8427301645278931, "step": 169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 744.0, "completions/max_terminated_length": 744.0, "completions/mean_length": 441.9375, "completions/mean_terminated_length": 441.9375, "completions/min_length": 247.0, "completions/min_terminated_length": 247.0, "epoch": 0.01803330858173332, "frac_reward_zero_std": 0.0, "grad_norm": 1.5234375, "kl": 0.01810378080699593, "learning_rate": 7.9324e-06, "loss": -0.0885, "num_tokens": 8068722.0, "reward": 2.511117458343506, "reward_std": 0.7322016954421997, "rewards/reward_fn/mean": 2.511117458343506, "rewards/reward_fn/std": 0.7322016954421997, "step": 170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1183.0, "completions/max_terminated_length": 1183.0, "completions/mean_length": 209.9375, "completions/mean_terminated_length": 209.9375, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.018139386867508222, "frac_reward_zero_std": 0.0, "grad_norm": 2.484375, "kl": 0.024077138165012002, "learning_rate": 7.932e-06, "loss": -0.0054, "num_tokens": 8116656.0, "reward": 3.855273723602295, "reward_std": 0.48421603441238403, "rewards/reward_fn/mean": 3.855273723602295, "rewards/reward_fn/std": 0.48421603441238403, "step": 171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 301.0, "completions/max_terminated_length": 301.0, "completions/mean_length": 135.625, "completions/mean_terminated_length": 135.625, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.01824546515328312, "frac_reward_zero_std": 1.0, "grad_norm": 0.1083984375, "kl": 0.025709635578095913, "learning_rate": 7.9316e-06, "loss": 0.001, "num_tokens": 8152516.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 547.0, "completions/max_terminated_length": 547.0, "completions/mean_length": 241.5, "completions/mean_terminated_length": 241.5, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "epoch": 0.018351543439058024, "frac_reward_zero_std": 0.0, "grad_norm": 1.984375, "kl": 0.02147727902047336, "learning_rate": 7.9312e-06, "loss": -0.0254, "num_tokens": 8200052.0, "reward": 3.6303017139434814, "reward_std": 0.5629006028175354, "rewards/reward_fn/mean": 3.6303017139434814, "rewards/reward_fn/std": 0.5629005432128906, "step": 173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 296.0, "completions/max_terminated_length": 296.0, "completions/mean_length": 215.0625, "completions/mean_terminated_length": 215.0625, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.018457621724832927, "frac_reward_zero_std": 0.0, "grad_norm": 1.484375, "kl": 0.015550489188171923, "learning_rate": 7.930799999999999e-06, "loss": -0.0056, "num_tokens": 8245206.0, "reward": 3.894169330596924, "reward_std": 0.4359620213508606, "rewards/reward_fn/mean": 3.894169330596924, "rewards/reward_fn/std": 0.4359620213508606, "step": 174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 298.0, "completions/max_terminated_length": 298.0, "completions/mean_length": 182.34375, "completions/mean_terminated_length": 182.34375, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.01856370001060783, "frac_reward_zero_std": 0.0, "grad_norm": 2.375, "kl": 0.018534490489400923, "learning_rate": 7.9304e-06, "loss": -0.0642, "num_tokens": 8285089.0, "reward": 3.3171892166137695, "reward_std": 0.22977031767368317, "rewards/reward_fn/mean": 3.3171892166137695, "rewards/reward_fn/std": 0.2297702431678772, "step": 175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 248.0, "completions/max_terminated_length": 248.0, "completions/mean_length": 158.03125, "completions/mean_terminated_length": 158.03125, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "epoch": 0.01866977829638273, "frac_reward_zero_std": 0.0, "grad_norm": 2.265625, "kl": 0.01954884792212397, "learning_rate": 7.929999999999999e-06, "loss": -0.029, "num_tokens": 8314050.0, "reward": 3.89382266998291, "reward_std": 0.33542728424072266, "rewards/reward_fn/mean": 3.89382266998291, "rewards/reward_fn/std": 0.33542731404304504, "step": 176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 82.28125, "completions/mean_terminated_length": 82.28125, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "epoch": 0.01877585658215763, "frac_reward_zero_std": 1.0, "grad_norm": 0.1689453125, "kl": 0.02278315497096628, "learning_rate": 7.9296e-06, "loss": 0.0009, "num_tokens": 8348715.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 582.0, "completions/max_terminated_length": 582.0, "completions/mean_length": 213.3125, "completions/mean_terminated_length": 213.3125, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 0.018881934867932534, "frac_reward_zero_std": 0.0, "grad_norm": 1.8046875, "kl": 0.020352299557998776, "learning_rate": 7.9292e-06, "loss": 0.0348, "num_tokens": 8389077.0, "reward": 3.964296340942383, "reward_std": 0.2019711285829544, "rewards/reward_fn/mean": 3.964296340942383, "rewards/reward_fn/std": 0.2019711583852768, "step": 178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 426.0, "completions/max_terminated_length": 426.0, "completions/mean_length": 245.90625, "completions/mean_terminated_length": 245.90625, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 0.018988013153707437, "frac_reward_zero_std": 0.0, "grad_norm": 1.5234375, "kl": 0.01580990757793188, "learning_rate": 7.9288e-06, "loss": -0.0487, "num_tokens": 8435986.0, "reward": 3.861480712890625, "reward_std": 0.545066773891449, "rewards/reward_fn/mean": 3.861480712890625, "rewards/reward_fn/std": 0.545066773891449, "step": 179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 284.0, "completions/max_terminated_length": 284.0, "completions/mean_length": 145.4375, "completions/mean_terminated_length": 145.4375, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "epoch": 0.019094091439482336, "frac_reward_zero_std": 1.0, "grad_norm": 0.10888671875, "kl": 0.01902392355259508, "learning_rate": 7.9284e-06, "loss": 0.0008, "num_tokens": 8471936.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 254.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 184.25, "completions/mean_terminated_length": 184.25, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 0.01920016972525724, "frac_reward_zero_std": 0.0, "grad_norm": 1.796875, "kl": 0.022345099016092718, "learning_rate": 7.928e-06, "loss": -0.0118, "num_tokens": 8517768.0, "reward": 3.9072141647338867, "reward_std": 0.2347065657377243, "rewards/reward_fn/mean": 3.9072141647338867, "rewards/reward_fn/std": 0.2347065508365631, "step": 181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 227.0, "completions/max_terminated_length": 227.0, "completions/mean_length": 170.59375, "completions/mean_terminated_length": 170.59375, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.019306248011032142, "frac_reward_zero_std": 0.0, "grad_norm": 1.9375, "kl": 0.022710592485964298, "learning_rate": 7.9276e-06, "loss": -0.0182, "num_tokens": 8545371.0, "reward": 3.7901389598846436, "reward_std": 0.6630701422691345, "rewards/reward_fn/mean": 3.7901389598846436, "rewards/reward_fn/std": 0.6630700826644897, "step": 182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1487.0, "completions/max_terminated_length": 1487.0, "completions/mean_length": 465.59375, "completions/mean_terminated_length": 465.59375, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "epoch": 0.019412326296807045, "frac_reward_zero_std": 0.0, "grad_norm": 1.609375, "kl": 0.017950538313016295, "learning_rate": 7.9272e-06, "loss": 0.1813, "num_tokens": 8598190.0, "reward": 2.7570290565490723, "reward_std": 0.33804285526275635, "rewards/reward_fn/mean": 2.7570290565490723, "rewards/reward_fn/std": 0.33804285526275635, "step": 183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 367.0, "completions/max_terminated_length": 367.0, "completions/mean_length": 183.375, "completions/mean_terminated_length": 183.375, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "epoch": 0.019518404582581944, "frac_reward_zero_std": 0.0, "grad_norm": 2.078125, "kl": 0.021848752396181226, "learning_rate": 7.9268e-06, "loss": -0.0899, "num_tokens": 8644410.0, "reward": 3.5412044525146484, "reward_std": 0.5812621116638184, "rewards/reward_fn/mean": 3.5412044525146484, "rewards/reward_fn/std": 0.5812621116638184, "step": 184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 518.0, "completions/max_terminated_length": 518.0, "completions/mean_length": 315.0, "completions/mean_terminated_length": 315.0, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "epoch": 0.019624482868356847, "frac_reward_zero_std": 0.0, "grad_norm": 1.6796875, "kl": 0.017826329451054335, "learning_rate": 7.9264e-06, "loss": 0.0526, "num_tokens": 8683802.0, "reward": 2.4369404315948486, "reward_std": 0.47755300998687744, "rewards/reward_fn/mean": 2.4369404315948486, "rewards/reward_fn/std": 0.47755295038223267, "step": 185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1280.0, "completions/max_terminated_length": 1280.0, "completions/mean_length": 287.75, "completions/mean_terminated_length": 287.75, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.01973056115413175, "frac_reward_zero_std": 0.0, "grad_norm": 1.640625, "kl": 0.015661925077438354, "learning_rate": 7.926e-06, "loss": -0.0674, "num_tokens": 8740626.0, "reward": 2.8808302879333496, "reward_std": 0.07162141054868698, "rewards/reward_fn/mean": 2.8808302879333496, "rewards/reward_fn/std": 0.07162139564752579, "step": 186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 492.0, "completions/max_terminated_length": 492.0, "completions/mean_length": 237.90625, "completions/mean_terminated_length": 237.90625, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 0.019836639439906652, "frac_reward_zero_std": 0.0, "grad_norm": 2.390625, "kl": 0.025629171170294285, "learning_rate": 7.925599999999999e-06, "loss": 0.0427, "num_tokens": 8785615.0, "reward": 3.9353818893432617, "reward_std": 0.25444263219833374, "rewards/reward_fn/mean": 3.9353818893432617, "rewards/reward_fn/std": 0.25444263219833374, "step": 187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 585.0, "completions/max_terminated_length": 585.0, "completions/mean_length": 217.75, "completions/mean_terminated_length": 217.75, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 0.019942717725681552, "frac_reward_zero_std": 0.0, "grad_norm": 2.015625, "kl": 0.023751557571813464, "learning_rate": 7.9252e-06, "loss": 0.0619, "num_tokens": 8826823.0, "reward": 3.165792465209961, "reward_std": 0.08262227475643158, "rewards/reward_fn/mean": 3.165792465209961, "rewards/reward_fn/std": 0.08262225985527039, "step": 188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 974.0, "completions/max_terminated_length": 974.0, "completions/mean_length": 357.375, "completions/mean_terminated_length": 357.375, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "epoch": 0.020048796011456455, "frac_reward_zero_std": 0.0, "grad_norm": 1.3046875, "kl": 0.014718591351993382, "learning_rate": 7.9248e-06, "loss": -0.0545, "num_tokens": 8860371.0, "reward": 3.8873002529144287, "reward_std": 0.35631558299064636, "rewards/reward_fn/mean": 3.8873002529144287, "rewards/reward_fn/std": 0.35631558299064636, "step": 189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 586.0, "completions/max_terminated_length": 586.0, "completions/mean_length": 393.5625, "completions/mean_terminated_length": 393.5625, "completions/min_length": 232.0, "completions/min_terminated_length": 232.0, "epoch": 0.020154874297231357, "frac_reward_zero_std": 0.0, "grad_norm": 1.5859375, "kl": 0.01423095993231982, "learning_rate": 7.9244e-06, "loss": 0.0451, "num_tokens": 8908069.0, "reward": 2.73614239692688, "reward_std": 0.17838501930236816, "rewards/reward_fn/mean": 2.73614239692688, "rewards/reward_fn/std": 0.17838500440120697, "step": 190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1880.0, "completions/max_terminated_length": 1880.0, "completions/mean_length": 583.5625, "completions/mean_terminated_length": 583.5625, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "epoch": 0.02026095258300626, "frac_reward_zero_std": 0.0, "grad_norm": 1.3203125, "kl": 0.014432973344810307, "learning_rate": 7.924e-06, "loss": -0.0538, "num_tokens": 8964215.0, "reward": 2.671628713607788, "reward_std": 0.659324586391449, "rewards/reward_fn/mean": 2.671628713607788, "rewards/reward_fn/std": 0.659324586391449, "step": 191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 614.0, "completions/max_terminated_length": 614.0, "completions/mean_length": 149.53125, "completions/mean_terminated_length": 149.53125, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 0.02036703086878116, "frac_reward_zero_std": 1.0, "grad_norm": 0.09130859375, "kl": 0.016666988376528025, "learning_rate": 7.9236e-06, "loss": 0.0007, "num_tokens": 9010280.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 513.0, "completions/max_terminated_length": 513.0, "completions/mean_length": 169.75, "completions/mean_terminated_length": 169.75, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "epoch": 0.020473109154556062, "frac_reward_zero_std": 1.0, "grad_norm": 0.0888671875, "kl": 0.022206315770745277, "learning_rate": 7.923199999999999e-06, "loss": 0.0009, "num_tokens": 9037280.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 554.0, "completions/max_terminated_length": 554.0, "completions/mean_length": 176.96875, "completions/mean_terminated_length": 176.96875, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.020579187440330965, "frac_reward_zero_std": 1.0, "grad_norm": 0.11279296875, "kl": 0.023108911118470132, "learning_rate": 7.9228e-06, "loss": 0.0009, "num_tokens": 9063647.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 183.0, "completions/max_terminated_length": 183.0, "completions/mean_length": 92.46875, "completions/mean_terminated_length": 92.46875, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "epoch": 0.020685265726105868, "frac_reward_zero_std": 0.0, "grad_norm": 2.21875, "kl": 0.022209690301679075, "learning_rate": 7.922399999999999e-06, "loss": 0.0086, "num_tokens": 9101102.0, "reward": 3.0705783367156982, "reward_std": 0.04061302915215492, "rewards/reward_fn/mean": 3.0705783367156982, "rewards/reward_fn/std": 0.04061301052570343, "step": 195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1076.0, "completions/max_terminated_length": 1076.0, "completions/mean_length": 302.21875, "completions/mean_terminated_length": 302.21875, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "epoch": 0.020791344011880767, "frac_reward_zero_std": 0.0, "grad_norm": 1.953125, "kl": 0.019014439545571804, "learning_rate": 7.922e-06, "loss": 0.0131, "num_tokens": 9131189.0, "reward": 3.8880209922790527, "reward_std": 0.35388144850730896, "rewards/reward_fn/mean": 3.8880209922790527, "rewards/reward_fn/std": 0.3538813889026642, "step": 196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 748.0, "completions/max_terminated_length": 748.0, "completions/mean_length": 282.78125, "completions/mean_terminated_length": 282.78125, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.02089742229765567, "frac_reward_zero_std": 0.0, "grad_norm": 1.5546875, "kl": 0.01936782174743712, "learning_rate": 7.921599999999999e-06, "loss": 0.0349, "num_tokens": 9180334.0, "reward": 2.780134677886963, "reward_std": 0.2310194969177246, "rewards/reward_fn/mean": 2.780134677886963, "rewards/reward_fn/std": 0.231019526720047, "step": 197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 155.0, "completions/max_terminated_length": 155.0, "completions/mean_length": 101.21875, "completions/mean_terminated_length": 101.21875, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "epoch": 0.021003500583430573, "frac_reward_zero_std": 1.0, "grad_norm": 0.181640625, "kl": 0.0338819632306695, "learning_rate": 7.9212e-06, "loss": 0.0014, "num_tokens": 9226933.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 674.0, "completions/max_terminated_length": 674.0, "completions/mean_length": 237.3125, "completions/mean_terminated_length": 237.3125, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 0.021109578869205475, "frac_reward_zero_std": 0.0, "grad_norm": 1.796875, "kl": 0.0204662608448416, "learning_rate": 7.920799999999999e-06, "loss": 0.0692, "num_tokens": 9264927.0, "reward": 3.256192207336426, "reward_std": 0.5834986567497253, "rewards/reward_fn/mean": 3.256192207336426, "rewards/reward_fn/std": 0.5834985971450806, "step": 199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 852.0, "completions/max_terminated_length": 852.0, "completions/mean_length": 245.125, "completions/mean_terminated_length": 245.125, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 0.021215657154980375, "frac_reward_zero_std": 0.0, "grad_norm": 1.9296875, "kl": 0.021983513375744224, "learning_rate": 7.9204e-06, "loss": 0.0059, "num_tokens": 9310723.0, "reward": 3.7263994216918945, "reward_std": 0.7355256080627441, "rewards/reward_fn/mean": 3.7263994216918945, "rewards/reward_fn/std": 0.7355256080627441, "step": 200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1724.0, "completions/max_terminated_length": 1724.0, "completions/mean_length": 385.3125, "completions/mean_terminated_length": 385.3125, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 0.021321735440755277, "frac_reward_zero_std": 0.0, "grad_norm": 1.7109375, "kl": 0.017621309030801058, "learning_rate": 7.92e-06, "loss": -0.0468, "num_tokens": 9355341.0, "reward": 3.0219438076019287, "reward_std": 0.6457244157791138, "rewards/reward_fn/mean": 3.0219438076019287, "rewards/reward_fn/std": 0.6457244157791138, "step": 201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1825.0, "completions/max_terminated_length": 1825.0, "completions/mean_length": 460.375, "completions/mean_terminated_length": 460.375, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "epoch": 0.02142781372653018, "frac_reward_zero_std": 0.0, "grad_norm": 1.390625, "kl": 0.017469807295128703, "learning_rate": 7.9196e-06, "loss": 0.0556, "num_tokens": 9411481.0, "reward": 2.799130439758301, "reward_std": 0.6681373715400696, "rewards/reward_fn/mean": 2.799130439758301, "rewards/reward_fn/std": 0.6681373715400696, "step": 202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1390.0, "completions/max_terminated_length": 1390.0, "completions/mean_length": 471.875, "completions/mean_terminated_length": 471.875, "completions/min_length": 267.0, "completions/min_terminated_length": 267.0, "epoch": 0.02153389201230508, "frac_reward_zero_std": 0.0, "grad_norm": 1.1796875, "kl": 0.014571573003195226, "learning_rate": 7.9192e-06, "loss": -0.0191, "num_tokens": 9467445.0, "reward": 2.55147647857666, "reward_std": 0.5883057117462158, "rewards/reward_fn/mean": 2.55147647857666, "rewards/reward_fn/std": 0.5883057117462158, "step": 203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 577.0, "completions/max_terminated_length": 577.0, "completions/mean_length": 377.40625, "completions/mean_terminated_length": 377.40625, "completions/min_length": 224.0, "completions/min_terminated_length": 224.0, "epoch": 0.021639970298079982, "frac_reward_zero_std": 0.0, "grad_norm": 1.8125, "kl": 0.020812442991882563, "learning_rate": 7.9188e-06, "loss": 0.0307, "num_tokens": 9511202.0, "reward": 3.7836437225341797, "reward_std": 0.42160654067993164, "rewards/reward_fn/mean": 3.7836437225341797, "rewards/reward_fn/std": 0.4216066002845764, "step": 204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1126.0, "completions/max_terminated_length": 1126.0, "completions/mean_length": 341.8125, "completions/mean_terminated_length": 341.8125, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "epoch": 0.021746048583854885, "frac_reward_zero_std": 0.0, "grad_norm": 1.7734375, "kl": 0.019275061087682843, "learning_rate": 7.9184e-06, "loss": 0.009, "num_tokens": 9543420.0, "reward": 3.574246644973755, "reward_std": 0.8010706901550293, "rewards/reward_fn/mean": 3.574246644973755, "rewards/reward_fn/std": 0.8010706901550293, "step": 205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1346.0, "completions/max_terminated_length": 1346.0, "completions/mean_length": 359.125, "completions/mean_terminated_length": 359.125, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "epoch": 0.021852126869629788, "frac_reward_zero_std": 0.0, "grad_norm": 1.5078125, "kl": 0.020105792675167322, "learning_rate": 7.918e-06, "loss": 0.0532, "num_tokens": 9602560.0, "reward": 3.616807460784912, "reward_std": 0.6793028712272644, "rewards/reward_fn/mean": 3.616807460784912, "rewards/reward_fn/std": 0.6793028116226196, "step": 206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1582.0, "completions/max_terminated_length": 1582.0, "completions/mean_length": 326.9375, "completions/mean_terminated_length": 326.9375, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "epoch": 0.021958205155404687, "frac_reward_zero_std": 0.0, "grad_norm": 1.8515625, "kl": 0.0190516859292984, "learning_rate": 7.9176e-06, "loss": 0.0137, "num_tokens": 9643486.0, "reward": 3.9254260063171387, "reward_std": 0.4218538701534271, "rewards/reward_fn/mean": 3.9254260063171387, "rewards/reward_fn/std": 0.42185384035110474, "step": 207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1806.0, "completions/max_terminated_length": 1806.0, "completions/mean_length": 585.125, "completions/mean_terminated_length": 585.125, "completions/min_length": 291.0, "completions/min_terminated_length": 291.0, "epoch": 0.02206428344117959, "frac_reward_zero_std": 0.0, "grad_norm": 1.0625, "kl": 0.013375958427786827, "learning_rate": 7.9172e-06, "loss": -0.0082, "num_tokens": 9693762.0, "reward": 3.8153269290924072, "reward_std": 0.6042912602424622, "rewards/reward_fn/mean": 3.8153269290924072, "rewards/reward_fn/std": 0.6042913198471069, "step": 208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 869.0, "completions/max_terminated_length": 869.0, "completions/mean_length": 222.5625, "completions/mean_terminated_length": 222.5625, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 0.022170361726954493, "frac_reward_zero_std": 0.0, "grad_norm": 2.09375, "kl": 0.021646051667630672, "learning_rate": 7.9168e-06, "loss": -0.1015, "num_tokens": 9732404.0, "reward": 3.488537311553955, "reward_std": 0.6231763958930969, "rewards/reward_fn/mean": 3.488537311553955, "rewards/reward_fn/std": 0.6231764554977417, "step": 209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1840.0, "completions/mean_length": 450.4375, "completions/mean_terminated_length": 398.9032287597656, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "epoch": 0.022276440012729395, "frac_reward_zero_std": 0.0, "grad_norm": 1.484375, "kl": 0.01734934072010219, "learning_rate": 7.9164e-06, "loss": 0.0922, "num_tokens": 9783682.0, "reward": 3.4365906715393066, "reward_std": 1.0417983531951904, "rewards/reward_fn/mean": 3.4365906715393066, "rewards/reward_fn/std": 1.0417983531951904, "step": 210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 270.0, "completions/max_terminated_length": 270.0, "completions/mean_length": 106.09375, "completions/mean_terminated_length": 106.09375, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "epoch": 0.022382518298504295, "frac_reward_zero_std": 1.0, "grad_norm": 0.1162109375, "kl": 0.02026362717151642, "learning_rate": 7.916e-06, "loss": 0.0008, "num_tokens": 9824517.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 818.0, "completions/max_terminated_length": 818.0, "completions/mean_length": 200.875, "completions/mean_terminated_length": 200.875, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "epoch": 0.022488596584279198, "frac_reward_zero_std": 0.0, "grad_norm": 2.203125, "kl": 0.021493054926395416, "learning_rate": 7.9156e-06, "loss": -0.0043, "num_tokens": 9866497.0, "reward": 3.8106346130371094, "reward_std": 0.6165704727172852, "rewards/reward_fn/mean": 3.8106346130371094, "rewards/reward_fn/std": 0.6165704131126404, "step": 212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1193.0, "completions/max_terminated_length": 1193.0, "completions/mean_length": 247.1875, "completions/mean_terminated_length": 247.1875, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 0.0225946748700541, "frac_reward_zero_std": 0.0, "grad_norm": 2.53125, "kl": 0.022569065680727363, "learning_rate": 7.9152e-06, "loss": 0.0482, "num_tokens": 9910567.0, "reward": 3.931674003601074, "reward_std": 0.38650935888290405, "rewards/reward_fn/mean": 3.931674003601074, "rewards/reward_fn/std": 0.3865092992782593, "step": 213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1417.0, "completions/max_terminated_length": 1417.0, "completions/mean_length": 516.78125, "completions/mean_terminated_length": 516.78125, "completions/min_length": 295.0, "completions/min_terminated_length": 295.0, "epoch": 0.022700753155829003, "frac_reward_zero_std": 0.0, "grad_norm": 1.1640625, "kl": 0.015229119802825153, "learning_rate": 7.9148e-06, "loss": -0.001, "num_tokens": 9961536.0, "reward": 2.6889023780822754, "reward_std": 0.4305468201637268, "rewards/reward_fn/mean": 2.6889023780822754, "rewards/reward_fn/std": 0.4305467903614044, "step": 214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 631.0, "completions/max_terminated_length": 631.0, "completions/mean_length": 286.71875, "completions/mean_terminated_length": 286.71875, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "epoch": 0.022806831441603902, "frac_reward_zero_std": 0.0, "grad_norm": 1.7109375, "kl": 0.019226033822633326, "learning_rate": 7.9144e-06, "loss": -0.0184, "num_tokens": 10009143.0, "reward": 3.134714126586914, "reward_std": 0.25494593381881714, "rewards/reward_fn/mean": 3.134714126586914, "rewards/reward_fn/std": 0.25494590401649475, "step": 215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1603.0, "completions/max_terminated_length": 1603.0, "completions/mean_length": 340.3125, "completions/mean_terminated_length": 340.3125, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 0.022912909727378805, "frac_reward_zero_std": 0.0, "grad_norm": 1.7578125, "kl": 0.02126729814335704, "learning_rate": 7.913999999999999e-06, "loss": -0.0127, "num_tokens": 10058369.0, "reward": 3.2720999717712402, "reward_std": 0.5431775450706482, "rewards/reward_fn/mean": 3.2720999717712402, "rewards/reward_fn/std": 0.543177604675293, "step": 216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1504.0, "completions/max_terminated_length": 1504.0, "completions/mean_length": 413.90625, "completions/mean_terminated_length": 413.90625, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "epoch": 0.023018988013153708, "frac_reward_zero_std": 0.0, "grad_norm": 1.5546875, "kl": 0.016389940166845918, "learning_rate": 7.9136e-06, "loss": -0.0359, "num_tokens": 10107422.0, "reward": 1.7760989665985107, "reward_std": 0.02450552210211754, "rewards/reward_fn/mean": 1.7760989665985107, "rewards/reward_fn/std": 0.0245054978877306, "step": 217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 495.0, "completions/max_terminated_length": 495.0, "completions/mean_length": 293.40625, "completions/mean_terminated_length": 293.40625, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "epoch": 0.02312506629892861, "frac_reward_zero_std": 0.0, "grad_norm": 1.796875, "kl": 0.023776356363669038, "learning_rate": 7.913199999999999e-06, "loss": 0.0182, "num_tokens": 10158091.0, "reward": 2.7377381324768066, "reward_std": 0.02857634611427784, "rewards/reward_fn/mean": 2.7377381324768066, "rewards/reward_fn/std": 0.028576355427503586, "step": 218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1972.0, "completions/max_terminated_length": 1972.0, "completions/mean_length": 523.0, "completions/mean_terminated_length": 523.0, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.02323114458470351, "frac_reward_zero_std": 0.0, "grad_norm": 1.375, "kl": 0.021222119219601154, "learning_rate": 7.9128e-06, "loss": 0.0815, "num_tokens": 10225227.0, "reward": 2.895890951156616, "reward_std": 1.194340705871582, "rewards/reward_fn/mean": 2.895890951156616, "rewards/reward_fn/std": 1.194340705871582, "step": 219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 921.0, "completions/max_terminated_length": 921.0, "completions/mean_length": 268.84375, "completions/mean_terminated_length": 268.84375, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 0.023337222870478413, "frac_reward_zero_std": 0.0, "grad_norm": 1.859375, "kl": 0.02598167071118951, "learning_rate": 7.912399999999999e-06, "loss": 0.0997, "num_tokens": 10265094.0, "reward": 3.1712889671325684, "reward_std": 0.5284618735313416, "rewards/reward_fn/mean": 3.1712889671325684, "rewards/reward_fn/std": 0.5284618735313416, "step": 220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1131.0, "completions/max_terminated_length": 1131.0, "completions/mean_length": 407.21875, "completions/mean_terminated_length": 407.21875, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "epoch": 0.023443301156253316, "frac_reward_zero_std": 0.0, "grad_norm": 1.53125, "kl": 0.01953859266359359, "learning_rate": 7.912e-06, "loss": 0.0176, "num_tokens": 10315277.0, "reward": 2.7439894676208496, "reward_std": 0.05666489899158478, "rewards/reward_fn/mean": 2.7439894676208496, "rewards/reward_fn/std": 0.056664880365133286, "step": 221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 946.0, "completions/max_terminated_length": 946.0, "completions/mean_length": 309.15625, "completions/mean_terminated_length": 309.15625, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.02354937944202822, "frac_reward_zero_std": 0.0, "grad_norm": 1.609375, "kl": 0.021818128880113363, "learning_rate": 7.911599999999999e-06, "loss": 0.0998, "num_tokens": 10357138.0, "reward": 3.8890299797058105, "reward_std": 0.3507172167301178, "rewards/reward_fn/mean": 3.8890299797058105, "rewards/reward_fn/std": 0.3507172167301178, "step": 222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 787.0, "completions/max_terminated_length": 787.0, "completions/mean_length": 281.21875, "completions/mean_terminated_length": 281.21875, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "epoch": 0.023655457727803118, "frac_reward_zero_std": 1.0, "grad_norm": 0.0947265625, "kl": 0.02001171070151031, "learning_rate": 7.9112e-06, "loss": 0.0008, "num_tokens": 10401177.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1162.0, "completions/max_terminated_length": 1162.0, "completions/mean_length": 338.4375, "completions/mean_terminated_length": 338.4375, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "epoch": 0.02376153601357802, "frac_reward_zero_std": 0.0, "grad_norm": 1.5546875, "kl": 0.01955355703830719, "learning_rate": 7.910799999999999e-06, "loss": 0.0121, "num_tokens": 10446823.0, "reward": 3.107459306716919, "reward_std": 0.5370301604270935, "rewards/reward_fn/mean": 3.107459306716919, "rewards/reward_fn/std": 0.5370301604270935, "step": 224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 922.0, "completions/max_terminated_length": 922.0, "completions/mean_length": 256.90625, "completions/mean_terminated_length": 256.90625, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.023867614299352923, "frac_reward_zero_std": 0.0, "grad_norm": 2.375, "kl": 0.02468311577104032, "learning_rate": 7.9104e-06, "loss": 0.2088, "num_tokens": 10490596.0, "reward": 3.9355549812316895, "reward_std": 0.25359582901000977, "rewards/reward_fn/mean": 3.9355549812316895, "rewards/reward_fn/std": 0.25359582901000977, "step": 225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1500.0, "completions/max_terminated_length": 1500.0, "completions/mean_length": 559.53125, "completions/mean_terminated_length": 559.53125, "completions/min_length": 317.0, "completions/min_terminated_length": 317.0, "epoch": 0.023973692585127826, "frac_reward_zero_std": 0.0, "grad_norm": 1.1328125, "kl": 0.01560274779330939, "learning_rate": 7.91e-06, "loss": 0.1691, "num_tokens": 10572181.0, "reward": 2.4835715293884277, "reward_std": 0.4638085961341858, "rewards/reward_fn/mean": 2.4835715293884277, "rewards/reward_fn/std": 0.4638086259365082, "step": 226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2047.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 488.25, "completions/mean_terminated_length": 488.25, "completions/min_length": 251.0, "completions/min_terminated_length": 251.0, "epoch": 0.024079770870902725, "frac_reward_zero_std": 0.0, "grad_norm": 0.90625, "kl": 0.021264664246700704, "learning_rate": 7.9096e-06, "loss": -0.0982, "num_tokens": 10619101.0, "reward": 2.9061059951782227, "reward_std": 0.22990155220031738, "rewards/reward_fn/mean": 2.9061059951782227, "rewards/reward_fn/std": 0.22990158200263977, "step": 227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1757.0, "completions/max_terminated_length": 1757.0, "completions/mean_length": 547.8125, "completions/mean_terminated_length": 547.8125, "completions/min_length": 302.0, "completions/min_terminated_length": 302.0, "epoch": 0.024185849156677628, "frac_reward_zero_std": 0.0, "grad_norm": 1.4296875, "kl": 0.023016543593257666, "learning_rate": 7.9092e-06, "loss": 0.0536, "num_tokens": 10666135.0, "reward": 2.741976022720337, "reward_std": 0.5150967240333557, "rewards/reward_fn/mean": 2.741976022720337, "rewards/reward_fn/std": 0.5150967836380005, "step": 228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 268.0, "completions/max_terminated_length": 268.0, "completions/mean_length": 200.71875, "completions/mean_terminated_length": 200.71875, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 0.02429192744245253, "frac_reward_zero_std": 1.0, "grad_norm": 0.1728515625, "kl": 0.03538187500089407, "learning_rate": 7.9088e-06, "loss": 0.0014, "num_tokens": 10700782.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1188.0, "completions/max_terminated_length": 1188.0, "completions/mean_length": 214.90625, "completions/mean_terminated_length": 214.90625, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.02439800572822743, "frac_reward_zero_std": 0.0, "grad_norm": 2.28125, "kl": 0.033329138765111566, "learning_rate": 7.9084e-06, "loss": -0.0488, "num_tokens": 10753611.0, "reward": 3.9345145225524902, "reward_std": 0.1547694057226181, "rewards/reward_fn/mean": 3.9345145225524902, "rewards/reward_fn/std": 0.1547694057226181, "step": 230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 220.0, "completions/max_terminated_length": 220.0, "completions/mean_length": 139.625, "completions/mean_terminated_length": 139.625, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "epoch": 0.024504084014002333, "frac_reward_zero_std": 1.0, "grad_norm": 0.212890625, "kl": 0.038340474013239145, "learning_rate": 7.908e-06, "loss": 0.0015, "num_tokens": 10798399.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1771.0, "completions/max_terminated_length": 1771.0, "completions/mean_length": 558.59375, "completions/mean_terminated_length": 558.59375, "completions/min_length": 289.0, "completions/min_terminated_length": 289.0, "epoch": 0.024610162299777236, "frac_reward_zero_std": 0.0, "grad_norm": 1.25, "kl": 0.018731832038611174, "learning_rate": 7.9076e-06, "loss": 0.0969, "num_tokens": 10832626.0, "reward": 2.5503268241882324, "reward_std": 0.4429418444633484, "rewards/reward_fn/mean": 2.5503268241882324, "rewards/reward_fn/std": 0.4429418444633484, "step": 232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 801.0, "completions/max_terminated_length": 801.0, "completions/mean_length": 225.78125, "completions/mean_terminated_length": 225.78125, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "epoch": 0.02471624058555214, "frac_reward_zero_std": 0.0, "grad_norm": 1.6640625, "kl": 0.023959350073710084, "learning_rate": 7.9072e-06, "loss": 0.0026, "num_tokens": 10879819.0, "reward": 2.9614815711975098, "reward_std": 0.45102646946907043, "rewards/reward_fn/mean": 2.9614815711975098, "rewards/reward_fn/std": 0.4510264992713928, "step": 233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 517.0, "completions/max_terminated_length": 517.0, "completions/mean_length": 161.75, "completions/mean_terminated_length": 161.75, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "epoch": 0.024822318871327038, "frac_reward_zero_std": 0.0, "grad_norm": 3.140625, "kl": 0.027287997072562575, "learning_rate": 7.906799999999999e-06, "loss": 0.0649, "num_tokens": 10918083.0, "reward": 3.9702796936035156, "reward_std": 0.16812357306480408, "rewards/reward_fn/mean": 3.9702796936035156, "rewards/reward_fn/std": 0.16812357306480408, "step": 234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 968.0, "completions/max_terminated_length": 968.0, "completions/mean_length": 264.8125, "completions/mean_terminated_length": 264.8125, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.02492839715710194, "frac_reward_zero_std": 0.0, "grad_norm": 1.984375, "kl": 0.026635492919012904, "learning_rate": 7.9064e-06, "loss": -0.0229, "num_tokens": 10964541.0, "reward": 3.613354206085205, "reward_std": 0.5796034336090088, "rewards/reward_fn/mean": 3.613354206085205, "rewards/reward_fn/std": 0.5796034336090088, "step": 235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1248.0, "completions/max_terminated_length": 1248.0, "completions/mean_length": 338.46875, "completions/mean_terminated_length": 338.46875, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "epoch": 0.025034475442876843, "frac_reward_zero_std": 0.0, "grad_norm": 1.28125, "kl": 0.021990014938637614, "learning_rate": 7.905999999999999e-06, "loss": -0.0474, "num_tokens": 11022316.0, "reward": 3.9591715335845947, "reward_std": 0.2309606820344925, "rewards/reward_fn/mean": 3.9591715335845947, "rewards/reward_fn/std": 0.2309606820344925, "step": 236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 599.0, "completions/max_terminated_length": 599.0, "completions/mean_length": 188.90625, "completions/mean_terminated_length": 188.90625, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.025140553728651746, "frac_reward_zero_std": 1.0, "grad_norm": 0.09716796875, "kl": 0.021487332647666335, "learning_rate": 7.9056e-06, "loss": 0.0009, "num_tokens": 11049097.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 736.0, "completions/max_terminated_length": 736.0, "completions/mean_length": 217.96875, "completions/mean_terminated_length": 217.96875, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 0.025246632014426645, "frac_reward_zero_std": 1.0, "grad_norm": 0.1181640625, "kl": 0.023882502922788262, "learning_rate": 7.9052e-06, "loss": 0.001, "num_tokens": 11109512.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1533.0, "completions/mean_length": 655.53125, "completions/mean_terminated_length": 610.6129150390625, "completions/min_length": 293.0, "completions/min_terminated_length": 293.0, "epoch": 0.025352710300201548, "frac_reward_zero_std": 0.0, "grad_norm": 1.234375, "kl": 0.01844685070682317, "learning_rate": 7.9048e-06, "loss": 0.1706, "num_tokens": 11146073.0, "reward": 1.8283706903457642, "reward_std": 0.5581537485122681, "rewards/reward_fn/mean": 1.8283706903457642, "rewards/reward_fn/std": 0.5581536889076233, "step": 239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1412.0, "completions/mean_length": 987.75, "completions/mean_terminated_length": 878.0689697265625, "completions/min_length": 423.0, "completions/min_terminated_length": 423.0, "epoch": 0.02545878858597645, "frac_reward_zero_std": 0.0, "grad_norm": 1.1640625, "kl": 0.012791063985787332, "learning_rate": 7.9044e-06, "loss": 0.2083, "num_tokens": 11222033.0, "reward": 2.3440942764282227, "reward_std": 0.8046829700469971, "rewards/reward_fn/mean": 2.3440942764282227, "rewards/reward_fn/std": 0.8046829700469971, "step": 240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 525.0, "completions/max_terminated_length": 525.0, "completions/mean_length": 244.0625, "completions/mean_terminated_length": 244.0625, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 0.025564866871751354, "frac_reward_zero_std": 0.0, "grad_norm": 2.109375, "kl": 0.02614483702927828, "learning_rate": 7.904e-06, "loss": 0.0013, "num_tokens": 11268531.0, "reward": 2.8031463623046875, "reward_std": 0.05802328139543533, "rewards/reward_fn/mean": 2.8031463623046875, "rewards/reward_fn/std": 0.05802330747246742, "step": 241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 324.0, "completions/max_terminated_length": 324.0, "completions/mean_length": 230.8125, "completions/mean_terminated_length": 230.8125, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 0.025670945157526253, "frac_reward_zero_std": 0.0, "grad_norm": 1.5546875, "kl": 0.016728723421692848, "learning_rate": 7.9036e-06, "loss": 0.032, "num_tokens": 11311533.0, "reward": 2.816776752471924, "reward_std": 0.21942509710788727, "rewards/reward_fn/mean": 2.816776752471924, "rewards/reward_fn/std": 0.21942508220672607, "step": 242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 843.0, "completions/max_terminated_length": 843.0, "completions/mean_length": 285.65625, "completions/mean_terminated_length": 285.65625, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.025777023443301156, "frac_reward_zero_std": 0.0, "grad_norm": 1.734375, "kl": 0.016763758845627308, "learning_rate": 7.903199999999999e-06, "loss": 0.0875, "num_tokens": 11335810.0, "reward": 3.3429794311523438, "reward_std": 0.5541702508926392, "rewards/reward_fn/mean": 3.3429794311523438, "rewards/reward_fn/std": 0.5541702508926392, "step": 243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 557.0, "completions/max_terminated_length": 557.0, "completions/mean_length": 222.65625, "completions/mean_terminated_length": 222.65625, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "epoch": 0.02588310172907606, "frac_reward_zero_std": 0.0, "grad_norm": 2.0625, "kl": 0.02172936638817191, "learning_rate": 7.9028e-06, "loss": -0.0466, "num_tokens": 11380663.0, "reward": 3.145770788192749, "reward_std": 0.5794350504875183, "rewards/reward_fn/mean": 3.145770788192749, "rewards/reward_fn/std": 0.5794350504875183, "step": 244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 191.0, "completions/max_terminated_length": 191.0, "completions/mean_length": 131.8125, "completions/mean_terminated_length": 131.8125, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "epoch": 0.02598918001485096, "frac_reward_zero_std": 0.0, "grad_norm": 3.421875, "kl": 0.019724910031072795, "learning_rate": 7.902399999999999e-06, "loss": 0.0672, "num_tokens": 11414193.0, "reward": 3.9744322299957275, "reward_std": 0.1446334570646286, "rewards/reward_fn/mean": 3.9744322299957275, "rewards/reward_fn/std": 0.1446334421634674, "step": 245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 652.0, "completions/max_terminated_length": 652.0, "completions/mean_length": 425.40625, "completions/mean_terminated_length": 425.40625, "completions/min_length": 258.0, "completions/min_terminated_length": 258.0, "epoch": 0.02609525830062586, "frac_reward_zero_std": 0.0, "grad_norm": 1.4296875, "kl": 0.015981771517544985, "learning_rate": 7.902e-06, "loss": -0.0546, "num_tokens": 11467390.0, "reward": 2.7777838706970215, "reward_std": 0.47854653000831604, "rewards/reward_fn/mean": 2.7777838706970215, "rewards/reward_fn/std": 0.47854653000831604, "step": 246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 862.0, "completions/max_terminated_length": 862.0, "completions/mean_length": 342.6875, "completions/mean_terminated_length": 342.6875, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "epoch": 0.026201336586400763, "frac_reward_zero_std": 0.0, "grad_norm": 1.7265625, "kl": 0.017373082577250898, "learning_rate": 7.901599999999999e-06, "loss": -0.0624, "num_tokens": 11513652.0, "reward": 2.9905588626861572, "reward_std": 0.4717329740524292, "rewards/reward_fn/mean": 2.9905588626861572, "rewards/reward_fn/std": 0.4717329442501068, "step": 247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 304.0, "completions/max_terminated_length": 304.0, "completions/mean_length": 108.5625, "completions/mean_terminated_length": 108.5625, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "epoch": 0.026307414872175666, "frac_reward_zero_std": 1.0, "grad_norm": 0.1328125, "kl": 0.02236648928374052, "learning_rate": 7.9012e-06, "loss": 0.0009, "num_tokens": 11534086.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 533.0, "completions/max_terminated_length": 533.0, "completions/mean_length": 269.40625, "completions/mean_terminated_length": 269.40625, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 0.02641349315795057, "frac_reward_zero_std": 0.0, "grad_norm": 1.6328125, "kl": 0.022781465435400605, "learning_rate": 7.9008e-06, "loss": 0.0046, "num_tokens": 11575219.0, "reward": 2.9033608436584473, "reward_std": 0.2065192610025406, "rewards/reward_fn/mean": 2.9033608436584473, "rewards/reward_fn/std": 0.2065192610025406, "step": 249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 2003.0, "completions/mean_length": 749.75, "completions/mean_terminated_length": 707.8709716796875, "completions/min_length": 348.0, "completions/min_terminated_length": 348.0, "epoch": 0.02651957144372547, "frac_reward_zero_std": 0.0, "grad_norm": 1.3125, "kl": 0.011985445278696716, "learning_rate": 7.9004e-06, "loss": 0.1933, "num_tokens": 11636395.0, "reward": 2.566638469696045, "reward_std": 0.4702640473842621, "rewards/reward_fn/mean": 2.566638469696045, "rewards/reward_fn/std": 0.4702640473842621, "step": 250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 683.0, "completions/max_terminated_length": 683.0, "completions/mean_length": 280.78125, "completions/mean_terminated_length": 280.78125, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.02662564972950037, "frac_reward_zero_std": 0.0, "grad_norm": 1.7734375, "kl": 0.01918622641824186, "learning_rate": 7.9e-06, "loss": 0.1319, "num_tokens": 11663652.0, "reward": 3.4576807022094727, "reward_std": 0.5513618588447571, "rewards/reward_fn/mean": 3.4576807022094727, "rewards/reward_fn/std": 0.5513618588447571, "step": 251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1161.0, "completions/max_terminated_length": 1161.0, "completions/mean_length": 303.875, "completions/mean_terminated_length": 303.875, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.026731728015275274, "frac_reward_zero_std": 0.0, "grad_norm": 1.6953125, "kl": 0.02559892018325627, "learning_rate": 7.8996e-06, "loss": 0.0889, "num_tokens": 11714272.0, "reward": 3.407193660736084, "reward_std": 0.6705021262168884, "rewards/reward_fn/mean": 3.407193660736084, "rewards/reward_fn/std": 0.6705020666122437, "step": 252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 969.0, "completions/max_terminated_length": 969.0, "completions/mean_length": 320.625, "completions/mean_terminated_length": 320.625, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.026837806301050177, "frac_reward_zero_std": 0.0, "grad_norm": 1.8203125, "kl": 0.01899566757492721, "learning_rate": 7.8992e-06, "loss": 0.1479, "num_tokens": 11761556.0, "reward": 3.74670672416687, "reward_std": 0.7026734948158264, "rewards/reward_fn/mean": 3.74670672416687, "rewards/reward_fn/std": 0.7026734352111816, "step": 253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 904.0, "completions/max_terminated_length": 904.0, "completions/mean_length": 545.0, "completions/mean_terminated_length": 545.0, "completions/min_length": 318.0, "completions/min_terminated_length": 318.0, "epoch": 0.026943884586825076, "frac_reward_zero_std": 0.0, "grad_norm": 1.09375, "kl": 0.013679098337888718, "learning_rate": 7.8988e-06, "loss": 0.038, "num_tokens": 11811060.0, "reward": 3.042778491973877, "reward_std": 0.6887111067771912, "rewards/reward_fn/mean": 3.042778491973877, "rewards/reward_fn/std": 0.6887110471725464, "step": 254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 793.0, "completions/max_terminated_length": 793.0, "completions/mean_length": 228.78125, "completions/mean_terminated_length": 228.78125, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.02704996287259998, "frac_reward_zero_std": 0.0, "grad_norm": 2.3125, "kl": 0.031039110152050853, "learning_rate": 7.898399999999999e-06, "loss": 0.1049, "num_tokens": 11849101.0, "reward": 3.9631972312927246, "reward_std": 0.20818859338760376, "rewards/reward_fn/mean": 3.9631972312927246, "rewards/reward_fn/std": 0.20818862318992615, "step": 255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1227.0, "completions/max_terminated_length": 1227.0, "completions/mean_length": 468.09375, "completions/mean_terminated_length": 468.09375, "completions/min_length": 231.0, "completions/min_terminated_length": 231.0, "epoch": 0.02715604115837488, "frac_reward_zero_std": 0.0, "grad_norm": 1.4453125, "kl": 0.017622511251829565, "learning_rate": 7.898e-06, "loss": 0.069, "num_tokens": 11912592.0, "reward": 3.587637186050415, "reward_std": 0.6166492700576782, "rewards/reward_fn/mean": 3.587637186050415, "rewards/reward_fn/std": 0.616649329662323, "step": 256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 769.0, "completions/max_terminated_length": 769.0, "completions/mean_length": 191.5625, "completions/mean_terminated_length": 191.5625, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 0.027262119444149784, "frac_reward_zero_std": 0.0, "grad_norm": 2.1875, "kl": 0.02548597170971334, "learning_rate": 7.897599999999999e-06, "loss": -0.0541, "num_tokens": 11946626.0, "reward": 3.064663887023926, "reward_std": 0.07985293865203857, "rewards/reward_fn/mean": 3.064663887023926, "rewards/reward_fn/std": 0.07985293865203857, "step": 257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 750.0, "completions/max_terminated_length": 750.0, "completions/mean_length": 253.1875, "completions/mean_terminated_length": 253.1875, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 0.027368197729924684, "frac_reward_zero_std": 0.0, "grad_norm": 2.3125, "kl": 0.021782919066026807, "learning_rate": 7.8972e-06, "loss": 0.1768, "num_tokens": 11987752.0, "reward": 3.9413747787475586, "reward_std": 0.23125647008419037, "rewards/reward_fn/mean": 3.9413747787475586, "rewards/reward_fn/std": 0.23125645518302917, "step": 258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 555.0, "completions/max_terminated_length": 555.0, "completions/mean_length": 218.46875, "completions/mean_terminated_length": 218.46875, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 0.027474276015699586, "frac_reward_zero_std": 1.0, "grad_norm": 0.1279296875, "kl": 0.024505440145730972, "learning_rate": 7.896799999999999e-06, "loss": 0.001, "num_tokens": 12025047.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1152.0, "completions/max_terminated_length": 1152.0, "completions/mean_length": 329.96875, "completions/mean_terminated_length": 329.96875, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "epoch": 0.02758035430147449, "frac_reward_zero_std": 1.0, "grad_norm": 0.064453125, "kl": 0.018241875804960728, "learning_rate": 7.8964e-06, "loss": 0.0007, "num_tokens": 12068118.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1117.0, "completions/max_terminated_length": 1117.0, "completions/mean_length": 265.40625, "completions/mean_terminated_length": 265.40625, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 0.02768643258724939, "frac_reward_zero_std": 0.0, "grad_norm": 1.7265625, "kl": 0.02741927863098681, "learning_rate": 7.896e-06, "loss": 0.0059, "num_tokens": 12113347.0, "reward": 3.966427803039551, "reward_std": 0.18991301953792572, "rewards/reward_fn/mean": 3.966427803039551, "rewards/reward_fn/std": 0.18991298973560333, "step": 261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 511.0, "completions/max_terminated_length": 511.0, "completions/mean_length": 277.5625, "completions/mean_terminated_length": 277.5625, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 0.02779251087302429, "frac_reward_zero_std": 0.0, "grad_norm": 1.8359375, "kl": 0.021280562039464712, "learning_rate": 7.8956e-06, "loss": 0.0978, "num_tokens": 12151765.0, "reward": 2.54994535446167, "reward_std": 0.4797287583351135, "rewards/reward_fn/mean": 2.54994535446167, "rewards/reward_fn/std": 0.4797287583351135, "step": 262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 826.0, "completions/max_terminated_length": 826.0, "completions/mean_length": 382.75, "completions/mean_terminated_length": 382.75, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "epoch": 0.027898589158799194, "frac_reward_zero_std": 0.0, "grad_norm": 1.453125, "kl": 0.018151523312553763, "learning_rate": 7.8952e-06, "loss": 0.0068, "num_tokens": 12201773.0, "reward": 3.4161887168884277, "reward_std": 0.8646740317344666, "rewards/reward_fn/mean": 3.4161887168884277, "rewards/reward_fn/std": 0.8646739721298218, "step": 263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1010.0, "completions/max_terminated_length": 1010.0, "completions/mean_length": 499.71875, "completions/mean_terminated_length": 499.71875, "completions/min_length": 267.0, "completions/min_terminated_length": 267.0, "epoch": 0.028004667444574097, "frac_reward_zero_std": 0.0, "grad_norm": 1.21875, "kl": 0.018706355476751924, "learning_rate": 7.8948e-06, "loss": -0.0482, "num_tokens": 12246948.0, "reward": 2.617680072784424, "reward_std": 0.36669453978538513, "rewards/reward_fn/mean": 2.617680072784424, "rewards/reward_fn/std": 0.3666945695877075, "step": 264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 270.0, "completions/max_terminated_length": 270.0, "completions/mean_length": 204.09375, "completions/mean_terminated_length": 204.09375, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "epoch": 0.028110745730348996, "frac_reward_zero_std": 0.0, "grad_norm": 1.859375, "kl": 0.037441390566527843, "learning_rate": 7.8944e-06, "loss": -0.026, "num_tokens": 12291847.0, "reward": 3.009129047393799, "reward_std": 0.32852211594581604, "rewards/reward_fn/mean": 3.009129047393799, "rewards/reward_fn/std": 0.32852208614349365, "step": 265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 412.0, "completions/max_terminated_length": 412.0, "completions/mean_length": 160.5625, "completions/mean_terminated_length": 160.5625, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.0282168240161239, "frac_reward_zero_std": 1.0, "grad_norm": 0.1328125, "kl": 0.020008231746032834, "learning_rate": 7.894e-06, "loss": 0.0008, "num_tokens": 12335417.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 333.0, "completions/max_terminated_length": 333.0, "completions/mean_length": 214.875, "completions/mean_terminated_length": 214.875, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 0.0283229023018988, "frac_reward_zero_std": 0.0, "grad_norm": 2.28125, "kl": 0.023937980644404888, "learning_rate": 7.8936e-06, "loss": -0.0182, "num_tokens": 12374997.0, "reward": 1.8745217323303223, "reward_std": 0.3389831781387329, "rewards/reward_fn/mean": 1.8745217323303223, "rewards/reward_fn/std": 0.3389831483364105, "step": 267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 554.0, "completions/max_terminated_length": 554.0, "completions/mean_length": 223.4375, "completions/mean_terminated_length": 223.4375, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "epoch": 0.028428980587673704, "frac_reward_zero_std": 0.0, "grad_norm": 2.171875, "kl": 0.021832899190485477, "learning_rate": 7.8932e-06, "loss": -0.1111, "num_tokens": 12416195.0, "reward": 2.994629144668579, "reward_std": 0.06882744282484055, "rewards/reward_fn/mean": 2.994629144668579, "rewards/reward_fn/std": 0.06882745027542114, "step": 268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 364.0, "completions/max_terminated_length": 364.0, "completions/mean_length": 251.25, "completions/mean_terminated_length": 251.25, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.028535058873448604, "frac_reward_zero_std": 0.0, "grad_norm": 1.9921875, "kl": 0.023315031314268708, "learning_rate": 7.8928e-06, "loss": 0.0371, "num_tokens": 12494667.0, "reward": 3.4436919689178467, "reward_std": 0.7342358827590942, "rewards/reward_fn/mean": 3.4436919689178467, "rewards/reward_fn/std": 0.7342358231544495, "step": 269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 551.0, "completions/max_terminated_length": 551.0, "completions/mean_length": 209.03125, "completions/mean_terminated_length": 209.03125, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.028641137159223506, "frac_reward_zero_std": 0.0, "grad_norm": 1.875, "kl": 0.02203134004957974, "learning_rate": 7.8924e-06, "loss": 0.0179, "num_tokens": 12551692.0, "reward": 3.892535924911499, "reward_std": 0.4451846778392792, "rewards/reward_fn/mean": 3.892535924911499, "rewards/reward_fn/std": 0.4451846778392792, "step": 270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 338.0, "completions/max_terminated_length": 338.0, "completions/mean_length": 119.3125, "completions/mean_terminated_length": 119.3125, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "epoch": 0.02874721544499841, "frac_reward_zero_std": 0.0, "grad_norm": 3.71875, "kl": 0.023293037782423198, "learning_rate": 7.892e-06, "loss": 0.2132, "num_tokens": 12590710.0, "reward": 3.9130430221557617, "reward_std": 0.2241009622812271, "rewards/reward_fn/mean": 3.9130430221557617, "rewards/reward_fn/std": 0.2241009771823883, "step": 271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 284.0, "completions/max_terminated_length": 284.0, "completions/mean_length": 172.78125, "completions/mean_terminated_length": 172.78125, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "epoch": 0.028853293730773312, "frac_reward_zero_std": 0.0, "grad_norm": 2.203125, "kl": 0.02628507581539452, "learning_rate": 7.8916e-06, "loss": 0.0783, "num_tokens": 12637647.0, "reward": 3.2433860301971436, "reward_std": 0.5124220252037048, "rewards/reward_fn/mean": 3.2433860301971436, "rewards/reward_fn/std": 0.5124220848083496, "step": 272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 896.0, "completions/max_terminated_length": 896.0, "completions/mean_length": 197.875, "completions/mean_terminated_length": 197.875, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.02895937201654821, "frac_reward_zero_std": 0.0, "grad_norm": 1.921875, "kl": 0.02240662043914199, "learning_rate": 7.8912e-06, "loss": 0.0431, "num_tokens": 12675627.0, "reward": 2.7977585792541504, "reward_std": 0.06745248287916183, "rewards/reward_fn/mean": 2.7977585792541504, "rewards/reward_fn/std": 0.06745246052742004, "step": 273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 841.0, "completions/max_terminated_length": 841.0, "completions/mean_length": 300.96875, "completions/mean_terminated_length": 300.96875, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "epoch": 0.029065450302323114, "frac_reward_zero_std": 0.0, "grad_norm": 1.5390625, "kl": 0.020187442656606436, "learning_rate": 7.890799999999999e-06, "loss": 0.0213, "num_tokens": 12713418.0, "reward": 3.9665956497192383, "reward_std": 0.1889638453722, "rewards/reward_fn/mean": 3.9665956497192383, "rewards/reward_fn/std": 0.1889638453722, "step": 274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 796.0, "completions/max_terminated_length": 796.0, "completions/mean_length": 269.59375, "completions/mean_terminated_length": 269.59375, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.029171528588098017, "frac_reward_zero_std": 0.0, "grad_norm": 2.140625, "kl": 0.024061910109594464, "learning_rate": 7.8904e-06, "loss": -0.1597, "num_tokens": 12757149.0, "reward": 2.2087552547454834, "reward_std": 0.5087428689002991, "rewards/reward_fn/mean": 2.2087552547454834, "rewards/reward_fn/std": 0.5087428092956543, "step": 275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 204.0, "completions/max_terminated_length": 204.0, "completions/mean_length": 96.21875, "completions/mean_terminated_length": 96.21875, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "epoch": 0.02927760687387292, "frac_reward_zero_std": 1.0, "grad_norm": 0.1943359375, "kl": 0.026183703215792775, "learning_rate": 7.889999999999999e-06, "loss": 0.001, "num_tokens": 12797764.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 273.0, "completions/max_terminated_length": 273.0, "completions/mean_length": 176.28125, "completions/mean_terminated_length": 176.28125, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.02938368515964782, "frac_reward_zero_std": 1.0, "grad_norm": 0.1298828125, "kl": 0.022986828815191984, "learning_rate": 7.8896e-06, "loss": 0.0009, "num_tokens": 12843533.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 527.0, "completions/max_terminated_length": 527.0, "completions/mean_length": 154.15625, "completions/mean_terminated_length": 154.15625, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 0.02948976344542272, "frac_reward_zero_std": 1.0, "grad_norm": 0.1328125, "kl": 0.021190900588408113, "learning_rate": 7.889199999999999e-06, "loss": 0.0008, "num_tokens": 12879122.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 459.0, "completions/max_terminated_length": 459.0, "completions/mean_length": 180.46875, "completions/mean_terminated_length": 180.46875, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 0.029595841731197624, "frac_reward_zero_std": 0.0, "grad_norm": 2.453125, "kl": 0.029538776027038693, "learning_rate": 7.8888e-06, "loss": 0.0242, "num_tokens": 12913409.0, "reward": 3.92836332321167, "reward_std": 0.4052387773990631, "rewards/reward_fn/mean": 3.92836332321167, "rewards/reward_fn/std": 0.4052387773990631, "step": 279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 552.0, "completions/max_terminated_length": 552.0, "completions/mean_length": 333.78125, "completions/mean_terminated_length": 333.78125, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 0.029701920016972527, "frac_reward_zero_std": 0.0, "grad_norm": 1.515625, "kl": 0.021093905437737703, "learning_rate": 7.888399999999999e-06, "loss": 0.0428, "num_tokens": 12975962.0, "reward": 3.458969831466675, "reward_std": 0.5869243144989014, "rewards/reward_fn/mean": 3.458969831466675, "rewards/reward_fn/std": 0.5869242548942566, "step": 280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 662.0, "completions/max_terminated_length": 662.0, "completions/mean_length": 189.65625, "completions/mean_terminated_length": 189.65625, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.029807998302747427, "frac_reward_zero_std": 0.0, "grad_norm": 3.453125, "kl": 0.028214870719239116, "learning_rate": 7.888e-06, "loss": 0.1478, "num_tokens": 13014543.0, "reward": 3.90067982673645, "reward_std": 0.4174049198627472, "rewards/reward_fn/mean": 3.90067982673645, "rewards/reward_fn/std": 0.4174049496650696, "step": 281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 700.0, "completions/max_terminated_length": 700.0, "completions/mean_length": 263.96875, "completions/mean_terminated_length": 263.96875, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 0.02991407658852233, "frac_reward_zero_std": 0.0, "grad_norm": 1.8203125, "kl": 0.022579851211048663, "learning_rate": 7.887599999999999e-06, "loss": -0.0188, "num_tokens": 13040206.0, "reward": 3.9292826652526855, "reward_std": 0.20690298080444336, "rewards/reward_fn/mean": 3.9292826652526855, "rewards/reward_fn/std": 0.20690296590328217, "step": 282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 629.0, "completions/mean_length": 453.9375, "completions/mean_terminated_length": 402.51611328125, "completions/min_length": 253.0, "completions/min_terminated_length": 253.0, "epoch": 0.030020154874297232, "frac_reward_zero_std": 0.0, "grad_norm": 1.53125, "kl": 0.017965498962439597, "learning_rate": 7.8872e-06, "loss": 0.1905, "num_tokens": 13089612.0, "reward": 3.508497953414917, "reward_std": 0.9100804924964905, "rewards/reward_fn/mean": 3.508497953414917, "rewards/reward_fn/std": 0.9100804924964905, "step": 283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 771.0, "completions/max_terminated_length": 771.0, "completions/mean_length": 403.90625, "completions/mean_terminated_length": 403.90625, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "epoch": 0.030126233160072135, "frac_reward_zero_std": 0.0, "grad_norm": 1.265625, "kl": 0.01794223056640476, "learning_rate": 7.8868e-06, "loss": -0.1122, "num_tokens": 13136905.0, "reward": 2.901538372039795, "reward_std": 0.2880001664161682, "rewards/reward_fn/mean": 2.901538372039795, "rewards/reward_fn/std": 0.28800004720687866, "step": 284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 737.0, "completions/max_terminated_length": 737.0, "completions/mean_length": 193.9375, "completions/mean_terminated_length": 193.9375, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "epoch": 0.030232311445847034, "frac_reward_zero_std": 0.0, "grad_norm": 2.375, "kl": 0.027612620033323765, "learning_rate": 7.8864e-06, "loss": 0.028, "num_tokens": 13170535.0, "reward": 3.2102768421173096, "reward_std": 0.1197659894824028, "rewards/reward_fn/mean": 3.2102768421173096, "rewards/reward_fn/std": 0.11976601183414459, "step": 285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 851.0, "completions/max_terminated_length": 851.0, "completions/mean_length": 256.5, "completions/mean_terminated_length": 256.5, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 0.030338389731621937, "frac_reward_zero_std": 0.0, "grad_norm": 1.7890625, "kl": 0.023041230160742998, "learning_rate": 7.886e-06, "loss": 0.0101, "num_tokens": 13207031.0, "reward": 3.574709892272949, "reward_std": 0.5251328945159912, "rewards/reward_fn/mean": 3.574709892272949, "rewards/reward_fn/std": 0.5251328945159912, "step": 286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 993.0, "completions/max_terminated_length": 993.0, "completions/mean_length": 473.625, "completions/mean_terminated_length": 473.625, "completions/min_length": 278.0, "completions/min_terminated_length": 278.0, "epoch": 0.03044446801739684, "frac_reward_zero_std": 0.0, "grad_norm": 0.99609375, "kl": 0.01666165341157466, "learning_rate": 7.8856e-06, "loss": -0.0288, "num_tokens": 13259595.0, "reward": 2.8182735443115234, "reward_std": 0.23360121250152588, "rewards/reward_fn/mean": 2.8182735443115234, "rewards/reward_fn/std": 0.23360122740268707, "step": 287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1065.0, "completions/max_terminated_length": 1065.0, "completions/mean_length": 309.34375, "completions/mean_terminated_length": 309.34375, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "epoch": 0.03055054630317174, "frac_reward_zero_std": 0.0, "grad_norm": 1.2734375, "kl": 0.019797870074398816, "learning_rate": 7.8852e-06, "loss": -0.0027, "num_tokens": 13313430.0, "reward": 3.900583505630493, "reward_std": 0.4218868315219879, "rewards/reward_fn/mean": 3.900583505630493, "rewards/reward_fn/std": 0.4218868315219879, "step": 288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1104.0, "completions/max_terminated_length": 1104.0, "completions/mean_length": 285.25, "completions/mean_terminated_length": 285.25, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.030656624588946642, "frac_reward_zero_std": 0.0, "grad_norm": 1.6171875, "kl": 0.01816290069837123, "learning_rate": 7.8848e-06, "loss": 0.0179, "num_tokens": 13357246.0, "reward": 3.9340410232543945, "reward_std": 0.26044347882270813, "rewards/reward_fn/mean": 3.9340410232543945, "rewards/reward_fn/std": 0.26044347882270813, "step": 289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2018.0, "completions/max_terminated_length": 2018.0, "completions/mean_length": 643.9375, "completions/mean_terminated_length": 643.9375, "completions/min_length": 243.0, "completions/min_terminated_length": 243.0, "epoch": 0.030762702874721545, "frac_reward_zero_std": 0.0, "grad_norm": 1.390625, "kl": 0.015570356510579586, "learning_rate": 7.8844e-06, "loss": 0.0858, "num_tokens": 13424508.0, "reward": 2.594058036804199, "reward_std": 0.24677404761314392, "rewards/reward_fn/mean": 2.594058036804199, "rewards/reward_fn/std": 0.24677406251430511, "step": 290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1715.0, "completions/mean_length": 535.375, "completions/mean_terminated_length": 486.58062744140625, "completions/min_length": 240.0, "completions/min_terminated_length": 240.0, "epoch": 0.030868781160496447, "frac_reward_zero_std": 0.0, "grad_norm": 1.3984375, "kl": 0.017655761679634452, "learning_rate": 7.884e-06, "loss": 0.1712, "num_tokens": 13456200.0, "reward": 3.049564838409424, "reward_std": 1.0932600498199463, "rewards/reward_fn/mean": 3.049564838409424, "rewards/reward_fn/std": 1.0932600498199463, "step": 291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 682.0, "completions/max_terminated_length": 682.0, "completions/mean_length": 273.4375, "completions/mean_terminated_length": 273.4375, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 0.030974859446271347, "frac_reward_zero_std": 0.0, "grad_norm": 1.671875, "kl": 0.0212192558683455, "learning_rate": 7.8836e-06, "loss": -0.0082, "num_tokens": 13495734.0, "reward": 2.9622116088867188, "reward_std": 0.20986686646938324, "rewards/reward_fn/mean": 2.9622116088867188, "rewards/reward_fn/std": 0.20986689627170563, "step": 292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1755.0, "completions/max_terminated_length": 1755.0, "completions/mean_length": 581.125, "completions/mean_terminated_length": 581.125, "completions/min_length": 239.0, "completions/min_terminated_length": 239.0, "epoch": 0.03108093773204625, "frac_reward_zero_std": 1.0, "grad_norm": 0.0732421875, "kl": 0.018238925491459668, "learning_rate": 7.8832e-06, "loss": 0.0007, "num_tokens": 13553274.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 965.0, "completions/max_terminated_length": 965.0, "completions/mean_length": 272.6875, "completions/mean_terminated_length": 272.6875, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "epoch": 0.031187016017821152, "frac_reward_zero_std": 1.0, "grad_norm": 0.150390625, "kl": 0.021185664576478302, "learning_rate": 7.882799999999998e-06, "loss": 0.0008, "num_tokens": 13605840.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 726.0, "completions/max_terminated_length": 726.0, "completions/mean_length": 239.71875, "completions/mean_terminated_length": 239.71875, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 0.03129309430359605, "frac_reward_zero_std": 0.0, "grad_norm": 2.328125, "kl": 0.027498473413288593, "learning_rate": 7.8824e-06, "loss": 0.1091, "num_tokens": 13650791.0, "reward": 3.749105215072632, "reward_std": 0.48261404037475586, "rewards/reward_fn/mean": 3.749105215072632, "rewards/reward_fn/std": 0.48261401057243347, "step": 295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 326.0, "completions/max_terminated_length": 326.0, "completions/mean_length": 110.8125, "completions/mean_terminated_length": 110.8125, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "epoch": 0.03139917258937096, "frac_reward_zero_std": 1.0, "grad_norm": 0.1474609375, "kl": 0.03405557991936803, "learning_rate": 7.882e-06, "loss": 0.0014, "num_tokens": 13671649.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1089.0, "completions/max_terminated_length": 1089.0, "completions/mean_length": 365.84375, "completions/mean_terminated_length": 365.84375, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "epoch": 0.03150525087514586, "frac_reward_zero_std": 1.0, "grad_norm": 0.10546875, "kl": 0.021357741905376315, "learning_rate": 7.8816e-06, "loss": 0.0009, "num_tokens": 13722908.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 460.0, "completions/max_terminated_length": 460.0, "completions/mean_length": 301.8125, "completions/mean_terminated_length": 301.8125, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.031611329160920756, "frac_reward_zero_std": 0.0, "grad_norm": 1.9453125, "kl": 0.02022064500488341, "learning_rate": 7.8812e-06, "loss": 0.0568, "num_tokens": 13758518.0, "reward": 3.889503002166748, "reward_std": 0.4602510929107666, "rewards/reward_fn/mean": 3.889503002166748, "rewards/reward_fn/std": 0.460251122713089, "step": 298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1902.0, "completions/mean_length": 824.1875, "completions/mean_terminated_length": 784.7096557617188, "completions/min_length": 385.0, "completions/min_terminated_length": 385.0, "epoch": 0.03171740744669566, "frac_reward_zero_std": 0.0, "grad_norm": 0.90625, "kl": 0.011437640176154673, "learning_rate": 7.880799999999999e-06, "loss": 0.1652, "num_tokens": 13835068.0, "reward": 3.2491872310638428, "reward_std": 0.6871760487556458, "rewards/reward_fn/mean": 3.2491872310638428, "rewards/reward_fn/std": 0.6871760487556458, "step": 299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1280.0, "completions/max_terminated_length": 1280.0, "completions/mean_length": 319.59375, "completions/mean_terminated_length": 319.59375, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.03182348573247056, "frac_reward_zero_std": 0.0, "grad_norm": 1.625, "kl": 0.021747147664427757, "learning_rate": 7.8804e-06, "loss": 0.1173, "num_tokens": 13875791.0, "reward": 2.884753465652466, "reward_std": 0.29689720273017883, "rewards/reward_fn/mean": 2.884753465652466, "rewards/reward_fn/std": 0.2968972325325012, "step": 300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1372.0, "completions/mean_length": 842.90625, "completions/mean_terminated_length": 762.5667114257812, "completions/min_length": 411.0, "completions/min_terminated_length": 411.0, "epoch": 0.03192956401824547, "frac_reward_zero_std": 0.0, "grad_norm": 0.94140625, "kl": 0.011148195248097181, "learning_rate": 7.879999999999999e-06, "loss": 0.0603, "num_tokens": 13952556.0, "reward": 2.296818256378174, "reward_std": 0.7491377592086792, "rewards/reward_fn/mean": 2.296818256378174, "rewards/reward_fn/std": 0.7491377592086792, "step": 301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1247.0, "completions/max_terminated_length": 1247.0, "completions/mean_length": 287.40625, "completions/mean_terminated_length": 287.40625, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "epoch": 0.03203564230402037, "frac_reward_zero_std": 0.0, "grad_norm": 1.640625, "kl": 0.026387330144643784, "learning_rate": 7.8796e-06, "loss": -0.0635, "num_tokens": 14014201.0, "reward": 3.5167012214660645, "reward_std": 0.7784268856048584, "rewards/reward_fn/mean": 3.5167012214660645, "rewards/reward_fn/std": 0.7784268856048584, "step": 302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 453.0, "completions/max_terminated_length": 453.0, "completions/mean_length": 256.3125, "completions/mean_terminated_length": 256.3125, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "epoch": 0.03214172058979527, "frac_reward_zero_std": 0.0, "grad_norm": 1.78125, "kl": 0.01922440528869629, "learning_rate": 7.879199999999999e-06, "loss": 0.031, "num_tokens": 14063331.0, "reward": 3.7592110633850098, "reward_std": 0.5460023880004883, "rewards/reward_fn/mean": 3.7592110633850098, "rewards/reward_fn/std": 0.5460023880004883, "step": 303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 463.0, "completions/max_terminated_length": 463.0, "completions/mean_length": 286.625, "completions/mean_terminated_length": 286.625, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.03224779887557017, "frac_reward_zero_std": 1.0, "grad_norm": 0.1162109375, "kl": 0.022301261080428958, "learning_rate": 7.8788e-06, "loss": 0.0009, "num_tokens": 14111927.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1119.0, "completions/mean_length": 565.21875, "completions/mean_terminated_length": 517.3870849609375, "completions/min_length": 283.0, "completions/min_terminated_length": 283.0, "epoch": 0.03235387716134507, "frac_reward_zero_std": 0.0, "grad_norm": 1.125, "kl": 0.01948609808459878, "learning_rate": 7.878399999999999e-06, "loss": 0.2713, "num_tokens": 14167774.0, "reward": 2.894683837890625, "reward_std": 0.7454859614372253, "rewards/reward_fn/mean": 2.894683837890625, "rewards/reward_fn/std": 0.7454858422279358, "step": 305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 220.0, "completions/max_terminated_length": 220.0, "completions/mean_length": 125.625, "completions/mean_terminated_length": 125.625, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "epoch": 0.03245995544711997, "frac_reward_zero_std": 1.0, "grad_norm": 0.1962890625, "kl": 0.024616175913251936, "learning_rate": 7.878e-06, "loss": 0.001, "num_tokens": 14184178.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 589.0, "completions/max_terminated_length": 589.0, "completions/mean_length": 184.34375, "completions/mean_terminated_length": 184.34375, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 0.03256603373289488, "frac_reward_zero_std": 0.0, "grad_norm": 0.92578125, "kl": 0.028724384726956487, "learning_rate": 7.8776e-06, "loss": -0.0796, "num_tokens": 14227837.0, "reward": 3.9284615516662598, "reward_std": 0.40468308329582214, "rewards/reward_fn/mean": 3.9284615516662598, "rewards/reward_fn/std": 0.40468305349349976, "step": 307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 639.0, "completions/max_terminated_length": 639.0, "completions/mean_length": 285.34375, "completions/mean_terminated_length": 285.34375, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "epoch": 0.03267211201866978, "frac_reward_zero_std": 0.0, "grad_norm": 1.390625, "kl": 0.01536768360529095, "learning_rate": 7.8772e-06, "loss": 0.0255, "num_tokens": 14272968.0, "reward": 2.920250415802002, "reward_std": 0.02926819771528244, "rewards/reward_fn/mean": 2.920250415802002, "rewards/reward_fn/std": 0.02926819957792759, "step": 308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 585.0, "completions/max_terminated_length": 585.0, "completions/mean_length": 204.5, "completions/mean_terminated_length": 204.5, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "epoch": 0.03277819030444468, "frac_reward_zero_std": 0.0, "grad_norm": 3.1875, "kl": 0.025702545884996653, "learning_rate": 7.8768e-06, "loss": 0.1966, "num_tokens": 14313432.0, "reward": 3.9622316360473633, "reward_std": 0.2136494368314743, "rewards/reward_fn/mean": 3.9622316360473633, "rewards/reward_fn/std": 0.21364940702915192, "step": 309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1067.0, "completions/max_terminated_length": 1067.0, "completions/mean_length": 303.46875, "completions/mean_terminated_length": 303.46875, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "epoch": 0.03288426859021958, "frac_reward_zero_std": 0.0, "grad_norm": 1.7890625, "kl": 0.017463624943047762, "learning_rate": 7.8764e-06, "loss": -0.0187, "num_tokens": 14358759.0, "reward": 3.6965291500091553, "reward_std": 0.6346907615661621, "rewards/reward_fn/mean": 3.6965291500091553, "rewards/reward_fn/std": 0.6346907615661621, "step": 310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 826.0, "completions/max_terminated_length": 826.0, "completions/mean_length": 323.84375, "completions/mean_terminated_length": 323.84375, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.03299034687599448, "frac_reward_zero_std": 0.0, "grad_norm": 1.9765625, "kl": 0.02454505139030516, "learning_rate": 7.876e-06, "loss": -0.0422, "num_tokens": 14401026.0, "reward": 3.7419800758361816, "reward_std": 0.49619293212890625, "rewards/reward_fn/mean": 3.7419800758361816, "rewards/reward_fn/std": 0.49619296193122864, "step": 311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 127.0, "completions/max_terminated_length": 127.0, "completions/mean_length": 86.21875, "completions/mean_terminated_length": 86.21875, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "epoch": 0.03309642516176939, "frac_reward_zero_std": 1.0, "grad_norm": 0.140625, "kl": 0.02460471703670919, "learning_rate": 7.8756e-06, "loss": 0.001, "num_tokens": 14444553.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 582.0, "completions/max_terminated_length": 582.0, "completions/mean_length": 350.90625, "completions/mean_terminated_length": 350.90625, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "epoch": 0.03320250344754429, "frac_reward_zero_std": 0.0, "grad_norm": 1.484375, "kl": 0.016784140723757446, "learning_rate": 7.8752e-06, "loss": -0.0192, "num_tokens": 14516326.0, "reward": 2.9048171043395996, "reward_std": 0.38163191080093384, "rewards/reward_fn/mean": 2.9048171043395996, "rewards/reward_fn/std": 0.38163191080093384, "step": 313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 715.0, "completions/mean_length": 350.0625, "completions/mean_terminated_length": 295.2903137207031, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 0.03330858173331919, "frac_reward_zero_std": 0.0, "grad_norm": 1.75, "kl": 0.026702984934672713, "learning_rate": 7.8748e-06, "loss": 0.1718, "num_tokens": 14566088.0, "reward": 2.980964422225952, "reward_std": 1.1688123941421509, "rewards/reward_fn/mean": 2.980964422225952, "rewards/reward_fn/std": 1.1688123941421509, "step": 314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2030.0, "completions/max_terminated_length": 2030.0, "completions/mean_length": 474.21875, "completions/mean_terminated_length": 474.21875, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "epoch": 0.03341466001909409, "frac_reward_zero_std": 0.0, "grad_norm": 1.390625, "kl": 0.01943380292505026, "learning_rate": 7.874399999999999e-06, "loss": 0.081, "num_tokens": 14593871.0, "reward": 3.155921459197998, "reward_std": 0.9191026091575623, "rewards/reward_fn/mean": 3.155921459197998, "rewards/reward_fn/std": 0.9191026091575623, "step": 315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 819.0, "completions/max_terminated_length": 819.0, "completions/mean_length": 190.03125, "completions/mean_terminated_length": 190.03125, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "epoch": 0.03352073830486899, "frac_reward_zero_std": 0.0, "grad_norm": 2.6875, "kl": 0.024673061445355415, "learning_rate": 7.874e-06, "loss": 0.0506, "num_tokens": 14634384.0, "reward": 2.570242404937744, "reward_std": 1.0145124197006226, "rewards/reward_fn/mean": 2.570242404937744, "rewards/reward_fn/std": 1.014512300491333, "step": 316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1258.0, "completions/max_terminated_length": 1258.0, "completions/mean_length": 525.1875, "completions/mean_terminated_length": 525.1875, "completions/min_length": 303.0, "completions/min_terminated_length": 303.0, "epoch": 0.03362681659064389, "frac_reward_zero_std": 0.0, "grad_norm": 1.4609375, "kl": 0.02036218182183802, "learning_rate": 7.873599999999999e-06, "loss": -0.0318, "num_tokens": 14686390.0, "reward": 3.0451087951660156, "reward_std": 0.3707602620124817, "rewards/reward_fn/mean": 3.0451087951660156, "rewards/reward_fn/std": 0.3707602024078369, "step": 317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 902.0, "completions/mean_length": 466.53125, "completions/mean_terminated_length": 415.51611328125, "completions/min_length": 246.0, "completions/min_terminated_length": 246.0, "epoch": 0.0337328948764188, "frac_reward_zero_std": 0.0, "grad_norm": 1.46875, "kl": 0.014624205301515758, "learning_rate": 7.8732e-06, "loss": 0.1828, "num_tokens": 14732903.0, "reward": 3.0857720375061035, "reward_std": 0.7979248762130737, "rewards/reward_fn/mean": 3.0857720375061035, "rewards/reward_fn/std": 0.797924816608429, "step": 318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1788.0, "completions/max_terminated_length": 1788.0, "completions/mean_length": 344.84375, "completions/mean_terminated_length": 344.84375, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "epoch": 0.0338389731621937, "frac_reward_zero_std": 0.0, "grad_norm": 2.03125, "kl": 0.02286008303053677, "learning_rate": 7.8728e-06, "loss": 0.0713, "num_tokens": 14776674.0, "reward": 2.7754576206207275, "reward_std": 0.20874054729938507, "rewards/reward_fn/mean": 2.7754576206207275, "rewards/reward_fn/std": 0.20874051749706268, "step": 319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1612.0, "completions/max_terminated_length": 1612.0, "completions/mean_length": 347.28125, "completions/mean_terminated_length": 347.28125, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "epoch": 0.033945051447968604, "frac_reward_zero_std": 0.0, "grad_norm": 1.6796875, "kl": 0.01978490618057549, "learning_rate": 7.8724e-06, "loss": -0.0013, "num_tokens": 14825035.0, "reward": 2.7217154502868652, "reward_std": 0.0514136478304863, "rewards/reward_fn/mean": 2.7217154502868652, "rewards/reward_fn/std": 0.05141367390751839, "step": 320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1224.0, "completions/max_terminated_length": 1224.0, "completions/mean_length": 319.34375, "completions/mean_terminated_length": 319.34375, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 0.0340511297337435, "frac_reward_zero_std": 0.0, "grad_norm": 1.6796875, "kl": 0.01841491786763072, "learning_rate": 7.872e-06, "loss": 0.067, "num_tokens": 14884118.0, "reward": 3.8295905590057373, "reward_std": 0.5808995962142944, "rewards/reward_fn/mean": 3.8295905590057373, "rewards/reward_fn/std": 0.5808995962142944, "step": 321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1452.0, "completions/max_terminated_length": 1452.0, "completions/mean_length": 412.78125, "completions/mean_terminated_length": 412.78125, "completions/min_length": 260.0, "completions/min_terminated_length": 260.0, "epoch": 0.0341572080195184, "frac_reward_zero_std": 0.0, "grad_norm": 1.4375, "kl": 0.015279840677976608, "learning_rate": 7.8716e-06, "loss": 0.0439, "num_tokens": 14942063.0, "reward": 3.450852870941162, "reward_std": 0.6985296607017517, "rewards/reward_fn/mean": 3.450852870941162, "rewards/reward_fn/std": 0.6985296607017517, "step": 322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 275.0, "completions/max_terminated_length": 275.0, "completions/mean_length": 123.1875, "completions/mean_terminated_length": 123.1875, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "epoch": 0.03426328630529331, "frac_reward_zero_std": 0.0, "grad_norm": 2.6875, "kl": 0.01893604954238981, "learning_rate": 7.8712e-06, "loss": 0.0116, "num_tokens": 14967285.0, "reward": 3.701681613922119, "reward_std": 0.4854101240634918, "rewards/reward_fn/mean": 3.701681613922119, "rewards/reward_fn/std": 0.48541009426116943, "step": 323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 473.0, "completions/max_terminated_length": 473.0, "completions/mean_length": 294.5, "completions/mean_terminated_length": 294.5, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "epoch": 0.03436936459106821, "frac_reward_zero_std": 0.0, "grad_norm": 1.7421875, "kl": 0.02002789406105876, "learning_rate": 7.8708e-06, "loss": -0.0665, "num_tokens": 15018021.0, "reward": 3.452935218811035, "reward_std": 0.6924206018447876, "rewards/reward_fn/mean": 3.452935218811035, "rewards/reward_fn/std": 0.6924206018447876, "step": 324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1430.0, "completions/max_terminated_length": 1430.0, "completions/mean_length": 431.09375, "completions/mean_terminated_length": 431.09375, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "epoch": 0.03447544287684311, "frac_reward_zero_std": 0.0, "grad_norm": 1.71875, "kl": 0.022256616270169616, "learning_rate": 7.8704e-06, "loss": 0.141, "num_tokens": 15068488.0, "reward": 3.595818519592285, "reward_std": 0.6756666898727417, "rewards/reward_fn/mean": 3.595818519592285, "rewards/reward_fn/std": 0.6756666898727417, "step": 325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 886.0, "completions/max_terminated_length": 886.0, "completions/mean_length": 260.5625, "completions/mean_terminated_length": 260.5625, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "epoch": 0.03458152116261801, "frac_reward_zero_std": 0.0, "grad_norm": 1.6953125, "kl": 0.01728574268054217, "learning_rate": 7.87e-06, "loss": 0.0252, "num_tokens": 15110010.0, "reward": 3.9640228748321533, "reward_std": 0.2035173624753952, "rewards/reward_fn/mean": 3.9640228748321533, "rewards/reward_fn/std": 0.2035173922777176, "step": 326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 303.0, "completions/max_terminated_length": 303.0, "completions/mean_length": 201.53125, "completions/mean_terminated_length": 201.53125, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.03468759944839291, "frac_reward_zero_std": 0.0, "grad_norm": 1.953125, "kl": 0.019541570218279958, "learning_rate": 7.8696e-06, "loss": 0.0094, "num_tokens": 15161835.0, "reward": 2.861398220062256, "reward_std": 0.22285261750221252, "rewards/reward_fn/mean": 2.861398220062256, "rewards/reward_fn/std": 0.22285260260105133, "step": 327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 171.0, "completions/max_terminated_length": 171.0, "completions/mean_length": 106.59375, "completions/mean_terminated_length": 106.59375, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 0.03479367773416782, "frac_reward_zero_std": 0.0, "grad_norm": 3.25, "kl": 0.020829411456361413, "learning_rate": 7.869199999999999e-06, "loss": 0.0216, "num_tokens": 15199870.0, "reward": 3.7141122817993164, "reward_std": 0.46484532952308655, "rewards/reward_fn/mean": 3.7141122817993164, "rewards/reward_fn/std": 0.46484535932540894, "step": 328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 503.0, "completions/max_terminated_length": 503.0, "completions/mean_length": 348.71875, "completions/mean_terminated_length": 348.71875, "completions/min_length": 223.0, "completions/min_terminated_length": 223.0, "epoch": 0.03489975601994272, "frac_reward_zero_std": 0.0, "grad_norm": 1.5703125, "kl": 0.021267442498356104, "learning_rate": 7.8688e-06, "loss": 0.0477, "num_tokens": 15240917.0, "reward": 2.838244915008545, "reward_std": 0.0607428215444088, "rewards/reward_fn/mean": 2.838244915008545, "rewards/reward_fn/std": 0.060742802917957306, "step": 329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 897.0, "completions/max_terminated_length": 897.0, "completions/mean_length": 261.0, "completions/mean_terminated_length": 261.0, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 0.03500583430571762, "frac_reward_zero_std": 0.0, "grad_norm": 1.5625, "kl": 0.026442324509844184, "learning_rate": 7.8684e-06, "loss": -0.1112, "num_tokens": 15287893.0, "reward": 3.518838405609131, "reward_std": 0.8920819759368896, "rewards/reward_fn/mean": 3.518838405609131, "rewards/reward_fn/std": 0.8920818567276001, "step": 330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 325.0, "completions/max_terminated_length": 325.0, "completions/mean_length": 199.4375, "completions/mean_terminated_length": 199.4375, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 0.035111912591492524, "frac_reward_zero_std": 0.0, "grad_norm": 2.0625, "kl": 0.027465834515169263, "learning_rate": 7.868e-06, "loss": 0.0479, "num_tokens": 15326755.0, "reward": 3.804008722305298, "reward_std": 0.3705672025680542, "rewards/reward_fn/mean": 3.804008722305298, "rewards/reward_fn/std": 0.3705671727657318, "step": 331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1112.0, "completions/max_terminated_length": 1112.0, "completions/mean_length": 330.84375, "completions/mean_terminated_length": 330.84375, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.03521799087726742, "frac_reward_zero_std": 0.0, "grad_norm": 1.6171875, "kl": 0.0256270794197917, "learning_rate": 7.8676e-06, "loss": -0.0112, "num_tokens": 15370622.0, "reward": 2.7658400535583496, "reward_std": 0.19479042291641235, "rewards/reward_fn/mean": 2.7658400535583496, "rewards/reward_fn/std": 0.19479040801525116, "step": 332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1153.0, "completions/mean_length": 780.15625, "completions/mean_terminated_length": 599.0357666015625, "completions/min_length": 246.0, "completions/min_terminated_length": 246.0, "epoch": 0.03532406916304232, "frac_reward_zero_std": 0.0, "grad_norm": 1.3203125, "kl": 0.01505154138430953, "learning_rate": 7.8672e-06, "loss": 0.3167, "num_tokens": 15439171.0, "reward": 2.6815433502197266, "reward_std": 1.2791556119918823, "rewards/reward_fn/mean": 2.6815433502197266, "rewards/reward_fn/std": 1.2791556119918823, "step": 333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1081.0, "completions/max_terminated_length": 1081.0, "completions/mean_length": 340.4375, "completions/mean_terminated_length": 340.4375, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 0.03543014744881723, "frac_reward_zero_std": 0.0, "grad_norm": 1.84375, "kl": 0.022124167764559388, "learning_rate": 7.866799999999999e-06, "loss": -0.0204, "num_tokens": 15488241.0, "reward": 3.7875170707702637, "reward_std": 0.6110662221908569, "rewards/reward_fn/mean": 3.7875170707702637, "rewards/reward_fn/std": 0.6110662221908569, "step": 334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 370.0, "completions/max_terminated_length": 370.0, "completions/mean_length": 251.84375, "completions/mean_terminated_length": 251.84375, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "epoch": 0.03553622573459213, "frac_reward_zero_std": 0.0, "grad_norm": 1.6875, "kl": 0.023350659990683198, "learning_rate": 7.8664e-06, "loss": 0.013, "num_tokens": 15539180.0, "reward": 2.956249475479126, "reward_std": 0.4766891896724701, "rewards/reward_fn/mean": 2.956249475479126, "rewards/reward_fn/std": 0.4766892194747925, "step": 335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1524.0, "completions/mean_length": 694.25, "completions/mean_terminated_length": 650.5806274414062, "completions/min_length": 379.0, "completions/min_terminated_length": 379.0, "epoch": 0.035642304020367034, "frac_reward_zero_std": 0.0, "grad_norm": 1.34375, "kl": 0.02155219833366573, "learning_rate": 7.865999999999999e-06, "loss": 0.1593, "num_tokens": 15606964.0, "reward": 2.4331278800964355, "reward_std": 0.6054124236106873, "rewards/reward_fn/mean": 2.4331278800964355, "rewards/reward_fn/std": 0.6054123640060425, "step": 336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1832.0, "completions/max_terminated_length": 1832.0, "completions/mean_length": 321.46875, "completions/mean_terminated_length": 321.46875, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 0.03574838230614193, "frac_reward_zero_std": 0.0, "grad_norm": 2.1875, "kl": 0.03199986438266933, "learning_rate": 7.8656e-06, "loss": 0.0461, "num_tokens": 15647139.0, "reward": 2.767442226409912, "reward_std": 0.20596739649772644, "rewards/reward_fn/mean": 2.767442226409912, "rewards/reward_fn/std": 0.20596742630004883, "step": 337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 195.0, "completions/max_terminated_length": 195.0, "completions/mean_length": 119.90625, "completions/mean_terminated_length": 119.90625, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "epoch": 0.03585446059191683, "frac_reward_zero_std": 0.0, "grad_norm": 2.921875, "kl": 0.03643982787616551, "learning_rate": 7.865199999999999e-06, "loss": -0.0177, "num_tokens": 15696160.0, "reward": 3.380324602127075, "reward_std": 0.5965169072151184, "rewards/reward_fn/mean": 3.380324602127075, "rewards/reward_fn/std": 0.5965169072151184, "step": 338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 903.0, "completions/max_terminated_length": 903.0, "completions/mean_length": 206.71875, "completions/mean_terminated_length": 206.71875, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 0.03596053887769174, "frac_reward_zero_std": 1.0, "grad_norm": 0.16015625, "kl": 0.030227781971916556, "learning_rate": 7.8648e-06, "loss": 0.0012, "num_tokens": 15742903.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 495.0, "completions/max_terminated_length": 495.0, "completions/mean_length": 330.0625, "completions/mean_terminated_length": 330.0625, "completions/min_length": 243.0, "completions/min_terminated_length": 243.0, "epoch": 0.03606661716346664, "frac_reward_zero_std": 0.0, "grad_norm": 1.3359375, "kl": 0.023755632108077407, "learning_rate": 7.864399999999999e-06, "loss": 0.0689, "num_tokens": 15790329.0, "reward": 2.75354266166687, "reward_std": 0.03267281502485275, "rewards/reward_fn/mean": 2.75354266166687, "rewards/reward_fn/std": 0.03267282247543335, "step": 340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1126.0, "completions/max_terminated_length": 1126.0, "completions/mean_length": 263.46875, "completions/mean_terminated_length": 263.46875, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 0.03617269544924154, "frac_reward_zero_std": 0.0, "grad_norm": 1.4765625, "kl": 0.0320469920989126, "learning_rate": 7.864e-06, "loss": -0.1167, "num_tokens": 15817384.0, "reward": 3.7375216484069824, "reward_std": 0.8701768517494202, "rewards/reward_fn/mean": 3.7375216484069824, "rewards/reward_fn/std": 0.8701767921447754, "step": 341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 252.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 158.28125, "completions/mean_terminated_length": 158.28125, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.036278773735016444, "frac_reward_zero_std": 0.0, "grad_norm": 2.625, "kl": 0.03848015540279448, "learning_rate": 7.8636e-06, "loss": 0.0212, "num_tokens": 15860625.0, "reward": 3.2080063819885254, "reward_std": 0.7717536091804504, "rewards/reward_fn/mean": 3.2080063819885254, "rewards/reward_fn/std": 0.7717535495758057, "step": 342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2037.0, "completions/mean_length": 620.875, "completions/mean_terminated_length": 525.7333374023438, "completions/min_length": 299.0, "completions/min_terminated_length": 299.0, "epoch": 0.03638485202079134, "frac_reward_zero_std": 0.0, "grad_norm": 1.1484375, "kl": 0.022421202156692743, "learning_rate": 7.8632e-06, "loss": 0.1028, "num_tokens": 15921357.0, "reward": 2.6614723205566406, "reward_std": 0.526772677898407, "rewards/reward_fn/mean": 2.6614723205566406, "rewards/reward_fn/std": 0.5267727375030518, "step": 343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 271.0, "completions/max_terminated_length": 271.0, "completions/mean_length": 184.40625, "completions/mean_terminated_length": 184.40625, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "epoch": 0.03649093030656624, "frac_reward_zero_std": 0.0, "grad_norm": 1.9140625, "kl": 0.02825386798940599, "learning_rate": 7.8628e-06, "loss": 0.0271, "num_tokens": 15961850.0, "reward": 3.163677215576172, "reward_std": 0.5745911598205566, "rewards/reward_fn/mean": 3.163677215576172, "rewards/reward_fn/std": 0.5745911002159119, "step": 344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 473.0, "completions/max_terminated_length": 473.0, "completions/mean_length": 263.0625, "completions/mean_terminated_length": 263.0625, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 0.03659700859234115, "frac_reward_zero_std": 0.0, "grad_norm": 2.1875, "kl": 0.03263953677378595, "learning_rate": 7.8624e-06, "loss": 0.0094, "num_tokens": 15981884.0, "reward": 3.4133460521698, "reward_std": 0.9804157614707947, "rewards/reward_fn/mean": 3.4133460521698, "rewards/reward_fn/std": 0.9804157018661499, "step": 345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1789.0, "completions/max_terminated_length": 1789.0, "completions/mean_length": 376.6875, "completions/mean_terminated_length": 376.6875, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "epoch": 0.03670308687811605, "frac_reward_zero_std": 0.0, "grad_norm": 1.4375, "kl": 0.028175218030810356, "learning_rate": 7.862e-06, "loss": -0.0734, "num_tokens": 16046098.0, "reward": 3.028233051300049, "reward_std": 0.3557929992675781, "rewards/reward_fn/mean": 3.028233051300049, "rewards/reward_fn/std": 0.3557929992675781, "step": 346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 435.0, "completions/max_terminated_length": 435.0, "completions/mean_length": 228.1875, "completions/mean_terminated_length": 228.1875, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.036809165163890954, "frac_reward_zero_std": 0.0, "grad_norm": 2.1875, "kl": 0.03550481074489653, "learning_rate": 7.8616e-06, "loss": 0.0374, "num_tokens": 16086616.0, "reward": 3.3797616958618164, "reward_std": 0.6313052177429199, "rewards/reward_fn/mean": 3.3797616958618164, "rewards/reward_fn/std": 0.6313052177429199, "step": 347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 611.0, "completions/max_terminated_length": 611.0, "completions/mean_length": 189.875, "completions/mean_terminated_length": 189.875, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 0.036915243449665854, "frac_reward_zero_std": 1.0, "grad_norm": 0.1435546875, "kl": 0.027385680470615625, "learning_rate": 7.8612e-06, "loss": 0.0011, "num_tokens": 16125396.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 392.0, "completions/max_terminated_length": 392.0, "completions/mean_length": 235.5, "completions/mean_terminated_length": 235.5, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 0.03702132173544075, "frac_reward_zero_std": 0.0, "grad_norm": 2.109375, "kl": 0.04070192761719227, "learning_rate": 7.8608e-06, "loss": 0.02, "num_tokens": 16168260.0, "reward": 3.54952335357666, "reward_std": 0.7186921834945679, "rewards/reward_fn/mean": 3.54952335357666, "rewards/reward_fn/std": 0.7186923027038574, "step": 349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1830.0, "completions/max_terminated_length": 1830.0, "completions/mean_length": 363.90625, "completions/mean_terminated_length": 363.90625, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 0.03712740002121566, "frac_reward_zero_std": 0.0, "grad_norm": 1.7265625, "kl": 0.026658386690542102, "learning_rate": 7.8604e-06, "loss": -0.0186, "num_tokens": 16211073.0, "reward": 3.8557639122009277, "reward_std": 0.5675714015960693, "rewards/reward_fn/mean": 3.8557639122009277, "rewards/reward_fn/std": 0.5675714015960693, "step": 350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1025.0, "completions/max_terminated_length": 1025.0, "completions/mean_length": 291.09375, "completions/mean_terminated_length": 291.09375, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.03723347830699056, "frac_reward_zero_std": 0.0, "grad_norm": 1.5703125, "kl": 0.028336241375654936, "learning_rate": 7.86e-06, "loss": -0.0066, "num_tokens": 16266500.0, "reward": 2.8183717727661133, "reward_std": 0.9820890426635742, "rewards/reward_fn/mean": 2.8183717727661133, "rewards/reward_fn/std": 0.9820890426635742, "step": 351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 432.0, "completions/max_terminated_length": 432.0, "completions/mean_length": 270.4375, "completions/mean_terminated_length": 270.4375, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 0.03733955659276546, "frac_reward_zero_std": 0.0, "grad_norm": 1.6484375, "kl": 0.02873558783903718, "learning_rate": 7.8596e-06, "loss": -0.0038, "num_tokens": 16329586.0, "reward": 2.79421329498291, "reward_std": 0.18948742747306824, "rewards/reward_fn/mean": 2.79421329498291, "rewards/reward_fn/std": 0.18948739767074585, "step": 352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1763.0, "completions/mean_length": 680.0625, "completions/mean_terminated_length": 588.86669921875, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "epoch": 0.037445634878540364, "frac_reward_zero_std": 0.0, "grad_norm": 1.5859375, "kl": 0.02555919042788446, "learning_rate": 7.8592e-06, "loss": 0.2532, "num_tokens": 16373236.0, "reward": 2.7877817153930664, "reward_std": 0.9033706784248352, "rewards/reward_fn/mean": 2.7877817153930664, "rewards/reward_fn/std": 0.9033706188201904, "step": 353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 659.0, "completions/mean_length": 382.59375, "completions/mean_terminated_length": 328.8709716796875, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 0.03755171316431526, "frac_reward_zero_std": 0.0, "grad_norm": 1.2578125, "kl": 0.024913710309192538, "learning_rate": 7.8588e-06, "loss": 0.1477, "num_tokens": 16427431.0, "reward": 2.9459004402160645, "reward_std": 1.0123151540756226, "rewards/reward_fn/mean": 2.9459004402160645, "rewards/reward_fn/std": 1.0123151540756226, "step": 354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 236.0, "completions/max_terminated_length": 236.0, "completions/mean_length": 121.125, "completions/mean_terminated_length": 121.125, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "epoch": 0.03765779145009017, "frac_reward_zero_std": 1.0, "grad_norm": 0.142578125, "kl": 0.022564243176020682, "learning_rate": 7.8584e-06, "loss": 0.0009, "num_tokens": 16467051.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 629.0, "completions/max_terminated_length": 629.0, "completions/mean_length": 232.84375, "completions/mean_terminated_length": 232.84375, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "epoch": 0.03776386973586507, "frac_reward_zero_std": 0.0, "grad_norm": 2.015625, "kl": 0.032447136007249355, "learning_rate": 7.858e-06, "loss": -0.007, "num_tokens": 16514694.0, "reward": 2.8739380836486816, "reward_std": 0.4234299659729004, "rewards/reward_fn/mean": 2.8739380836486816, "rewards/reward_fn/std": 0.423429936170578, "step": 356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 276.0, "completions/max_terminated_length": 276.0, "completions/mean_length": 184.78125, "completions/mean_terminated_length": 184.78125, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.03786994802163997, "frac_reward_zero_std": 0.0, "grad_norm": 2.015625, "kl": 0.03548223176039755, "learning_rate": 7.857599999999999e-06, "loss": 0.0056, "num_tokens": 16551295.0, "reward": 3.7814149856567383, "reward_std": 0.5462931990623474, "rewards/reward_fn/mean": 3.7814149856567383, "rewards/reward_fn/std": 0.5462931990623474, "step": 357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 622.0, "completions/max_terminated_length": 622.0, "completions/mean_length": 199.0, "completions/mean_terminated_length": 199.0, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.037976026307414874, "frac_reward_zero_std": 1.0, "grad_norm": 0.1240234375, "kl": 0.02927296655252576, "learning_rate": 7.8572e-06, "loss": 0.0012, "num_tokens": 16601823.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1658.0, "completions/mean_length": 681.0, "completions/mean_terminated_length": 636.9031982421875, "completions/min_length": 393.0, "completions/min_terminated_length": 393.0, "epoch": 0.038082104593189774, "frac_reward_zero_std": 0.0, "grad_norm": 1.359375, "kl": 0.02082135993987322, "learning_rate": 7.856799999999999e-06, "loss": 0.054, "num_tokens": 16649087.0, "reward": 2.0146098136901855, "reward_std": 0.59377121925354, "rewards/reward_fn/mean": 2.0146098136901855, "rewards/reward_fn/std": 0.59377121925354, "step": 359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 185.0, "completions/max_terminated_length": 185.0, "completions/mean_length": 115.71875, "completions/mean_terminated_length": 115.71875, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "epoch": 0.03818818287896467, "frac_reward_zero_std": 1.0, "grad_norm": 0.11474609375, "kl": 0.02015709993429482, "learning_rate": 7.8564e-06, "loss": 0.0008, "num_tokens": 16679478.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 764.0, "completions/max_terminated_length": 764.0, "completions/mean_length": 360.28125, "completions/mean_terminated_length": 360.28125, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 0.03829426116473958, "frac_reward_zero_std": 0.0, "grad_norm": 1.9609375, "kl": 0.030813375022262335, "learning_rate": 7.855999999999999e-06, "loss": -0.0245, "num_tokens": 16727167.0, "reward": 2.8578453063964844, "reward_std": 0.0672156810760498, "rewards/reward_fn/mean": 2.8578453063964844, "rewards/reward_fn/std": 0.0672157034277916, "step": 361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 310.0, "completions/max_terminated_length": 310.0, "completions/mean_length": 116.78125, "completions/mean_terminated_length": 116.78125, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "epoch": 0.03840033945051448, "frac_reward_zero_std": 1.0, "grad_norm": 0.201171875, "kl": 0.03886803472414613, "learning_rate": 7.8556e-06, "loss": 0.0016, "num_tokens": 16768696.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 199.0, "completions/max_terminated_length": 199.0, "completions/mean_length": 97.59375, "completions/mean_terminated_length": 97.59375, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "epoch": 0.038506417736289385, "frac_reward_zero_std": 1.0, "grad_norm": 0.1728515625, "kl": 0.02677657501772046, "learning_rate": 7.855199999999999e-06, "loss": 0.0011, "num_tokens": 16807979.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1010.0, "completions/max_terminated_length": 1010.0, "completions/mean_length": 323.0625, "completions/mean_terminated_length": 323.0625, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "epoch": 0.038612496022064284, "frac_reward_zero_std": 0.0, "grad_norm": 1.59375, "kl": 0.025971536757424474, "learning_rate": 7.8548e-06, "loss": 0.0311, "num_tokens": 16855277.0, "reward": 3.0506677627563477, "reward_std": 0.511308491230011, "rewards/reward_fn/mean": 3.0506677627563477, "rewards/reward_fn/std": 0.511308491230011, "step": 364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 850.0, "completions/max_terminated_length": 850.0, "completions/mean_length": 440.84375, "completions/mean_terminated_length": 440.84375, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "epoch": 0.03871857430783918, "frac_reward_zero_std": 0.0, "grad_norm": 1.5, "kl": 0.028090456500649452, "learning_rate": 7.854399999999999e-06, "loss": 0.0004, "num_tokens": 16930856.0, "reward": 1.8972702026367188, "reward_std": 0.502529501914978, "rewards/reward_fn/mean": 1.8972702026367188, "rewards/reward_fn/std": 0.502529501914978, "step": 365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 176.0, "completions/max_terminated_length": 176.0, "completions/mean_length": 120.0, "completions/mean_terminated_length": 120.0, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "epoch": 0.03882465259361409, "frac_reward_zero_std": 1.0, "grad_norm": 0.087890625, "kl": 0.017430690582841635, "learning_rate": 7.854e-06, "loss": 0.0007, "num_tokens": 16972104.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 442.0, "completions/max_terminated_length": 442.0, "completions/mean_length": 264.5, "completions/mean_terminated_length": 264.5, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.03893073087938899, "frac_reward_zero_std": 0.0, "grad_norm": 1.6484375, "kl": 0.02535721193999052, "learning_rate": 7.8536e-06, "loss": 0.1011, "num_tokens": 17014200.0, "reward": 3.448211193084717, "reward_std": 0.4737915098667145, "rewards/reward_fn/mean": 3.448211193084717, "rewards/reward_fn/std": 0.4737914502620697, "step": 367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 394.0, "completions/max_terminated_length": 394.0, "completions/mean_length": 229.84375, "completions/mean_terminated_length": 229.84375, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 0.03903680916516389, "frac_reward_zero_std": 0.0, "grad_norm": 2.25, "kl": 0.028732048347592354, "learning_rate": 7.8532e-06, "loss": 0.0519, "num_tokens": 17042675.0, "reward": 3.8204574584960938, "reward_std": 0.5905638337135315, "rewards/reward_fn/mean": 3.8204574584960938, "rewards/reward_fn/std": 0.5905638337135315, "step": 368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 259.0, "completions/max_terminated_length": 259.0, "completions/mean_length": 144.21875, "completions/mean_terminated_length": 144.21875, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.039142887450938794, "frac_reward_zero_std": 1.0, "grad_norm": 0.12255859375, "kl": 0.02795306663028896, "learning_rate": 7.8528e-06, "loss": 0.0011, "num_tokens": 17072858.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 614.0, "completions/max_terminated_length": 614.0, "completions/mean_length": 291.0, "completions/mean_terminated_length": 291.0, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 0.039248965736713694, "frac_reward_zero_std": 0.0, "grad_norm": 1.7421875, "kl": 0.024855082854628563, "learning_rate": 7.8524e-06, "loss": 0.039, "num_tokens": 17111994.0, "reward": 3.0236971378326416, "reward_std": 0.18900470435619354, "rewards/reward_fn/mean": 3.0236971378326416, "rewards/reward_fn/std": 0.18900467455387115, "step": 370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 420.0, "completions/max_terminated_length": 420.0, "completions/mean_length": 169.6875, "completions/mean_terminated_length": 169.6875, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "epoch": 0.0393550440224886, "frac_reward_zero_std": 0.0, "grad_norm": 2.40625, "kl": 0.036026960471645, "learning_rate": 7.852e-06, "loss": -0.0048, "num_tokens": 17161744.0, "reward": 3.9573440551757812, "reward_std": 0.24129843711853027, "rewards/reward_fn/mean": 3.9573440551757812, "rewards/reward_fn/std": 0.24129842221736908, "step": 371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1136.0, "completions/max_terminated_length": 1136.0, "completions/mean_length": 207.90625, "completions/mean_terminated_length": 207.90625, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "epoch": 0.0394611223082635, "frac_reward_zero_std": 1.0, "grad_norm": 0.10107421875, "kl": 0.026199826737865806, "learning_rate": 7.8516e-06, "loss": 0.001, "num_tokens": 17191821.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 252.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 195.40625, "completions/mean_terminated_length": 195.40625, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 0.0395672005940384, "frac_reward_zero_std": 1.0, "grad_norm": 0.1630859375, "kl": 0.03569826763123274, "learning_rate": 7.8512e-06, "loss": 0.0014, "num_tokens": 17239802.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 906.0, "completions/mean_length": 500.3125, "completions/mean_terminated_length": 450.3870849609375, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "epoch": 0.039673278879813305, "frac_reward_zero_std": 0.0, "grad_norm": 1.71875, "kl": 0.024202445056289434, "learning_rate": 7.8508e-06, "loss": 0.0908, "num_tokens": 17302948.0, "reward": 2.246391773223877, "reward_std": 0.7711318135261536, "rewards/reward_fn/mean": 2.246391773223877, "rewards/reward_fn/std": 0.7711318135261536, "step": 374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 679.0, "completions/max_terminated_length": 679.0, "completions/mean_length": 335.375, "completions/mean_terminated_length": 335.375, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.039779357165588204, "frac_reward_zero_std": 0.0, "grad_norm": 1.625, "kl": 0.0259898176882416, "learning_rate": 7.850399999999999e-06, "loss": 0.0233, "num_tokens": 17358384.0, "reward": 2.9620676040649414, "reward_std": 0.07208557426929474, "rewards/reward_fn/mean": 2.9620676040649414, "rewards/reward_fn/std": 0.07208552956581116, "step": 375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 884.0, "completions/max_terminated_length": 884.0, "completions/mean_length": 231.09375, "completions/mean_terminated_length": 231.09375, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.039885435451363103, "frac_reward_zero_std": 0.0, "grad_norm": 1.9765625, "kl": 0.030499441782012582, "learning_rate": 7.85e-06, "loss": -0.0047, "num_tokens": 17396947.0, "reward": 3.423821449279785, "reward_std": 0.5857948660850525, "rewards/reward_fn/mean": 3.423821449279785, "rewards/reward_fn/std": 0.5857948660850525, "step": 376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1796.0, "completions/mean_length": 451.96875, "completions/mean_terminated_length": 400.4838562011719, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "epoch": 0.03999151373713801, "frac_reward_zero_std": 0.0, "grad_norm": 1.9140625, "kl": 0.022297584218904376, "learning_rate": 7.849599999999999e-06, "loss": 0.2615, "num_tokens": 17442866.0, "reward": 2.704645872116089, "reward_std": 0.6090136170387268, "rewards/reward_fn/mean": 2.704645872116089, "rewards/reward_fn/std": 0.6090136170387268, "step": 377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 577.0, "completions/max_terminated_length": 577.0, "completions/mean_length": 374.875, "completions/mean_terminated_length": 374.875, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "epoch": 0.04009759202291291, "frac_reward_zero_std": 0.0, "grad_norm": 1.65625, "kl": 0.021644736174494028, "learning_rate": 7.8492e-06, "loss": -0.0128, "num_tokens": 17487022.0, "reward": 2.907583713531494, "reward_std": 0.46556609869003296, "rewards/reward_fn/mean": 2.907583713531494, "rewards/reward_fn/std": 0.46556606888771057, "step": 378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1103.0, "completions/max_terminated_length": 1103.0, "completions/mean_length": 316.59375, "completions/mean_terminated_length": 316.59375, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 0.04020367030868781, "frac_reward_zero_std": 0.0, "grad_norm": 1.9765625, "kl": 0.02532161376439035, "learning_rate": 7.8488e-06, "loss": -0.0199, "num_tokens": 17530977.0, "reward": 3.670456886291504, "reward_std": 0.41091057658195496, "rewards/reward_fn/mean": 3.670456886291504, "rewards/reward_fn/std": 0.41091054677963257, "step": 379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1597.0, "completions/max_terminated_length": 1597.0, "completions/mean_length": 468.9375, "completions/mean_terminated_length": 468.9375, "completions/min_length": 232.0, "completions/min_terminated_length": 232.0, "epoch": 0.040309748594462715, "frac_reward_zero_std": 0.0, "grad_norm": 1.5625, "kl": 0.018143147695809603, "learning_rate": 7.8484e-06, "loss": 0.1946, "num_tokens": 17587871.0, "reward": 2.6948769092559814, "reward_std": 0.26830726861953735, "rewards/reward_fn/mean": 2.6948769092559814, "rewards/reward_fn/std": 0.26830726861953735, "step": 380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 669.0, "completions/mean_length": 357.5, "completions/mean_terminated_length": 302.9677429199219, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.040415826880237614, "frac_reward_zero_std": 0.0, "grad_norm": 1.71875, "kl": 0.023040967527776957, "learning_rate": 7.848e-06, "loss": 0.2107, "num_tokens": 17636175.0, "reward": 3.7304601669311523, "reward_std": 0.7834086418151855, "rewards/reward_fn/mean": 3.7304601669311523, "rewards/reward_fn/std": 0.7834085822105408, "step": 381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 613.0, "completions/max_terminated_length": 613.0, "completions/mean_length": 195.875, "completions/mean_terminated_length": 195.875, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "epoch": 0.04052190516601252, "frac_reward_zero_std": 1.0, "grad_norm": 0.10693359375, "kl": 0.02284376136958599, "learning_rate": 7.8476e-06, "loss": 0.0009, "num_tokens": 17657579.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 118.0, "completions/max_terminated_length": 118.0, "completions/mean_length": 89.5, "completions/mean_terminated_length": 89.5, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "epoch": 0.04062798345178742, "frac_reward_zero_std": 1.0, "grad_norm": 0.2890625, "kl": 0.03317818860523403, "learning_rate": 7.8472e-06, "loss": 0.0013, "num_tokens": 17685723.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1824.0, "completions/mean_length": 1247.1875, "completions/mean_terminated_length": 1221.3548583984375, "completions/min_length": 678.0, "completions/min_terminated_length": 678.0, "epoch": 0.04073406173756232, "frac_reward_zero_std": 0.0, "grad_norm": 0.80078125, "kl": 0.013792349374853075, "learning_rate": 7.846799999999999e-06, "loss": -0.0208, "num_tokens": 17768065.0, "reward": 2.3582417964935303, "reward_std": 0.49515044689178467, "rewards/reward_fn/mean": 2.3582417964935303, "rewards/reward_fn/std": 0.4951504170894623, "step": 384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 563.0, "completions/max_terminated_length": 563.0, "completions/mean_length": 341.53125, "completions/mean_terminated_length": 341.53125, "completions/min_length": 258.0, "completions/min_terminated_length": 258.0, "epoch": 0.040840140023337225, "frac_reward_zero_std": 0.0, "grad_norm": 1.359375, "kl": 0.019186724559403956, "learning_rate": 7.8464e-06, "loss": 0.0339, "num_tokens": 17814194.0, "reward": 2.7873990535736084, "reward_std": 0.02758314460515976, "rewards/reward_fn/mean": 2.7873990535736084, "rewards/reward_fn/std": 0.027583174407482147, "step": 385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 778.0, "completions/max_terminated_length": 778.0, "completions/mean_length": 384.78125, "completions/mean_terminated_length": 384.78125, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "epoch": 0.040946218309112124, "frac_reward_zero_std": 0.0, "grad_norm": 1.6640625, "kl": 0.024335808353498578, "learning_rate": 7.845999999999999e-06, "loss": -0.0277, "num_tokens": 17859755.0, "reward": 2.8218817710876465, "reward_std": 0.051523782312870026, "rewards/reward_fn/mean": 2.8218817710876465, "rewards/reward_fn/std": 0.05152379348874092, "step": 386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1863.0, "completions/mean_length": 541.5625, "completions/mean_terminated_length": 492.96771240234375, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "epoch": 0.041052296594887024, "frac_reward_zero_std": 0.0, "grad_norm": 1.4296875, "kl": 0.02720357710495591, "learning_rate": 7.8456e-06, "loss": 0.041, "num_tokens": 17909629.0, "reward": 2.3109424114227295, "reward_std": 0.6412019729614258, "rewards/reward_fn/mean": 2.3109424114227295, "rewards/reward_fn/std": 0.641201913356781, "step": 387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 636.0, "completions/max_terminated_length": 636.0, "completions/mean_length": 220.625, "completions/mean_terminated_length": 220.625, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "epoch": 0.04115837488066193, "frac_reward_zero_std": 0.0, "grad_norm": 2.828125, "kl": 0.022277627140283585, "learning_rate": 7.845199999999999e-06, "loss": 0.2392, "num_tokens": 17937329.0, "reward": 3.7471237182617188, "reward_std": 0.4859867990016937, "rewards/reward_fn/mean": 3.7471237182617188, "rewards/reward_fn/std": 0.4859868288040161, "step": 388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 743.0, "completions/max_terminated_length": 743.0, "completions/mean_length": 181.625, "completions/mean_terminated_length": 181.625, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "epoch": 0.04126445316643683, "frac_reward_zero_std": 1.0, "grad_norm": 0.1337890625, "kl": 0.033335258485749364, "learning_rate": 7.8448e-06, "loss": 0.0013, "num_tokens": 17977157.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 997.0, "completions/max_terminated_length": 997.0, "completions/mean_length": 270.53125, "completions/mean_terminated_length": 270.53125, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 0.041370531452211735, "frac_reward_zero_std": 1.0, "grad_norm": 0.09716796875, "kl": 0.024321889970451593, "learning_rate": 7.8444e-06, "loss": 0.001, "num_tokens": 18014102.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 390 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 409.0, "completions/max_terminated_length": 409.0, "completions/mean_length": 241.03125, "completions/mean_terminated_length": 241.03125, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 0.041476609737986635, "frac_reward_zero_std": 0.0, "grad_norm": 2.265625, "kl": 0.02815451007336378, "learning_rate": 7.844e-06, "loss": 0.0235, "num_tokens": 18058551.0, "reward": 3.104393482208252, "reward_std": 0.4822303354740143, "rewards/reward_fn/mean": 3.104393482208252, "rewards/reward_fn/std": 0.48223036527633667, "step": 391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 521.0, "completions/max_terminated_length": 521.0, "completions/mean_length": 297.96875, "completions/mean_terminated_length": 297.96875, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "epoch": 0.041582688023761534, "frac_reward_zero_std": 0.0, "grad_norm": 1.6328125, "kl": 0.020035157212987542, "learning_rate": 7.8436e-06, "loss": -0.0126, "num_tokens": 18098454.0, "reward": 3.4708333015441895, "reward_std": 0.5728017091751099, "rewards/reward_fn/mean": 3.4708333015441895, "rewards/reward_fn/std": 0.5728016495704651, "step": 392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 893.0, "completions/max_terminated_length": 893.0, "completions/mean_length": 300.21875, "completions/mean_terminated_length": 300.21875, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "epoch": 0.04168876630953644, "frac_reward_zero_std": 0.0, "grad_norm": 2.25, "kl": 0.02791671548038721, "learning_rate": 7.8432e-06, "loss": 0.2627, "num_tokens": 18152669.0, "reward": 3.1782641410827637, "reward_std": 1.0244643688201904, "rewards/reward_fn/mean": 3.1782641410827637, "rewards/reward_fn/std": 1.0244643688201904, "step": 393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 882.0, "completions/max_terminated_length": 882.0, "completions/mean_length": 455.0, "completions/mean_terminated_length": 455.0, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "epoch": 0.04179484459531134, "frac_reward_zero_std": 0.0, "grad_norm": 1.0859375, "kl": 0.02029509423300624, "learning_rate": 7.8428e-06, "loss": -0.0168, "num_tokens": 18200637.0, "reward": 2.588132381439209, "reward_std": 0.3573741018772125, "rewards/reward_fn/mean": 2.588132381439209, "rewards/reward_fn/std": 0.3573741316795349, "step": 394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 536.0, "completions/max_terminated_length": 536.0, "completions/mean_length": 179.90625, "completions/mean_terminated_length": 179.90625, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "epoch": 0.04190092288108624, "frac_reward_zero_std": 0.0, "grad_norm": 1.84375, "kl": 0.05105016054585576, "learning_rate": 7.8424e-06, "loss": 0.0476, "num_tokens": 18236570.0, "reward": 2.7826709747314453, "reward_std": 0.04271591454744339, "rewards/reward_fn/mean": 2.7826709747314453, "rewards/reward_fn/std": 0.04271586239337921, "step": 395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 385.0, "completions/max_terminated_length": 385.0, "completions/mean_length": 266.65625, "completions/mean_terminated_length": 266.65625, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "epoch": 0.042007001166861145, "frac_reward_zero_std": 0.0, "grad_norm": 1.8984375, "kl": 0.023414030205458403, "learning_rate": 7.841999999999999e-06, "loss": 0.0183, "num_tokens": 18276303.0, "reward": 2.849137544631958, "reward_std": 0.30862998962402344, "rewards/reward_fn/mean": 2.849137544631958, "rewards/reward_fn/std": 0.30862998962402344, "step": 396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 905.0, "completions/max_terminated_length": 905.0, "completions/mean_length": 131.1875, "completions/mean_terminated_length": 131.1875, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "epoch": 0.042113079452636044, "frac_reward_zero_std": 0.0, "grad_norm": 2.546875, "kl": 0.029712033225223422, "learning_rate": 7.8416e-06, "loss": -0.0091, "num_tokens": 18314901.0, "reward": 3.9370594024658203, "reward_std": 0.2479006052017212, "rewards/reward_fn/mean": 3.9370594024658203, "rewards/reward_fn/std": 0.24790059030056, "step": 397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1483.0, "completions/mean_length": 583.15625, "completions/mean_terminated_length": 535.9031982421875, "completions/min_length": 246.0, "completions/min_terminated_length": 246.0, "epoch": 0.04221915773841095, "frac_reward_zero_std": 0.0, "grad_norm": 2.03125, "kl": 0.024179237661883235, "learning_rate": 7.841199999999999e-06, "loss": 0.2692, "num_tokens": 18366682.0, "reward": 2.734818935394287, "reward_std": 0.5017510056495667, "rewards/reward_fn/mean": 2.734818935394287, "rewards/reward_fn/std": 0.5017510056495667, "step": 398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1412.0, "completions/max_terminated_length": 1412.0, "completions/mean_length": 448.9375, "completions/mean_terminated_length": 448.9375, "completions/min_length": 225.0, "completions/min_terminated_length": 225.0, "epoch": 0.04232523602418585, "frac_reward_zero_std": 0.0, "grad_norm": 1.2578125, "kl": 0.021006060764193535, "learning_rate": 7.8408e-06, "loss": -0.0214, "num_tokens": 18432152.0, "reward": 3.894382953643799, "reward_std": 0.33419135212898254, "rewards/reward_fn/mean": 3.894382953643799, "rewards/reward_fn/std": 0.33419132232666016, "step": 399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1599.0, "completions/max_terminated_length": 1599.0, "completions/mean_length": 788.9375, "completions/mean_terminated_length": 788.9375, "completions/min_length": 386.0, "completions/min_terminated_length": 386.0, "epoch": 0.04243131430996075, "frac_reward_zero_std": 0.0, "grad_norm": 1.0546875, "kl": 0.01675853121560067, "learning_rate": 7.840399999999999e-06, "loss": -0.0367, "num_tokens": 18497974.0, "reward": 2.990764856338501, "reward_std": 0.4567594826221466, "rewards/reward_fn/mean": 2.990764856338501, "rewards/reward_fn/std": 0.4567594528198242, "step": 400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 224.0, "completions/max_terminated_length": 224.0, "completions/mean_length": 191.0625, "completions/mean_terminated_length": 191.0625, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 0.042537392595735656, "frac_reward_zero_std": 1.0, "grad_norm": 0.119140625, "kl": 0.03011680906638503, "learning_rate": 7.84e-06, "loss": 0.0012, "num_tokens": 18556440.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 401 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1049.0, "completions/max_terminated_length": 1049.0, "completions/mean_length": 357.9375, "completions/mean_terminated_length": 357.9375, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "epoch": 0.042643470881510555, "frac_reward_zero_std": 0.0, "grad_norm": 1.4296875, "kl": 0.021929903188720345, "learning_rate": 7.8396e-06, "loss": 0.0267, "num_tokens": 18602006.0, "reward": 3.1588587760925293, "reward_std": 0.5363696217536926, "rewards/reward_fn/mean": 3.1588587760925293, "rewards/reward_fn/std": 0.5363695621490479, "step": 402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1226.0, "completions/max_terminated_length": 1226.0, "completions/mean_length": 370.78125, "completions/mean_terminated_length": 370.78125, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 0.042749549167285454, "frac_reward_zero_std": 0.0, "grad_norm": 1.46875, "kl": 0.023676603566855192, "learning_rate": 7.8392e-06, "loss": -0.0388, "num_tokens": 18650863.0, "reward": 2.7991724014282227, "reward_std": 0.27458456158638, "rewards/reward_fn/mean": 2.7991724014282227, "rewards/reward_fn/std": 0.2745845317840576, "step": 403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1534.0, "completions/max_terminated_length": 1534.0, "completions/mean_length": 664.1875, "completions/mean_terminated_length": 664.1875, "completions/min_length": 242.0, "completions/min_terminated_length": 242.0, "epoch": 0.04285562745306036, "frac_reward_zero_std": 0.0, "grad_norm": 1.0390625, "kl": 0.017545952810905874, "learning_rate": 7.8388e-06, "loss": 0.0957, "num_tokens": 18707061.0, "reward": 2.512760877609253, "reward_std": 0.5735504031181335, "rewards/reward_fn/mean": 2.512760877609253, "rewards/reward_fn/std": 0.5735504031181335, "step": 404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 572.0, "completions/max_terminated_length": 572.0, "completions/mean_length": 189.5625, "completions/mean_terminated_length": 189.5625, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 0.04296170573883526, "frac_reward_zero_std": 1.0, "grad_norm": 0.130859375, "kl": 0.030075914692133665, "learning_rate": 7.8384e-06, "loss": 0.0012, "num_tokens": 18759783.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 924.0, "completions/max_terminated_length": 924.0, "completions/mean_length": 341.96875, "completions/mean_terminated_length": 341.96875, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "epoch": 0.04306778402461016, "frac_reward_zero_std": 0.0, "grad_norm": 1.484375, "kl": 0.02385372808203101, "learning_rate": 7.838e-06, "loss": 0.0794, "num_tokens": 18786086.0, "reward": 3.8906993865966797, "reward_std": 0.34537699818611145, "rewards/reward_fn/mean": 3.8906993865966797, "rewards/reward_fn/std": 0.34537696838378906, "step": 406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 436.0, "completions/max_terminated_length": 436.0, "completions/mean_length": 275.5, "completions/mean_terminated_length": 275.5, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "epoch": 0.043173862310385065, "frac_reward_zero_std": 0.0, "grad_norm": 1.6484375, "kl": 0.03006741451099515, "learning_rate": 7.8376e-06, "loss": -0.0234, "num_tokens": 18831286.0, "reward": 3.787572145462036, "reward_std": 0.4494977593421936, "rewards/reward_fn/mean": 3.787572145462036, "rewards/reward_fn/std": 0.4494977295398712, "step": 407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1561.0, "completions/mean_length": 529.125, "completions/mean_terminated_length": 480.1290283203125, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "epoch": 0.043279940596159965, "frac_reward_zero_std": 0.0, "grad_norm": 1.4140625, "kl": 0.0166288634063676, "learning_rate": 7.8372e-06, "loss": 0.2106, "num_tokens": 18929114.0, "reward": 3.8349366188049316, "reward_std": 0.735542893409729, "rewards/reward_fn/mean": 3.8349366188049316, "rewards/reward_fn/std": 0.7355428338050842, "step": 408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 198.0, "completions/max_terminated_length": 198.0, "completions/mean_length": 156.15625, "completions/mean_terminated_length": 156.15625, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 0.04338601888193487, "frac_reward_zero_std": 1.0, "grad_norm": 0.10986328125, "kl": 0.024419703288003802, "learning_rate": 7.8368e-06, "loss": 0.001, "num_tokens": 18952959.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 359.0, "completions/max_terminated_length": 359.0, "completions/mean_length": 218.375, "completions/mean_terminated_length": 218.375, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "epoch": 0.04349209716770977, "frac_reward_zero_std": 0.0, "grad_norm": 1.984375, "kl": 0.033032523933798075, "learning_rate": 7.8364e-06, "loss": -0.0199, "num_tokens": 18995531.0, "reward": 3.660304069519043, "reward_std": 0.5844630002975464, "rewards/reward_fn/mean": 3.660304069519043, "rewards/reward_fn/std": 0.5844630002975464, "step": 410 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 661.0, "completions/max_terminated_length": 661.0, "completions/mean_length": 290.40625, "completions/mean_terminated_length": 290.40625, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 0.04359817545348467, "frac_reward_zero_std": 0.0, "grad_norm": 1.515625, "kl": 0.028384562116116285, "learning_rate": 7.836e-06, "loss": 0.0034, "num_tokens": 19039832.0, "reward": 3.966031551361084, "reward_std": 0.19215430319309235, "rewards/reward_fn/mean": 3.966031551361084, "rewards/reward_fn/std": 0.19215430319309235, "step": 411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 109.0, "completions/max_terminated_length": 109.0, "completions/mean_length": 73.25, "completions/mean_terminated_length": 73.25, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "epoch": 0.043704253739259576, "frac_reward_zero_std": 1.0, "grad_norm": 0.201171875, "kl": 0.024869739543646574, "learning_rate": 7.8356e-06, "loss": 0.001, "num_tokens": 19073664.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1838.0, "completions/mean_length": 566.1875, "completions/mean_terminated_length": 518.3870849609375, "completions/min_length": 267.0, "completions/min_terminated_length": 267.0, "epoch": 0.043810332025034475, "frac_reward_zero_std": 0.0, "grad_norm": 1.796875, "kl": 0.029686052352190018, "learning_rate": 7.8352e-06, "loss": 0.2334, "num_tokens": 19127142.0, "reward": 2.6168479919433594, "reward_std": 0.543907880783081, "rewards/reward_fn/mean": 2.6168479919433594, "rewards/reward_fn/std": 0.5439079403877258, "step": 413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 380.0, "completions/max_terminated_length": 380.0, "completions/mean_length": 143.03125, "completions/mean_terminated_length": 143.03125, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.043916410310809374, "frac_reward_zero_std": 1.0, "grad_norm": 0.1494140625, "kl": 0.03072983492165804, "learning_rate": 7.8348e-06, "loss": 0.0012, "num_tokens": 19166471.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 347.0, "completions/max_terminated_length": 347.0, "completions/mean_length": 170.53125, "completions/mean_terminated_length": 170.53125, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 0.04402248859658428, "frac_reward_zero_std": 1.0, "grad_norm": 0.130859375, "kl": 0.025355301331728697, "learning_rate": 7.834399999999999e-06, "loss": 0.001, "num_tokens": 19204504.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 384.0, "completions/max_terminated_length": 384.0, "completions/mean_length": 242.65625, "completions/mean_terminated_length": 242.65625, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 0.04412856688235918, "frac_reward_zero_std": 0.0, "grad_norm": 1.9375, "kl": 0.024939125403761864, "learning_rate": 7.834e-06, "loss": 0.0628, "num_tokens": 19252557.0, "reward": 2.96942138671875, "reward_std": 0.08691102266311646, "rewards/reward_fn/mean": 2.96942138671875, "rewards/reward_fn/std": 0.08691102266311646, "step": 416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 466.0, "completions/max_terminated_length": 466.0, "completions/mean_length": 344.25, "completions/mean_terminated_length": 344.25, "completions/min_length": 260.0, "completions/min_terminated_length": 260.0, "epoch": 0.044234645168134086, "frac_reward_zero_std": 0.0, "grad_norm": 1.4609375, "kl": 0.02124662697315216, "learning_rate": 7.833599999999999e-06, "loss": -0.0249, "num_tokens": 19302485.0, "reward": 1.736589789390564, "reward_std": 0.022552739828824997, "rewards/reward_fn/mean": 1.736589789390564, "rewards/reward_fn/std": 0.022552751004695892, "step": 417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 456.0, "completions/max_terminated_length": 456.0, "completions/mean_length": 296.3125, "completions/mean_terminated_length": 296.3125, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "epoch": 0.044340723453908985, "frac_reward_zero_std": 0.0, "grad_norm": 1.6796875, "kl": 0.025449416134506464, "learning_rate": 7.8332e-06, "loss": 0.0491, "num_tokens": 19347167.0, "reward": 3.7305917739868164, "reward_std": 0.5530329942703247, "rewards/reward_fn/mean": 3.7305917739868164, "rewards/reward_fn/std": 0.5530329942703247, "step": 418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 821.0, "completions/max_terminated_length": 821.0, "completions/mean_length": 293.28125, "completions/mean_terminated_length": 293.28125, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "epoch": 0.044446801739683885, "frac_reward_zero_std": 1.0, "grad_norm": 0.06591796875, "kl": 0.018630424048751593, "learning_rate": 7.832799999999999e-06, "loss": 0.0007, "num_tokens": 19389608.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 850.0, "completions/max_terminated_length": 850.0, "completions/mean_length": 239.65625, "completions/mean_terminated_length": 239.65625, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 0.04455288002545879, "frac_reward_zero_std": 1.0, "grad_norm": 0.09130859375, "kl": 0.02400025725364685, "learning_rate": 7.8324e-06, "loss": 0.001, "num_tokens": 19432349.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 223.0, "completions/max_terminated_length": 223.0, "completions/mean_length": 171.9375, "completions/mean_terminated_length": 171.9375, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 0.04465895831123369, "frac_reward_zero_std": 1.0, "grad_norm": 0.1923828125, "kl": 0.04056114191189408, "learning_rate": 7.831999999999999e-06, "loss": 0.0016, "num_tokens": 19462107.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 757.0, "completions/max_terminated_length": 757.0, "completions/mean_length": 238.71875, "completions/mean_terminated_length": 238.71875, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 0.04476503659700859, "frac_reward_zero_std": 0.0, "grad_norm": 2.203125, "kl": 0.02605710015632212, "learning_rate": 7.8316e-06, "loss": 0.1752, "num_tokens": 19518066.0, "reward": 3.8169188499450684, "reward_std": 0.602931559085846, "rewards/reward_fn/mean": 3.8169188499450684, "rewards/reward_fn/std": 0.602931559085846, "step": 422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 661.0, "completions/max_terminated_length": 661.0, "completions/mean_length": 184.21875, "completions/mean_terminated_length": 184.21875, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 0.044871114882783496, "frac_reward_zero_std": 1.0, "grad_norm": 0.099609375, "kl": 0.02194814942777157, "learning_rate": 7.831199999999999e-06, "loss": 0.0009, "num_tokens": 19562905.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 419.0, "completions/max_terminated_length": 419.0, "completions/mean_length": 260.3125, "completions/mean_terminated_length": 260.3125, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "epoch": 0.044977193168558395, "frac_reward_zero_std": 0.0, "grad_norm": 1.71875, "kl": 0.02712744101881981, "learning_rate": 7.8308e-06, "loss": -0.0232, "num_tokens": 19615107.0, "reward": 3.930159091949463, "reward_std": 0.39508044719696045, "rewards/reward_fn/mean": 3.930159091949463, "rewards/reward_fn/std": 0.39508041739463806, "step": 424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1212.0, "completions/mean_length": 549.90625, "completions/mean_terminated_length": 501.58062744140625, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "epoch": 0.0450832714543333, "frac_reward_zero_std": 0.0, "grad_norm": 1.46875, "kl": 0.021885179448872805, "learning_rate": 7.8304e-06, "loss": 0.1124, "num_tokens": 19667232.0, "reward": 3.4282619953155518, "reward_std": 0.8545926809310913, "rewards/reward_fn/mean": 3.4282619953155518, "rewards/reward_fn/std": 0.8545926213264465, "step": 425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 216.0, "completions/max_terminated_length": 216.0, "completions/mean_length": 119.40625, "completions/mean_terminated_length": 119.40625, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "epoch": 0.0451893497401082, "frac_reward_zero_std": 1.0, "grad_norm": 0.0986328125, "kl": 0.01785171974916011, "learning_rate": 7.83e-06, "loss": 0.0007, "num_tokens": 19715853.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1158.0, "completions/max_terminated_length": 1158.0, "completions/mean_length": 403.90625, "completions/mean_terminated_length": 403.90625, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 0.0452954280258831, "frac_reward_zero_std": 0.0, "grad_norm": 1.828125, "kl": 0.022983923787251115, "learning_rate": 7.8296e-06, "loss": -0.0171, "num_tokens": 19759690.0, "reward": 2.5151357650756836, "reward_std": 0.5577123761177063, "rewards/reward_fn/mean": 2.5151357650756836, "rewards/reward_fn/std": 0.5577123165130615, "step": 427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 303.0, "completions/max_terminated_length": 303.0, "completions/mean_length": 175.34375, "completions/mean_terminated_length": 175.34375, "completions/min_length": 17.0, "completions/min_terminated_length": 17.0, "epoch": 0.045401506311658006, "frac_reward_zero_std": 0.0, "grad_norm": 1.3515625, "kl": 0.029623025562614202, "learning_rate": 7.8292e-06, "loss": -0.1286, "num_tokens": 19800661.0, "reward": 3.7741332054138184, "reward_std": 0.7420908808708191, "rewards/reward_fn/mean": 3.7741332054138184, "rewards/reward_fn/std": 0.7420908808708191, "step": 428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 977.0, "completions/max_terminated_length": 977.0, "completions/mean_length": 285.15625, "completions/mean_terminated_length": 285.15625, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.045507584597432905, "frac_reward_zero_std": 1.0, "grad_norm": 0.07958984375, "kl": 0.02174567524343729, "learning_rate": 7.8288e-06, "loss": 0.0009, "num_tokens": 19856442.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1446.0, "completions/max_terminated_length": 1446.0, "completions/mean_length": 676.75, "completions/mean_terminated_length": 676.75, "completions/min_length": 320.0, "completions/min_terminated_length": 320.0, "epoch": 0.045613662883207805, "frac_reward_zero_std": 0.0, "grad_norm": 1.3125, "kl": 0.019659652840346098, "learning_rate": 7.8284e-06, "loss": 0.0379, "num_tokens": 19922322.0, "reward": 2.3661680221557617, "reward_std": 0.5572351217269897, "rewards/reward_fn/mean": 2.3661680221557617, "rewards/reward_fn/std": 0.5572351813316345, "step": 430 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 690.0, "completions/max_terminated_length": 690.0, "completions/mean_length": 362.625, "completions/mean_terminated_length": 362.625, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "epoch": 0.04571974116898271, "frac_reward_zero_std": 0.0, "grad_norm": 1.6015625, "kl": 0.025779376970604062, "learning_rate": 7.828e-06, "loss": -0.0389, "num_tokens": 19969542.0, "reward": 2.871279239654541, "reward_std": 1.0416733026504517, "rewards/reward_fn/mean": 2.871279239654541, "rewards/reward_fn/std": 1.0416733026504517, "step": 431 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 887.0, "completions/max_terminated_length": 887.0, "completions/mean_length": 303.84375, "completions/mean_terminated_length": 303.84375, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "epoch": 0.04582581945475761, "frac_reward_zero_std": 0.0, "grad_norm": 2.203125, "kl": 0.019519688561558723, "learning_rate": 7.8276e-06, "loss": 0.0645, "num_tokens": 19999745.0, "reward": 3.9620537757873535, "reward_std": 0.21465659141540527, "rewards/reward_fn/mean": 3.9620537757873535, "rewards/reward_fn/std": 0.21465659141540527, "step": 432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1192.0, "completions/max_terminated_length": 1192.0, "completions/mean_length": 399.125, "completions/mean_terminated_length": 399.125, "completions/min_length": 231.0, "completions/min_terminated_length": 231.0, "epoch": 0.04593189774053251, "frac_reward_zero_std": 0.0, "grad_norm": 1.4453125, "kl": 0.018953290185891092, "learning_rate": 7.8272e-06, "loss": 0.0483, "num_tokens": 20061317.0, "reward": 2.9752299785614014, "reward_std": 0.07937107980251312, "rewards/reward_fn/mean": 2.9752299785614014, "rewards/reward_fn/std": 0.07937107235193253, "step": 433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 207.0, "completions/max_terminated_length": 207.0, "completions/mean_length": 119.875, "completions/mean_terminated_length": 119.875, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "epoch": 0.046037976026307416, "frac_reward_zero_std": 1.0, "grad_norm": 0.1259765625, "kl": 0.027189034270122647, "learning_rate": 7.8268e-06, "loss": 0.0011, "num_tokens": 20088545.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 497.0, "completions/max_terminated_length": 497.0, "completions/mean_length": 295.03125, "completions/mean_terminated_length": 295.03125, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 0.046144054312082315, "frac_reward_zero_std": 0.0, "grad_norm": 1.84375, "kl": 0.026012770365923643, "learning_rate": 7.826399999999998e-06, "loss": 0.0249, "num_tokens": 20128834.0, "reward": 2.9719934463500977, "reward_std": 0.24174726009368896, "rewards/reward_fn/mean": 2.9719934463500977, "rewards/reward_fn/std": 0.24174723029136658, "step": 435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 799.0, "completions/max_terminated_length": 799.0, "completions/mean_length": 432.125, "completions/mean_terminated_length": 432.125, "completions/min_length": 232.0, "completions/min_terminated_length": 232.0, "epoch": 0.04625013259785722, "frac_reward_zero_std": 0.0, "grad_norm": 1.3515625, "kl": 0.01361775363329798, "learning_rate": 7.826e-06, "loss": -0.0077, "num_tokens": 20186982.0, "reward": 2.8467493057250977, "reward_std": 0.28382164239883423, "rewards/reward_fn/mean": 2.8467493057250977, "rewards/reward_fn/std": 0.28382164239883423, "step": 436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 492.0, "completions/max_terminated_length": 492.0, "completions/mean_length": 257.0625, "completions/mean_terminated_length": 257.0625, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 0.04635621088363212, "frac_reward_zero_std": 0.0, "grad_norm": 1.6953125, "kl": 0.023967791348695755, "learning_rate": 7.8256e-06, "loss": 0.0345, "num_tokens": 20246248.0, "reward": 3.7507588863372803, "reward_std": 0.47892749309539795, "rewards/reward_fn/mean": 3.7507588863372803, "rewards/reward_fn/std": 0.47892752289772034, "step": 437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 415.0, "completions/max_terminated_length": 415.0, "completions/mean_length": 285.9375, "completions/mean_terminated_length": 285.9375, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "epoch": 0.04646228916940702, "frac_reward_zero_std": 0.0, "grad_norm": 1.75, "kl": 0.02458371128886938, "learning_rate": 7.8252e-06, "loss": -0.0299, "num_tokens": 20291078.0, "reward": 3.627525806427002, "reward_std": 0.4593088626861572, "rewards/reward_fn/mean": 3.627525806427002, "rewards/reward_fn/std": 0.45930883288383484, "step": 438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 656.0, "completions/max_terminated_length": 656.0, "completions/mean_length": 258.09375, "completions/mean_terminated_length": 258.09375, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 0.046568367455181926, "frac_reward_zero_std": 0.0, "grad_norm": 1.140625, "kl": 0.01905307243578136, "learning_rate": 7.8248e-06, "loss": -0.039, "num_tokens": 20332777.0, "reward": 3.9657504558563232, "reward_std": 0.19374500215053558, "rewards/reward_fn/mean": 3.9657504558563232, "rewards/reward_fn/std": 0.19374501705169678, "step": 439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 720.0, "completions/max_terminated_length": 720.0, "completions/mean_length": 180.625, "completions/mean_terminated_length": 180.625, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.046674445740956826, "frac_reward_zero_std": 0.0, "grad_norm": 2.15625, "kl": 0.021813688217662275, "learning_rate": 7.824399999999999e-06, "loss": -0.0704, "num_tokens": 20378941.0, "reward": 3.5305161476135254, "reward_std": 0.44998371601104736, "rewards/reward_fn/mean": 3.5305161476135254, "rewards/reward_fn/std": 0.449983686208725, "step": 440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 721.0, "completions/max_terminated_length": 721.0, "completions/mean_length": 221.03125, "completions/mean_terminated_length": 221.03125, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 0.046780524026731725, "frac_reward_zero_std": 0.0, "grad_norm": 2.828125, "kl": 0.027017208514735103, "learning_rate": 7.824e-06, "loss": 0.0379, "num_tokens": 20422366.0, "reward": 3.9637999534606934, "reward_std": 0.2047785222530365, "rewards/reward_fn/mean": 3.9637999534606934, "rewards/reward_fn/std": 0.2047785222530365, "step": 441 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 709.0, "completions/max_terminated_length": 709.0, "completions/mean_length": 175.125, "completions/mean_terminated_length": 175.125, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.04688660231250663, "frac_reward_zero_std": 1.0, "grad_norm": 0.1318359375, "kl": 0.025995554169639945, "learning_rate": 7.823599999999999e-06, "loss": 0.001, "num_tokens": 20464226.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 694.0, "completions/max_terminated_length": 694.0, "completions/mean_length": 353.53125, "completions/mean_terminated_length": 353.53125, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "epoch": 0.04699268059828153, "frac_reward_zero_std": 0.0, "grad_norm": 1.4765625, "kl": 0.023292170371860266, "learning_rate": 7.8232e-06, "loss": 0.0398, "num_tokens": 20510323.0, "reward": 3.5015358924865723, "reward_std": 0.5767775177955627, "rewards/reward_fn/mean": 3.5015358924865723, "rewards/reward_fn/std": 0.5767775177955627, "step": 443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 786.0, "completions/max_terminated_length": 786.0, "completions/mean_length": 313.78125, "completions/mean_terminated_length": 313.78125, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "epoch": 0.04709875888405644, "frac_reward_zero_std": 1.0, "grad_norm": 0.0654296875, "kl": 0.019186518737114966, "learning_rate": 7.822799999999999e-06, "loss": 0.0008, "num_tokens": 20552812.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 492.0, "completions/max_terminated_length": 492.0, "completions/mean_length": 226.75, "completions/mean_terminated_length": 226.75, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 0.047204837169831336, "frac_reward_zero_std": 1.0, "grad_norm": 0.13671875, "kl": 0.0306410426273942, "learning_rate": 7.8224e-06, "loss": 0.0012, "num_tokens": 20592644.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 705.0, "completions/max_terminated_length": 705.0, "completions/mean_length": 241.59375, "completions/mean_terminated_length": 241.59375, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 0.047310915455606235, "frac_reward_zero_std": 0.0, "grad_norm": 1.875, "kl": 0.02532052854076028, "learning_rate": 7.821999999999999e-06, "loss": 0.1193, "num_tokens": 20617431.0, "reward": 2.9998433589935303, "reward_std": 0.0696294903755188, "rewards/reward_fn/mean": 2.9998433589935303, "rewards/reward_fn/std": 0.0696294978260994, "step": 446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 715.0, "completions/mean_length": 453.90625, "completions/mean_terminated_length": 402.4838562011719, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.04741699374138114, "frac_reward_zero_std": 0.0, "grad_norm": 1.3828125, "kl": 0.016588653321377933, "learning_rate": 7.8216e-06, "loss": 0.1984, "num_tokens": 20653140.0, "reward": 2.85459041595459, "reward_std": 0.6958485841751099, "rewards/reward_fn/mean": 2.85459041595459, "rewards/reward_fn/std": 0.6958485841751099, "step": 447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 459.0, "completions/max_terminated_length": 459.0, "completions/mean_length": 184.4375, "completions/mean_terminated_length": 184.4375, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 0.04752307202715604, "frac_reward_zero_std": 1.0, "grad_norm": 0.09130859375, "kl": 0.019108422100543976, "learning_rate": 7.8212e-06, "loss": 0.0008, "num_tokens": 20698146.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 993.0, "completions/max_terminated_length": 993.0, "completions/mean_length": 247.53125, "completions/mean_terminated_length": 247.53125, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 0.04762915031293094, "frac_reward_zero_std": 1.0, "grad_norm": 0.07666015625, "kl": 0.02202951116487384, "learning_rate": 7.8208e-06, "loss": 0.0009, "num_tokens": 20758035.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 449 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 570.0, "completions/max_terminated_length": 570.0, "completions/mean_length": 123.84375, "completions/mean_terminated_length": 123.84375, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "epoch": 0.047735228598705846, "frac_reward_zero_std": 1.0, "grad_norm": 0.16796875, "kl": 0.03342678747139871, "learning_rate": 7.8204e-06, "loss": 0.0013, "num_tokens": 20794574.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 452.0, "completions/max_terminated_length": 452.0, "completions/mean_length": 123.25, "completions/mean_terminated_length": 123.25, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "epoch": 0.047841306884480746, "frac_reward_zero_std": 1.0, "grad_norm": 0.111328125, "kl": 0.02307695336639881, "learning_rate": 7.82e-06, "loss": 0.0009, "num_tokens": 20839446.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 451 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 628.0, "completions/max_terminated_length": 628.0, "completions/mean_length": 356.28125, "completions/mean_terminated_length": 356.28125, "completions/min_length": 276.0, "completions/min_terminated_length": 276.0, "epoch": 0.04794738517025565, "frac_reward_zero_std": 0.0, "grad_norm": 1.4921875, "kl": 0.021392826223745942, "learning_rate": 7.8196e-06, "loss": 0.0044, "num_tokens": 20888191.0, "reward": 3.5172977447509766, "reward_std": 0.7753545045852661, "rewards/reward_fn/mean": 3.5172977447509766, "rewards/reward_fn/std": 0.7753545045852661, "step": 452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 298.0, "completions/max_terminated_length": 298.0, "completions/mean_length": 220.09375, "completions/mean_terminated_length": 220.09375, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 0.04805346345603055, "frac_reward_zero_std": 0.0, "grad_norm": 1.65625, "kl": 0.029516591923311353, "learning_rate": 7.8192e-06, "loss": 0.0179, "num_tokens": 20912642.0, "reward": 3.9728949069976807, "reward_std": 0.15332958102226257, "rewards/reward_fn/mean": 3.9728949069976807, "rewards/reward_fn/std": 0.15332959592342377, "step": 453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 824.0, "completions/max_terminated_length": 824.0, "completions/mean_length": 210.84375, "completions/mean_terminated_length": 210.84375, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 0.04815954174180545, "frac_reward_zero_std": 0.0, "grad_norm": 2.71875, "kl": 0.031340898014605045, "learning_rate": 7.8188e-06, "loss": 0.2922, "num_tokens": 20958333.0, "reward": 3.886120557785034, "reward_std": 0.4653671979904175, "rewards/reward_fn/mean": 3.886120557785034, "rewards/reward_fn/std": 0.4653671979904175, "step": 454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 364.0, "completions/max_terminated_length": 364.0, "completions/mean_length": 188.5625, "completions/mean_terminated_length": 188.5625, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.04826562002758036, "frac_reward_zero_std": 0.0, "grad_norm": 2.28125, "kl": 0.02241483051329851, "learning_rate": 7.8184e-06, "loss": -0.0742, "num_tokens": 20995407.0, "reward": 2.9604923725128174, "reward_std": 0.46088868379592896, "rewards/reward_fn/mean": 2.9604923725128174, "rewards/reward_fn/std": 0.46088865399360657, "step": 455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 372.0, "completions/max_terminated_length": 372.0, "completions/mean_length": 172.625, "completions/mean_terminated_length": 172.625, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "epoch": 0.048371698313355256, "frac_reward_zero_std": 0.0, "grad_norm": 2.4375, "kl": 0.021063596475869417, "learning_rate": 7.817999999999999e-06, "loss": 0.0673, "num_tokens": 21048995.0, "reward": 3.8552751541137695, "reward_std": 0.3901585340499878, "rewards/reward_fn/mean": 3.8552751541137695, "rewards/reward_fn/std": 0.3901585042476654, "step": 456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 443.0, "completions/max_terminated_length": 443.0, "completions/mean_length": 152.8125, "completions/mean_terminated_length": 152.8125, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "epoch": 0.048477776599130155, "frac_reward_zero_std": 0.0, "grad_norm": 2.453125, "kl": 0.02626768359914422, "learning_rate": 7.8176e-06, "loss": 0.0393, "num_tokens": 21088477.0, "reward": 2.950690746307373, "reward_std": 0.06558680534362793, "rewards/reward_fn/mean": 2.950690746307373, "rewards/reward_fn/std": 0.06558679044246674, "step": 457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1326.0, "completions/mean_length": 729.9375, "completions/mean_terminated_length": 642.0667114257812, "completions/min_length": 314.0, "completions/min_terminated_length": 314.0, "epoch": 0.04858385488490506, "frac_reward_zero_std": 0.0, "grad_norm": 1.53125, "kl": 0.022620241856202483, "learning_rate": 7.817199999999999e-06, "loss": 0.3226, "num_tokens": 21145979.0, "reward": 2.522402286529541, "reward_std": 0.7907276749610901, "rewards/reward_fn/mean": 2.522402286529541, "rewards/reward_fn/std": 0.7907276749610901, "step": 458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 777.0, "completions/max_terminated_length": 777.0, "completions/mean_length": 242.125, "completions/mean_terminated_length": 242.125, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 0.04868993317067996, "frac_reward_zero_std": 0.0, "grad_norm": 1.7578125, "kl": 0.029362128349021077, "learning_rate": 7.8168e-06, "loss": -0.0061, "num_tokens": 21197503.0, "reward": 3.963871479034424, "reward_std": 0.20437325537204742, "rewards/reward_fn/mean": 3.963871479034424, "rewards/reward_fn/std": 0.20437327027320862, "step": 459 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1025.0, "completions/max_terminated_length": 1025.0, "completions/mean_length": 252.65625, "completions/mean_terminated_length": 252.65625, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 0.04879601145645486, "frac_reward_zero_std": 1.0, "grad_norm": 0.0947265625, "kl": 0.025767018785700202, "learning_rate": 7.8164e-06, "loss": 0.001, "num_tokens": 21244596.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1352.0, "completions/max_terminated_length": 1352.0, "completions/mean_length": 299.65625, "completions/mean_terminated_length": 299.65625, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 0.048902089742229767, "frac_reward_zero_std": 1.0, "grad_norm": 0.08544921875, "kl": 0.024106395663693547, "learning_rate": 7.816e-06, "loss": 0.001, "num_tokens": 21296137.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 461 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 813.0, "completions/max_terminated_length": 813.0, "completions/mean_length": 304.3125, "completions/mean_terminated_length": 304.3125, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.049008168028004666, "frac_reward_zero_std": 0.0, "grad_norm": 1.6640625, "kl": 0.026658632792532444, "learning_rate": 7.8156e-06, "loss": -0.0012, "num_tokens": 21329907.0, "reward": 2.8445465564727783, "reward_std": 0.2993001639842987, "rewards/reward_fn/mean": 2.8445465564727783, "rewards/reward_fn/std": 0.2993001937866211, "step": 462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 864.0, "completions/mean_length": 521.0, "completions/mean_terminated_length": 471.7419128417969, "completions/min_length": 273.0, "completions/min_terminated_length": 273.0, "epoch": 0.04911424631377957, "frac_reward_zero_std": 0.0, "grad_norm": 1.671875, "kl": 0.02180269779637456, "learning_rate": 7.8152e-06, "loss": 0.2696, "num_tokens": 21368947.0, "reward": 2.6146130561828613, "reward_std": 0.5501555800437927, "rewards/reward_fn/mean": 2.6146130561828613, "rewards/reward_fn/std": 0.550155520439148, "step": 463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1207.0, "completions/max_terminated_length": 1207.0, "completions/mean_length": 392.78125, "completions/mean_terminated_length": 392.78125, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "epoch": 0.04922032459955447, "frac_reward_zero_std": 0.0, "grad_norm": 1.453125, "kl": 0.024015987291932106, "learning_rate": 7.8148e-06, "loss": 0.086, "num_tokens": 21417260.0, "reward": 2.8569464683532715, "reward_std": 0.05871182680130005, "rewards/reward_fn/mean": 2.8569464683532715, "rewards/reward_fn/std": 0.058711789548397064, "step": 464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1265.0, "completions/mean_length": 618.625, "completions/mean_terminated_length": 523.3333740234375, "completions/min_length": 346.0, "completions/min_terminated_length": 346.0, "epoch": 0.04932640288532937, "frac_reward_zero_std": 0.0, "grad_norm": 1.375, "kl": 0.01614295574836433, "learning_rate": 7.8144e-06, "loss": 0.3621, "num_tokens": 21479552.0, "reward": 3.6563806533813477, "reward_std": 0.9594557881355286, "rewards/reward_fn/mean": 3.6563806533813477, "rewards/reward_fn/std": 0.9594557881355286, "step": 465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 483.0, "completions/max_terminated_length": 483.0, "completions/mean_length": 319.65625, "completions/mean_terminated_length": 319.65625, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "epoch": 0.04943248117110428, "frac_reward_zero_std": 0.0, "grad_norm": 1.453125, "kl": 0.01879534707404673, "learning_rate": 7.814e-06, "loss": 0.0444, "num_tokens": 21522613.0, "reward": 1.6788173913955688, "reward_std": 0.026051480323076248, "rewards/reward_fn/mean": 1.6788173913955688, "rewards/reward_fn/std": 0.026051471009850502, "step": 466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 420.0, "completions/max_terminated_length": 420.0, "completions/mean_length": 98.46875, "completions/mean_terminated_length": 98.46875, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 0.049538559456879176, "frac_reward_zero_std": 0.0, "grad_norm": 2.375, "kl": 0.028762757312506437, "learning_rate": 7.8136e-06, "loss": -0.0682, "num_tokens": 21563364.0, "reward": 3.875, "reward_std": 0.7071067690849304, "rewards/reward_fn/mean": 3.875, "rewards/reward_fn/std": 0.7071067690849304, "step": 467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1853.0, "completions/max_terminated_length": 1853.0, "completions/mean_length": 643.59375, "completions/mean_terminated_length": 643.59375, "completions/min_length": 308.0, "completions/min_terminated_length": 308.0, "epoch": 0.049644637742654076, "frac_reward_zero_std": 0.0, "grad_norm": 1.359375, "kl": 0.030355968279764056, "learning_rate": 7.8132e-06, "loss": -0.0552, "num_tokens": 21614039.0, "reward": 2.8546223640441895, "reward_std": 0.9961849451065063, "rewards/reward_fn/mean": 2.8546223640441895, "rewards/reward_fn/std": 0.9961848855018616, "step": 468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 389.0, "completions/max_terminated_length": 389.0, "completions/mean_length": 219.9375, "completions/mean_terminated_length": 219.9375, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.04975071602842898, "frac_reward_zero_std": 0.0, "grad_norm": 1.8203125, "kl": 0.024273998336866498, "learning_rate": 7.812799999999999e-06, "loss": 0.0575, "num_tokens": 21655349.0, "reward": 3.557422161102295, "reward_std": 0.5809412598609924, "rewards/reward_fn/mean": 3.557422161102295, "rewards/reward_fn/std": 0.5809412002563477, "step": 469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 579.0, "completions/max_terminated_length": 579.0, "completions/mean_length": 195.125, "completions/mean_terminated_length": 195.125, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "epoch": 0.04985679431420388, "frac_reward_zero_std": 0.0, "grad_norm": 1.46875, "kl": 0.02798668248578906, "learning_rate": 7.8124e-06, "loss": -0.168, "num_tokens": 21694585.0, "reward": 3.875, "reward_std": 0.7071067690849304, "rewards/reward_fn/mean": 3.875, "rewards/reward_fn/std": 0.7071067690849304, "step": 470 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 996.0, "completions/mean_length": 497.59375, "completions/mean_terminated_length": 447.58062744140625, "completions/min_length": 279.0, "completions/min_terminated_length": 279.0, "epoch": 0.04996287259997879, "frac_reward_zero_std": 0.0, "grad_norm": 1.4453125, "kl": 0.017523658694699407, "learning_rate": 7.812e-06, "loss": 0.2898, "num_tokens": 21750412.0, "reward": 2.634608268737793, "reward_std": 0.5835399031639099, "rewards/reward_fn/mean": 2.634608268737793, "rewards/reward_fn/std": 0.5835399031639099, "step": 471 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 725.0, "completions/max_terminated_length": 725.0, "completions/mean_length": 279.34375, "completions/mean_terminated_length": 279.34375, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "epoch": 0.05006895088575369, "frac_reward_zero_std": 0.0, "grad_norm": 1.875, "kl": 0.027581357397139072, "learning_rate": 7.8116e-06, "loss": 0.0572, "num_tokens": 21807479.0, "reward": 3.4291439056396484, "reward_std": 0.8019877076148987, "rewards/reward_fn/mean": 3.4291439056396484, "rewards/reward_fn/std": 0.8019877076148987, "step": 472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 449.0, "completions/max_terminated_length": 449.0, "completions/mean_length": 190.90625, "completions/mean_terminated_length": 190.90625, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 0.050175029171528586, "frac_reward_zero_std": 0.0, "grad_norm": 2.234375, "kl": 0.040224543772637844, "learning_rate": 7.8112e-06, "loss": 0.001, "num_tokens": 21854292.0, "reward": 3.567188262939453, "reward_std": 0.5695129632949829, "rewards/reward_fn/mean": 3.567188262939453, "rewards/reward_fn/std": 0.5695129036903381, "step": 473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 718.0, "completions/max_terminated_length": 718.0, "completions/mean_length": 204.84375, "completions/mean_terminated_length": 204.84375, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "epoch": 0.05028110745730349, "frac_reward_zero_std": 1.0, "grad_norm": 0.1201171875, "kl": 0.029980882070958614, "learning_rate": 7.8108e-06, "loss": 0.0012, "num_tokens": 21899471.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 143.0, "completions/max_terminated_length": 143.0, "completions/mean_length": 113.125, "completions/mean_terminated_length": 113.125, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "epoch": 0.05038718574307839, "frac_reward_zero_std": 0.0, "grad_norm": 2.203125, "kl": 0.028396222507581115, "learning_rate": 7.810399999999999e-06, "loss": -0.0355, "num_tokens": 21924563.0, "reward": 3.928886651992798, "reward_std": 0.40227818489074707, "rewards/reward_fn/mean": 3.928886651992798, "rewards/reward_fn/std": 0.40227818489074707, "step": 475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 561.0, "completions/max_terminated_length": 561.0, "completions/mean_length": 334.0625, "completions/mean_terminated_length": 334.0625, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "epoch": 0.05049326402885329, "frac_reward_zero_std": 0.0, "grad_norm": 1.3046875, "kl": 0.020257814088836312, "learning_rate": 7.81e-06, "loss": 0.0086, "num_tokens": 21961941.0, "reward": 2.9945178031921387, "reward_std": 0.0230459775775671, "rewards/reward_fn/mean": 2.9945178031921387, "rewards/reward_fn/std": 0.02304600365459919, "step": 476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 231.0, "completions/max_terminated_length": 231.0, "completions/mean_length": 158.6875, "completions/mean_terminated_length": 158.6875, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "epoch": 0.0505993423146282, "frac_reward_zero_std": 0.0, "grad_norm": 2.65625, "kl": 0.0344227678142488, "learning_rate": 7.809599999999999e-06, "loss": -0.0361, "num_tokens": 22013227.0, "reward": 3.5052218437194824, "reward_std": 0.6092379689216614, "rewards/reward_fn/mean": 3.5052218437194824, "rewards/reward_fn/std": 0.6092379689216614, "step": 477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 484.0, "completions/max_terminated_length": 484.0, "completions/mean_length": 168.21875, "completions/mean_terminated_length": 168.21875, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.050705420600403096, "frac_reward_zero_std": 1.0, "grad_norm": 0.185546875, "kl": 0.028984917444176972, "learning_rate": 7.8092e-06, "loss": 0.0012, "num_tokens": 22053618.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 497.0, "completions/max_terminated_length": 497.0, "completions/mean_length": 286.9375, "completions/mean_terminated_length": 286.9375, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 0.050811498886178, "frac_reward_zero_std": 0.0, "grad_norm": 1.65625, "kl": 0.022939151618629694, "learning_rate": 7.808799999999999e-06, "loss": -0.0067, "num_tokens": 22097488.0, "reward": 2.780938148498535, "reward_std": 0.5519546866416931, "rewards/reward_fn/mean": 2.780938148498535, "rewards/reward_fn/std": 0.5519546866416931, "step": 479 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 920.0, "completions/max_terminated_length": 920.0, "completions/mean_length": 172.59375, "completions/mean_terminated_length": 172.59375, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "epoch": 0.0509175771719529, "frac_reward_zero_std": 0.0, "grad_norm": 2.75, "kl": 0.028172635240480304, "learning_rate": 7.8084e-06, "loss": -0.0612, "num_tokens": 22147619.0, "reward": 3.802846670150757, "reward_std": 0.3469943106174469, "rewards/reward_fn/mean": 3.802846670150757, "rewards/reward_fn/std": 0.3469943106174469, "step": 480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 582.0, "completions/max_terminated_length": 582.0, "completions/mean_length": 185.96875, "completions/mean_terminated_length": 185.96875, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 0.0510236554577278, "frac_reward_zero_std": 0.0, "grad_norm": 2.40625, "kl": 0.02581855608150363, "learning_rate": 7.807999999999999e-06, "loss": -0.016, "num_tokens": 22184738.0, "reward": 3.9292778968811035, "reward_std": 0.27909815311431885, "rewards/reward_fn/mean": 3.9292778968811035, "rewards/reward_fn/std": 0.27909815311431885, "step": 481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1052.0, "completions/max_terminated_length": 1052.0, "completions/mean_length": 219.59375, "completions/mean_terminated_length": 219.59375, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 0.05112973374350271, "frac_reward_zero_std": 0.0, "grad_norm": 1.9765625, "kl": 0.023512960644438863, "learning_rate": 7.8076e-06, "loss": 0.0226, "num_tokens": 22225749.0, "reward": 3.501314640045166, "reward_std": 0.5752494931221008, "rewards/reward_fn/mean": 3.501314640045166, "rewards/reward_fn/std": 0.575249433517456, "step": 482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 512.0, "completions/max_terminated_length": 512.0, "completions/mean_length": 269.03125, "completions/mean_terminated_length": 269.03125, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "epoch": 0.05123581202927761, "frac_reward_zero_std": 0.0, "grad_norm": 1.5, "kl": 0.02642969344742596, "learning_rate": 7.8072e-06, "loss": -0.0279, "num_tokens": 22272534.0, "reward": 1.7679412364959717, "reward_std": 0.03526504710316658, "rewards/reward_fn/mean": 1.7679412364959717, "rewards/reward_fn/std": 0.03526502102613449, "step": 483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 426.0, "completions/max_terminated_length": 426.0, "completions/mean_length": 243.46875, "completions/mean_terminated_length": 243.46875, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.051341890315052506, "frac_reward_zero_std": 0.0, "grad_norm": 1.8046875, "kl": 0.018053250038065016, "learning_rate": 7.8068e-06, "loss": 0.0794, "num_tokens": 22319717.0, "reward": 3.0779318809509277, "reward_std": 0.24998284876346588, "rewards/reward_fn/mean": 3.0779318809509277, "rewards/reward_fn/std": 0.2499828338623047, "step": 484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1239.0, "completions/max_terminated_length": 1239.0, "completions/mean_length": 285.96875, "completions/mean_terminated_length": 285.96875, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 0.05144796860082741, "frac_reward_zero_std": 0.0, "grad_norm": 2.203125, "kl": 0.026710799895226955, "learning_rate": 7.8064e-06, "loss": -0.127, "num_tokens": 22362308.0, "reward": 2.9080183506011963, "reward_std": 0.29254910349845886, "rewards/reward_fn/mean": 2.9080183506011963, "rewards/reward_fn/std": 0.29254910349845886, "step": 485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 876.0, "completions/max_terminated_length": 876.0, "completions/mean_length": 370.875, "completions/mean_terminated_length": 370.875, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 0.05155404688660231, "frac_reward_zero_std": 0.0, "grad_norm": 1.3359375, "kl": 0.0195961135905236, "learning_rate": 7.806e-06, "loss": -0.0933, "num_tokens": 22407808.0, "reward": 3.541620969772339, "reward_std": 0.730574369430542, "rewards/reward_fn/mean": 3.541620969772339, "rewards/reward_fn/std": 0.7305744290351868, "step": 486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 432.0, "completions/max_terminated_length": 432.0, "completions/mean_length": 203.96875, "completions/mean_terminated_length": 203.96875, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 0.05166012517237722, "frac_reward_zero_std": 1.0, "grad_norm": 0.11865234375, "kl": 0.02821559482254088, "learning_rate": 7.8056e-06, "loss": 0.0011, "num_tokens": 22448063.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1064.0, "completions/mean_length": 575.125, "completions/mean_terminated_length": 527.6129150390625, "completions/min_length": 302.0, "completions/min_terminated_length": 302.0, "epoch": 0.05176620345815212, "frac_reward_zero_std": 0.0, "grad_norm": 1.21875, "kl": 0.014755380223505199, "learning_rate": 7.8052e-06, "loss": 0.201, "num_tokens": 22517571.0, "reward": 3.101287841796875, "reward_std": 0.8764511942863464, "rewards/reward_fn/mean": 3.101287841796875, "rewards/reward_fn/std": 0.8764511942863464, "step": 488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1432.0, "completions/max_terminated_length": 1432.0, "completions/mean_length": 599.1875, "completions/mean_terminated_length": 599.1875, "completions/min_length": 288.0, "completions/min_terminated_length": 288.0, "epoch": 0.051872281743927016, "frac_reward_zero_std": 0.0, "grad_norm": 1.28125, "kl": 0.022309450898319483, "learning_rate": 7.8048e-06, "loss": 0.0696, "num_tokens": 22571081.0, "reward": 2.6842198371887207, "reward_std": 0.4956829249858856, "rewards/reward_fn/mean": 2.6842198371887207, "rewards/reward_fn/std": 0.49568289518356323, "step": 489 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1693.0, "completions/max_terminated_length": 1693.0, "completions/mean_length": 440.71875, "completions/mean_terminated_length": 440.71875, "completions/min_length": 252.0, "completions/min_terminated_length": 252.0, "epoch": 0.05197836002970192, "frac_reward_zero_std": 0.0, "grad_norm": 1.171875, "kl": 0.011530151590704918, "learning_rate": 7.8044e-06, "loss": -0.089, "num_tokens": 22635232.0, "reward": 2.738710641860962, "reward_std": 0.24050787091255188, "rewards/reward_fn/mean": 2.738710641860962, "rewards/reward_fn/std": 0.24050785601139069, "step": 490 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1725.0, "completions/max_terminated_length": 1725.0, "completions/mean_length": 503.53125, "completions/mean_terminated_length": 503.53125, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "epoch": 0.05208443831547682, "frac_reward_zero_std": 0.0, "grad_norm": 1.5078125, "kl": 0.023490125546231866, "learning_rate": 7.804e-06, "loss": 0.0417, "num_tokens": 22692849.0, "reward": 3.4488565921783447, "reward_std": 0.6377933025360107, "rewards/reward_fn/mean": 3.4488565921783447, "rewards/reward_fn/std": 0.6377933025360107, "step": 491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1144.0, "completions/max_terminated_length": 1144.0, "completions/mean_length": 233.1875, "completions/mean_terminated_length": 233.1875, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "epoch": 0.05219051660125172, "frac_reward_zero_std": 0.0, "grad_norm": 1.8671875, "kl": 0.023990701185539365, "learning_rate": 7.8036e-06, "loss": -0.0747, "num_tokens": 22734231.0, "reward": 2.9281232357025146, "reward_std": 0.19461673498153687, "rewards/reward_fn/mean": 2.9281232357025146, "rewards/reward_fn/std": 0.19461672008037567, "step": 492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 941.0, "completions/max_terminated_length": 941.0, "completions/mean_length": 264.28125, "completions/mean_terminated_length": 264.28125, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 0.05229659488702663, "frac_reward_zero_std": 0.0, "grad_norm": 1.9609375, "kl": 0.026221094885841012, "learning_rate": 7.8032e-06, "loss": 0.1836, "num_tokens": 22778592.0, "reward": 3.4923720359802246, "reward_std": 0.5916131734848022, "rewards/reward_fn/mean": 3.4923720359802246, "rewards/reward_fn/std": 0.5916131734848022, "step": 493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 559.0, "completions/max_terminated_length": 559.0, "completions/mean_length": 197.25, "completions/mean_terminated_length": 197.25, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.05240267317280153, "frac_reward_zero_std": 1.0, "grad_norm": 0.1298828125, "kl": 0.024377966998144984, "learning_rate": 7.8028e-06, "loss": 0.001, "num_tokens": 22821800.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1664.0, "completions/max_terminated_length": 1664.0, "completions/mean_length": 389.84375, "completions/mean_terminated_length": 389.84375, "completions/min_length": 243.0, "completions/min_terminated_length": 243.0, "epoch": 0.052508751458576426, "frac_reward_zero_std": 0.0, "grad_norm": 1.25, "kl": 0.03522437275387347, "learning_rate": 7.8024e-06, "loss": 0.0624, "num_tokens": 22870019.0, "reward": 3.2021644115448, "reward_std": 0.43235763907432556, "rewards/reward_fn/mean": 3.2021644115448, "rewards/reward_fn/std": 0.4323575794696808, "step": 495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 676.0, "completions/max_terminated_length": 676.0, "completions/mean_length": 292.8125, "completions/mean_terminated_length": 292.8125, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.05261482974435133, "frac_reward_zero_std": 0.0, "grad_norm": 2.171875, "kl": 0.01953095616772771, "learning_rate": 7.802e-06, "loss": 0.1153, "num_tokens": 22925949.0, "reward": 3.7166550159454346, "reward_std": 0.6504583358764648, "rewards/reward_fn/mean": 3.7166550159454346, "rewards/reward_fn/std": 0.6504583358764648, "step": 496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1207.0, "completions/max_terminated_length": 1207.0, "completions/mean_length": 581.03125, "completions/mean_terminated_length": 581.03125, "completions/min_length": 309.0, "completions/min_terminated_length": 309.0, "epoch": 0.05272090803012623, "frac_reward_zero_std": 0.0, "grad_norm": 1.0625, "kl": 0.015155259286984801, "learning_rate": 7.8016e-06, "loss": -0.0057, "num_tokens": 22988254.0, "reward": 2.6650097370147705, "reward_std": 0.3541271388530731, "rewards/reward_fn/mean": 2.6650097370147705, "rewards/reward_fn/std": 0.3541271388530731, "step": 497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1439.0, "completions/max_terminated_length": 1439.0, "completions/mean_length": 470.71875, "completions/mean_terminated_length": 470.71875, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 0.05282698631590114, "frac_reward_zero_std": 0.0, "grad_norm": 1.5703125, "kl": 0.017473606974817812, "learning_rate": 7.801199999999999e-06, "loss": -0.0481, "num_tokens": 23038293.0, "reward": 2.4420838356018066, "reward_std": 0.5715554356575012, "rewards/reward_fn/mean": 2.4420838356018066, "rewards/reward_fn/std": 0.5715554356575012, "step": 498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 959.0, "completions/max_terminated_length": 959.0, "completions/mean_length": 338.46875, "completions/mean_terminated_length": 338.46875, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "epoch": 0.05293306460167604, "frac_reward_zero_std": 1.0, "grad_norm": 0.062255859375, "kl": 0.015421941527165473, "learning_rate": 7.8008e-06, "loss": 0.0006, "num_tokens": 23075140.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 499 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 376.0, "completions/max_terminated_length": 376.0, "completions/mean_length": 269.1875, "completions/mean_terminated_length": 269.1875, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.05303914288745094, "frac_reward_zero_std": 0.0, "grad_norm": 2.078125, "kl": 0.01675526413600892, "learning_rate": 7.800399999999999e-06, "loss": 0.037, "num_tokens": 23126026.0, "reward": 3.928408145904541, "reward_std": 0.4049839973449707, "rewards/reward_fn/mean": 3.928408145904541, "rewards/reward_fn/std": 0.4049839973449707, "step": 500 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 93.0, "completions/max_terminated_length": 93.0, "completions/mean_length": 66.9375, "completions/mean_terminated_length": 66.9375, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 0.05314522117322584, "frac_reward_zero_std": 1.0, "grad_norm": 0.279296875, "kl": 0.026731195161119103, "learning_rate": 7.8e-06, "loss": 0.0011, "num_tokens": 23146344.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 501 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1739.0, "completions/max_terminated_length": 1739.0, "completions/mean_length": 440.5625, "completions/mean_terminated_length": 440.5625, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "epoch": 0.05325129945900074, "frac_reward_zero_std": 0.0, "grad_norm": 1.7421875, "kl": 0.018118804087862372, "learning_rate": 7.799599999999999e-06, "loss": 0.0598, "num_tokens": 23201882.0, "reward": 3.886624336242676, "reward_std": 0.4708651900291443, "rewards/reward_fn/mean": 3.886624336242676, "rewards/reward_fn/std": 0.4708651900291443, "step": 502 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1298.0, "completions/max_terminated_length": 1298.0, "completions/mean_length": 301.1875, "completions/mean_terminated_length": 301.1875, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 0.05335737774477564, "frac_reward_zero_std": 0.0, "grad_norm": 1.7890625, "kl": 0.02376817143522203, "learning_rate": 7.7992e-06, "loss": -0.0487, "num_tokens": 23246368.0, "reward": 3.6897573471069336, "reward_std": 0.4686228930950165, "rewards/reward_fn/mean": 3.6897573471069336, "rewards/reward_fn/std": 0.4686228930950165, "step": 503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1320.0, "completions/max_terminated_length": 1320.0, "completions/mean_length": 344.96875, "completions/mean_terminated_length": 344.96875, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 0.05346345603055055, "frac_reward_zero_std": 0.0, "grad_norm": 2.0625, "kl": 0.025152756366878748, "learning_rate": 7.798799999999999e-06, "loss": 0.1383, "num_tokens": 23291359.0, "reward": 2.986790657043457, "reward_std": 0.23401233553886414, "rewards/reward_fn/mean": 2.986790657043457, "rewards/reward_fn/std": 0.23401233553886414, "step": 504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1796.0, "completions/max_terminated_length": 1796.0, "completions/mean_length": 484.1875, "completions/mean_terminated_length": 484.1875, "completions/min_length": 275.0, "completions/min_terminated_length": 275.0, "epoch": 0.05356953431632545, "frac_reward_zero_std": 0.0, "grad_norm": 1.6484375, "kl": 0.017668452579528093, "learning_rate": 7.7984e-06, "loss": 0.0279, "num_tokens": 23355333.0, "reward": 2.622150421142578, "reward_std": 0.31195494532585144, "rewards/reward_fn/mean": 2.622150421142578, "rewards/reward_fn/std": 0.31195491552352905, "step": 505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1184.0, "completions/max_terminated_length": 1184.0, "completions/mean_length": 353.625, "completions/mean_terminated_length": 353.625, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "epoch": 0.05367561260210035, "frac_reward_zero_std": 0.0, "grad_norm": 1.609375, "kl": 0.018400359200313687, "learning_rate": 7.797999999999999e-06, "loss": 0.0488, "num_tokens": 23388185.0, "reward": 3.707305431365967, "reward_std": 0.7213976383209229, "rewards/reward_fn/mean": 3.707305431365967, "rewards/reward_fn/std": 0.7213976979255676, "step": 506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 317.0, "completions/max_terminated_length": 317.0, "completions/mean_length": 230.78125, "completions/mean_terminated_length": 230.78125, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 0.05378169088787525, "frac_reward_zero_std": 0.0, "grad_norm": 1.90625, "kl": 0.02609437541104853, "learning_rate": 7.7976e-06, "loss": -0.0298, "num_tokens": 23435730.0, "reward": 3.6243045330047607, "reward_std": 0.5281786322593689, "rewards/reward_fn/mean": 3.6243045330047607, "rewards/reward_fn/std": 0.5281786322593689, "step": 507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1025.0, "completions/mean_length": 315.96875, "completions/mean_terminated_length": 260.0967712402344, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 0.05388776917365015, "frac_reward_zero_std": 0.0, "grad_norm": 2.015625, "kl": 0.02170586190186441, "learning_rate": 7.7972e-06, "loss": 0.3208, "num_tokens": 23483857.0, "reward": 3.8347511291503906, "reward_std": 0.7358342409133911, "rewards/reward_fn/mean": 3.8347511291503906, "rewards/reward_fn/std": 0.7358343005180359, "step": 508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 822.0, "completions/max_terminated_length": 822.0, "completions/mean_length": 293.3125, "completions/mean_terminated_length": 293.3125, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "epoch": 0.05399384745942506, "frac_reward_zero_std": 0.0, "grad_norm": 1.1484375, "kl": 0.0171760261291638, "learning_rate": 7.7968e-06, "loss": -0.0252, "num_tokens": 23537883.0, "reward": 3.928311824798584, "reward_std": 0.4055293798446655, "rewards/reward_fn/mean": 3.928311824798584, "rewards/reward_fn/std": 0.4055293798446655, "step": 509 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 686.0, "completions/max_terminated_length": 686.0, "completions/mean_length": 194.53125, "completions/mean_terminated_length": 194.53125, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.05409992574519996, "frac_reward_zero_std": 1.0, "grad_norm": 0.099609375, "kl": 0.025215985951945186, "learning_rate": 7.7964e-06, "loss": 0.001, "num_tokens": 23567724.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 510 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1246.0, "completions/max_terminated_length": 1246.0, "completions/mean_length": 224.09375, "completions/mean_terminated_length": 224.09375, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "epoch": 0.05420600403097486, "frac_reward_zero_std": 0.0, "grad_norm": 1.6171875, "kl": 0.025873176753520966, "learning_rate": 7.796e-06, "loss": -0.0319, "num_tokens": 23617359.0, "reward": 2.72564435005188, "reward_std": 0.18606720864772797, "rewards/reward_fn/mean": 2.72564435005188, "rewards/reward_fn/std": 0.18606719374656677, "step": 511 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 315.0, "completions/max_terminated_length": 315.0, "completions/mean_length": 221.46875, "completions/mean_terminated_length": 221.46875, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.05431208231674976, "frac_reward_zero_std": 1.0, "grad_norm": 0.09814453125, "kl": 0.02119889738969505, "learning_rate": 7.7956e-06, "loss": 0.0008, "num_tokens": 23674782.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 788.0, "completions/max_terminated_length": 788.0, "completions/mean_length": 275.71875, "completions/mean_terminated_length": 275.71875, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 0.05441816060252466, "frac_reward_zero_std": 0.0, "grad_norm": 1.7734375, "kl": 0.019600946456193924, "learning_rate": 7.7952e-06, "loss": 0.0119, "num_tokens": 23716373.0, "reward": 3.9173965454101562, "reward_std": 0.3250616788864136, "rewards/reward_fn/mean": 3.9173965454101562, "rewards/reward_fn/std": 0.3250616490840912, "step": 513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 536.0, "completions/max_terminated_length": 536.0, "completions/mean_length": 199.875, "completions/mean_terminated_length": 199.875, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 0.05452423888829957, "frac_reward_zero_std": 1.0, "grad_norm": 0.0947265625, "kl": 0.023369343602098525, "learning_rate": 7.7948e-06, "loss": 0.0009, "num_tokens": 23751569.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 247.0, "completions/max_terminated_length": 247.0, "completions/mean_length": 144.625, "completions/mean_terminated_length": 144.625, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 0.05463031717407447, "frac_reward_zero_std": 1.0, "grad_norm": 0.11083984375, "kl": 0.024780564941465855, "learning_rate": 7.7944e-06, "loss": 0.001, "num_tokens": 23795365.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1589.0, "completions/max_terminated_length": 1589.0, "completions/mean_length": 482.34375, "completions/mean_terminated_length": 482.34375, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "epoch": 0.05473639545984937, "frac_reward_zero_std": 0.0, "grad_norm": 1.4609375, "kl": 0.01738837524317205, "learning_rate": 7.793999999999999e-06, "loss": 0.0892, "num_tokens": 23869648.0, "reward": 3.2161927223205566, "reward_std": 0.9148228764533997, "rewards/reward_fn/mean": 3.2161927223205566, "rewards/reward_fn/std": 0.9148228764533997, "step": 516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1034.0, "completions/max_terminated_length": 1034.0, "completions/mean_length": 371.25, "completions/mean_terminated_length": 371.25, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "epoch": 0.05484247374562427, "frac_reward_zero_std": 0.0, "grad_norm": 1.5078125, "kl": 0.02530662529170513, "learning_rate": 7.7936e-06, "loss": -0.0051, "num_tokens": 23911960.0, "reward": 2.4564008712768555, "reward_std": 0.7786185145378113, "rewards/reward_fn/mean": 2.4564008712768555, "rewards/reward_fn/std": 0.7786185145378113, "step": 517 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 283.0, "completions/max_terminated_length": 283.0, "completions/mean_length": 99.0625, "completions/mean_terminated_length": 99.0625, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "epoch": 0.05494855203139917, "frac_reward_zero_std": 1.0, "grad_norm": 0.203125, "kl": 0.025294956751167774, "learning_rate": 7.793199999999999e-06, "loss": 0.001, "num_tokens": 23950170.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 518 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 220.0, "completions/max_terminated_length": 220.0, "completions/mean_length": 156.125, "completions/mean_terminated_length": 156.125, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 0.05505463031717407, "frac_reward_zero_std": 0.0, "grad_norm": 2.109375, "kl": 0.01991723431274295, "learning_rate": 7.7928e-06, "loss": 0.0355, "num_tokens": 23986654.0, "reward": 3.9727840423583984, "reward_std": 0.15395739674568176, "rewards/reward_fn/mean": 3.9727840423583984, "rewards/reward_fn/std": 0.15395741164684296, "step": 519 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 237.0, "completions/max_terminated_length": 237.0, "completions/mean_length": 80.59375, "completions/mean_terminated_length": 80.59375, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 0.05516070860294898, "frac_reward_zero_std": 1.0, "grad_norm": 0.2041015625, "kl": 0.02751685946714133, "learning_rate": 7.7924e-06, "loss": 0.0011, "num_tokens": 24022641.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 520 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 796.0, "completions/max_terminated_length": 796.0, "completions/mean_length": 288.15625, "completions/mean_terminated_length": 288.15625, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.05526678688872388, "frac_reward_zero_std": 1.0, "grad_norm": 0.07421875, "kl": 0.01649426226504147, "learning_rate": 7.792e-06, "loss": 0.0007, "num_tokens": 24068726.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 521 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 774.0, "completions/max_terminated_length": 774.0, "completions/mean_length": 201.03125, "completions/mean_terminated_length": 201.03125, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 0.05537286517449878, "frac_reward_zero_std": 0.0, "grad_norm": 2.265625, "kl": 0.024215756682679057, "learning_rate": 7.7916e-06, "loss": -0.0523, "num_tokens": 24125495.0, "reward": 3.6382832527160645, "reward_std": 0.476779580116272, "rewards/reward_fn/mean": 3.6382832527160645, "rewards/reward_fn/std": 0.476779580116272, "step": 522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1266.0, "completions/max_terminated_length": 1266.0, "completions/mean_length": 372.78125, "completions/mean_terminated_length": 372.78125, "completions/min_length": 242.0, "completions/min_terminated_length": 242.0, "epoch": 0.05547894346027368, "frac_reward_zero_std": 0.0, "grad_norm": 1.3203125, "kl": 0.018172105425037444, "learning_rate": 7.7912e-06, "loss": 0.0154, "num_tokens": 24186032.0, "reward": 2.670097589492798, "reward_std": 0.18027065694332123, "rewards/reward_fn/mean": 2.670097589492798, "rewards/reward_fn/std": 0.18027064204216003, "step": 523 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 795.0, "completions/max_terminated_length": 795.0, "completions/mean_length": 239.28125, "completions/mean_terminated_length": 239.28125, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.05558502174604858, "frac_reward_zero_std": 0.0, "grad_norm": 1.8359375, "kl": 0.024925056844949722, "learning_rate": 7.7908e-06, "loss": 0.0018, "num_tokens": 24209209.0, "reward": 3.8195762634277344, "reward_std": 0.5922850966453552, "rewards/reward_fn/mean": 3.8195762634277344, "rewards/reward_fn/std": 0.5922850966453552, "step": 524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 770.0, "completions/max_terminated_length": 770.0, "completions/mean_length": 222.53125, "completions/mean_terminated_length": 222.53125, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "epoch": 0.05569110003182349, "frac_reward_zero_std": 0.0, "grad_norm": 1.7890625, "kl": 0.025476337876170874, "learning_rate": 7.790399999999999e-06, "loss": -0.0391, "num_tokens": 24253034.0, "reward": 3.9297561645507812, "reward_std": 0.2772708833217621, "rewards/reward_fn/mean": 3.9297561645507812, "rewards/reward_fn/std": 0.2772708535194397, "step": 525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 413.0, "completions/max_terminated_length": 413.0, "completions/mean_length": 267.0625, "completions/mean_terminated_length": 267.0625, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "epoch": 0.05579717831759839, "frac_reward_zero_std": 0.0, "grad_norm": 1.96875, "kl": 0.021522009512409568, "learning_rate": 7.79e-06, "loss": -0.0187, "num_tokens": 24294476.0, "reward": 2.7246017456054688, "reward_std": 0.24783332645893097, "rewards/reward_fn/mean": 2.7246017456054688, "rewards/reward_fn/std": 0.24783335626125336, "step": 526 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1145.0, "completions/max_terminated_length": 1145.0, "completions/mean_length": 333.125, "completions/mean_terminated_length": 333.125, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 0.05590325660337329, "frac_reward_zero_std": 0.0, "grad_norm": 2.109375, "kl": 0.023421776480972767, "learning_rate": 7.789599999999999e-06, "loss": 0.1981, "num_tokens": 24369616.0, "reward": 3.8314743041992188, "reward_std": 0.5596181154251099, "rewards/reward_fn/mean": 3.8314743041992188, "rewards/reward_fn/std": 0.5596181154251099, "step": 527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 910.0, "completions/max_terminated_length": 910.0, "completions/mean_length": 336.9375, "completions/mean_terminated_length": 336.9375, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.056009334889148193, "frac_reward_zero_std": 0.0, "grad_norm": 1.4453125, "kl": 0.018540258635766804, "learning_rate": 7.7892e-06, "loss": -0.0174, "num_tokens": 24426990.0, "reward": 3.9209604263305664, "reward_std": 0.3111303448677063, "rewards/reward_fn/mean": 3.9209604263305664, "rewards/reward_fn/std": 0.3111303448677063, "step": 528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 603.0, "completions/max_terminated_length": 603.0, "completions/mean_length": 424.53125, "completions/mean_terminated_length": 424.53125, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "epoch": 0.05611541317492309, "frac_reward_zero_std": 0.0, "grad_norm": 1.7578125, "kl": 0.02311969199217856, "learning_rate": 7.788799999999999e-06, "loss": 0.0972, "num_tokens": 24479199.0, "reward": 2.616727828979492, "reward_std": 0.3706587255001068, "rewards/reward_fn/mean": 2.616727828979492, "rewards/reward_fn/std": 0.3706587553024292, "step": 529 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1286.0, "completions/max_terminated_length": 1286.0, "completions/mean_length": 363.96875, "completions/mean_terminated_length": 363.96875, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.05622149146069799, "frac_reward_zero_std": 0.0, "grad_norm": 1.3828125, "kl": 0.018128991941921413, "learning_rate": 7.7884e-06, "loss": -0.0755, "num_tokens": 24525630.0, "reward": 2.7659575939178467, "reward_std": 0.31296306848526, "rewards/reward_fn/mean": 2.7659575939178467, "rewards/reward_fn/std": 0.3129630386829376, "step": 530 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 799.0, "completions/max_terminated_length": 799.0, "completions/mean_length": 256.28125, "completions/mean_terminated_length": 256.28125, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 0.0563275697464729, "frac_reward_zero_std": 1.0, "grad_norm": 0.076171875, "kl": 0.02169134363066405, "learning_rate": 7.788e-06, "loss": 0.0009, "num_tokens": 24582535.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 531 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1121.0, "completions/max_terminated_length": 1121.0, "completions/mean_length": 280.75, "completions/mean_terminated_length": 280.75, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 0.0564336480322478, "frac_reward_zero_std": 0.0, "grad_norm": 1.1328125, "kl": 0.022156615275889635, "learning_rate": 7.7876e-06, "loss": -0.0788, "num_tokens": 24625151.0, "reward": 3.9418551921844482, "reward_std": 0.22879938781261444, "rewards/reward_fn/mean": 3.9418551921844482, "rewards/reward_fn/std": 0.22879941761493683, "step": 532 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 386.0, "completions/max_terminated_length": 386.0, "completions/mean_length": 192.71875, "completions/mean_terminated_length": 192.71875, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 0.056539726318022704, "frac_reward_zero_std": 0.0, "grad_norm": 1.6328125, "kl": 0.01747878734022379, "learning_rate": 7.7872e-06, "loss": -0.0088, "num_tokens": 24663670.0, "reward": 2.927187442779541, "reward_std": 0.05285609886050224, "rewards/reward_fn/mean": 2.927187442779541, "rewards/reward_fn/std": 0.05285611376166344, "step": 533 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1364.0, "completions/max_terminated_length": 1364.0, "completions/mean_length": 286.15625, "completions/mean_terminated_length": 286.15625, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.0566458046037976, "frac_reward_zero_std": 0.0, "grad_norm": 1.984375, "kl": 0.021938928868621588, "learning_rate": 7.7868e-06, "loss": -0.0154, "num_tokens": 24693819.0, "reward": 2.5903778076171875, "reward_std": 0.4148198962211609, "rewards/reward_fn/mean": 2.5903778076171875, "rewards/reward_fn/std": 0.4148198962211609, "step": 534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 266.0, "completions/max_terminated_length": 266.0, "completions/mean_length": 176.0, "completions/mean_terminated_length": 176.0, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 0.0567518828895725, "frac_reward_zero_std": 0.0, "grad_norm": 1.90625, "kl": 0.026313411304727197, "learning_rate": 7.7864e-06, "loss": -0.043, "num_tokens": 24727163.0, "reward": 3.9305410385131836, "reward_std": 0.3929198384284973, "rewards/reward_fn/mean": 3.9305410385131836, "rewards/reward_fn/std": 0.3929198086261749, "step": 535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 989.0, "completions/mean_length": 450.96875, "completions/mean_terminated_length": 399.45159912109375, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "epoch": 0.05685796117534741, "frac_reward_zero_std": 0.0, "grad_norm": 1.7734375, "kl": 0.02474389597773552, "learning_rate": 7.786e-06, "loss": 0.295, "num_tokens": 24807642.0, "reward": 2.4844565391540527, "reward_std": 0.7617939710617065, "rewards/reward_fn/mean": 2.4844565391540527, "rewards/reward_fn/std": 0.7617940306663513, "step": 536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1038.0, "completions/max_terminated_length": 1038.0, "completions/mean_length": 279.125, "completions/mean_terminated_length": 279.125, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.05696403946112231, "frac_reward_zero_std": 0.0, "grad_norm": 2.140625, "kl": 0.023733343929052353, "learning_rate": 7.785599999999999e-06, "loss": 0.2354, "num_tokens": 24864926.0, "reward": 3.690593719482422, "reward_std": 0.6516405344009399, "rewards/reward_fn/mean": 3.690593719482422, "rewards/reward_fn/std": 0.6516405344009399, "step": 537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1344.0, "completions/max_terminated_length": 1344.0, "completions/mean_length": 249.25, "completions/mean_terminated_length": 249.25, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 0.05707011774689721, "frac_reward_zero_std": 1.0, "grad_norm": 0.0791015625, "kl": 0.01965078490320593, "learning_rate": 7.7852e-06, "loss": 0.0008, "num_tokens": 24893702.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 538 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 512.0, "completions/max_terminated_length": 512.0, "completions/mean_length": 251.375, "completions/mean_terminated_length": 251.375, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "epoch": 0.057176196032672114, "frac_reward_zero_std": 0.0, "grad_norm": 1.859375, "kl": 0.0240073436871171, "learning_rate": 7.784799999999999e-06, "loss": -0.0035, "num_tokens": 24948466.0, "reward": 3.0717613697052, "reward_std": 0.3055979013442993, "rewards/reward_fn/mean": 3.0717613697052, "rewards/reward_fn/std": 0.3055979013442993, "step": 539 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 922.0, "completions/max_terminated_length": 922.0, "completions/mean_length": 420.9375, "completions/mean_terminated_length": 420.9375, "completions/min_length": 290.0, "completions/min_terminated_length": 290.0, "epoch": 0.05728227431844701, "frac_reward_zero_std": 0.0, "grad_norm": 1.34375, "kl": 0.017085597850382328, "learning_rate": 7.7844e-06, "loss": -0.0404, "num_tokens": 24999024.0, "reward": 3.8227782249450684, "reward_std": 0.5049712657928467, "rewards/reward_fn/mean": 3.8227782249450684, "rewards/reward_fn/std": 0.5049712061882019, "step": 540 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 820.0, "completions/mean_length": 429.375, "completions/mean_terminated_length": 377.1612854003906, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "epoch": 0.05738835260422192, "frac_reward_zero_std": 0.0, "grad_norm": 1.7109375, "kl": 0.017597553320229053, "learning_rate": 7.783999999999999e-06, "loss": 0.2524, "num_tokens": 25047324.0, "reward": 3.8369994163513184, "reward_std": 0.7323954701423645, "rewards/reward_fn/mean": 3.8369994163513184, "rewards/reward_fn/std": 0.7323954105377197, "step": 541 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1177.0, "completions/max_terminated_length": 1177.0, "completions/mean_length": 378.90625, "completions/mean_terminated_length": 378.90625, "completions/min_length": 224.0, "completions/min_terminated_length": 224.0, "epoch": 0.05749443088999682, "frac_reward_zero_std": 0.0, "grad_norm": 1.515625, "kl": 0.016290104715153575, "learning_rate": 7.7836e-06, "loss": 0.0425, "num_tokens": 25098361.0, "reward": 3.7207727432250977, "reward_std": 0.5368377566337585, "rewards/reward_fn/mean": 3.7207727432250977, "rewards/reward_fn/std": 0.5368378162384033, "step": 542 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1419.0, "completions/mean_length": 524.0, "completions/mean_terminated_length": 474.83868408203125, "completions/min_length": 274.0, "completions/min_terminated_length": 274.0, "epoch": 0.05760050917577172, "frac_reward_zero_std": 0.0, "grad_norm": 1.5078125, "kl": 0.015481176087632775, "learning_rate": 7.7832e-06, "loss": 0.2563, "num_tokens": 25159897.0, "reward": 3.875, "reward_std": 0.7071067690849304, "rewards/reward_fn/mean": 3.875, "rewards/reward_fn/std": 0.7071067690849304, "step": 543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1671.0, "completions/max_terminated_length": 1671.0, "completions/mean_length": 397.84375, "completions/mean_terminated_length": 397.84375, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 0.057706587461546624, "frac_reward_zero_std": 0.0, "grad_norm": 1.5078125, "kl": 0.020055697415955365, "learning_rate": 7.7828e-06, "loss": 0.0396, "num_tokens": 25186612.0, "reward": 3.4372525215148926, "reward_std": 0.9111435413360596, "rewards/reward_fn/mean": 3.4372525215148926, "rewards/reward_fn/std": 0.9111434817314148, "step": 544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1547.0, "completions/max_terminated_length": 1547.0, "completions/mean_length": 402.5625, "completions/mean_terminated_length": 402.5625, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "epoch": 0.05781266574732152, "frac_reward_zero_std": 0.0, "grad_norm": 1.4609375, "kl": 0.01703974965494126, "learning_rate": 7.7824e-06, "loss": -0.0174, "num_tokens": 25245926.0, "reward": 2.1865077018737793, "reward_std": 0.591599702835083, "rewards/reward_fn/mean": 2.1865077018737793, "rewards/reward_fn/std": 0.5915996432304382, "step": 545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 466.0, "completions/max_terminated_length": 466.0, "completions/mean_length": 129.03125, "completions/mean_terminated_length": 129.03125, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "epoch": 0.05791874403309642, "frac_reward_zero_std": 1.0, "grad_norm": 0.13671875, "kl": 0.021547950338572264, "learning_rate": 7.782e-06, "loss": 0.0009, "num_tokens": 25293607.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 546 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 571.0, "completions/max_terminated_length": 571.0, "completions/mean_length": 135.34375, "completions/mean_terminated_length": 135.34375, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "epoch": 0.05802482231887133, "frac_reward_zero_std": 1.0, "grad_norm": 0.11181640625, "kl": 0.02250622259452939, "learning_rate": 7.7816e-06, "loss": 0.0009, "num_tokens": 25337234.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 547 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 911.0, "completions/max_terminated_length": 911.0, "completions/mean_length": 438.40625, "completions/mean_terminated_length": 438.40625, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 0.05813090060464623, "frac_reward_zero_std": 0.0, "grad_norm": 1.1640625, "kl": 0.016708286479115486, "learning_rate": 7.7812e-06, "loss": -0.0201, "num_tokens": 25370719.0, "reward": 3.6040561199188232, "reward_std": 0.6672210693359375, "rewards/reward_fn/mean": 3.6040561199188232, "rewards/reward_fn/std": 0.6672210693359375, "step": 548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 716.0, "completions/max_terminated_length": 716.0, "completions/mean_length": 275.15625, "completions/mean_terminated_length": 275.15625, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 0.05823697889042113, "frac_reward_zero_std": 0.0, "grad_norm": 1.6953125, "kl": 0.022039557108655572, "learning_rate": 7.7808e-06, "loss": -0.0129, "num_tokens": 25399588.0, "reward": 3.9713706970214844, "reward_std": 0.1619519293308258, "rewards/reward_fn/mean": 3.9713706970214844, "rewards/reward_fn/std": 0.1619519144296646, "step": 549 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 359.0, "completions/max_terminated_length": 359.0, "completions/mean_length": 251.59375, "completions/mean_terminated_length": 251.59375, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 0.058343057176196034, "frac_reward_zero_std": 0.0, "grad_norm": 1.5546875, "kl": 0.025676261633634567, "learning_rate": 7.7804e-06, "loss": 0.0033, "num_tokens": 25442935.0, "reward": 3.9251885414123535, "reward_std": 0.42319679260253906, "rewards/reward_fn/mean": 3.9251885414123535, "rewards/reward_fn/std": 0.4231967329978943, "step": 550 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 394.0, "completions/max_terminated_length": 394.0, "completions/mean_length": 318.1875, "completions/mean_terminated_length": 318.1875, "completions/min_length": 260.0, "completions/min_terminated_length": 260.0, "epoch": 0.05844913546197093, "frac_reward_zero_std": 1.0, "grad_norm": 0.06640625, "kl": 0.01699168875347823, "learning_rate": 7.78e-06, "loss": 0.0007, "num_tokens": 25508861.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 551 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 609.0, "completions/max_terminated_length": 609.0, "completions/mean_length": 216.03125, "completions/mean_terminated_length": 216.03125, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 0.05855521374774584, "frac_reward_zero_std": 0.0, "grad_norm": 2.03125, "kl": 0.023042756598442793, "learning_rate": 7.7796e-06, "loss": -0.101, "num_tokens": 25546814.0, "reward": 2.9139137268066406, "reward_std": 0.2089153379201889, "rewards/reward_fn/mean": 2.9139137268066406, "rewards/reward_fn/std": 0.2089153528213501, "step": 552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 468.0, "completions/max_terminated_length": 468.0, "completions/mean_length": 324.71875, "completions/mean_terminated_length": 324.71875, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "epoch": 0.05866129203352074, "frac_reward_zero_std": 0.0, "grad_norm": 1.640625, "kl": 0.021038109436631203, "learning_rate": 7.7792e-06, "loss": 0.0099, "num_tokens": 25596405.0, "reward": 2.8648502826690674, "reward_std": 0.312248170375824, "rewards/reward_fn/mean": 2.8648502826690674, "rewards/reward_fn/std": 0.312248170375824, "step": 553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1097.0, "completions/mean_length": 406.96875, "completions/mean_terminated_length": 354.0322570800781, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 0.05876737031929564, "frac_reward_zero_std": 0.0, "grad_norm": 1.5234375, "kl": 0.027637179009616375, "learning_rate": 7.7788e-06, "loss": 0.0696, "num_tokens": 25645524.0, "reward": 3.4637341499328613, "reward_std": 0.9320629239082336, "rewards/reward_fn/mean": 3.4637341499328613, "rewards/reward_fn/std": 0.9320629239082336, "step": 554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1251.0, "completions/max_terminated_length": 1251.0, "completions/mean_length": 337.375, "completions/mean_terminated_length": 337.375, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 0.058873448605070544, "frac_reward_zero_std": 0.0, "grad_norm": 1.1171875, "kl": 0.02197849005460739, "learning_rate": 7.7784e-06, "loss": -0.0888, "num_tokens": 25689728.0, "reward": 3.8957581520080566, "reward_std": 0.3294965326786041, "rewards/reward_fn/mean": 3.8957581520080566, "rewards/reward_fn/std": 0.3294965624809265, "step": 555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 770.0, "completions/max_terminated_length": 770.0, "completions/mean_length": 230.8125, "completions/mean_terminated_length": 230.8125, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.05897952689084544, "frac_reward_zero_std": 0.0, "grad_norm": 1.953125, "kl": 0.029500858625397086, "learning_rate": 7.777999999999999e-06, "loss": 0.0233, "num_tokens": 25739002.0, "reward": 2.866637706756592, "reward_std": 0.37486761808395386, "rewards/reward_fn/mean": 2.866637706756592, "rewards/reward_fn/std": 0.37486761808395386, "step": 556 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 974.0, "completions/max_terminated_length": 974.0, "completions/mean_length": 222.09375, "completions/mean_terminated_length": 222.09375, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 0.05908560517662034, "frac_reward_zero_std": 0.0, "grad_norm": 1.6640625, "kl": 0.02057086571585387, "learning_rate": 7.7776e-06, "loss": 0.0235, "num_tokens": 25777245.0, "reward": 2.855576515197754, "reward_std": 0.3794987201690674, "rewards/reward_fn/mean": 2.855576515197754, "rewards/reward_fn/std": 0.37949874997138977, "step": 557 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 472.0, "completions/max_terminated_length": 472.0, "completions/mean_length": 306.65625, "completions/mean_terminated_length": 306.65625, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "epoch": 0.05919168346239525, "frac_reward_zero_std": 0.0, "grad_norm": 1.8984375, "kl": 0.02790513075888157, "learning_rate": 7.777199999999999e-06, "loss": 0.0267, "num_tokens": 25825586.0, "reward": 3.3989100456237793, "reward_std": 0.577396810054779, "rewards/reward_fn/mean": 3.3989100456237793, "rewards/reward_fn/std": 0.5773967504501343, "step": 558 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 869.0, "completions/max_terminated_length": 869.0, "completions/mean_length": 306.375, "completions/mean_terminated_length": 306.375, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "epoch": 0.05929776174817015, "frac_reward_zero_std": 0.0, "grad_norm": 1.96875, "kl": 0.02001808863133192, "learning_rate": 7.7768e-06, "loss": 0.0595, "num_tokens": 25870590.0, "reward": 3.9636728763580322, "reward_std": 0.2054968923330307, "rewards/reward_fn/mean": 3.9636728763580322, "rewards/reward_fn/std": 0.2054968625307083, "step": 559 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 314.0, "completions/max_terminated_length": 314.0, "completions/mean_length": 199.78125, "completions/mean_terminated_length": 199.78125, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 0.059403840033945055, "frac_reward_zero_std": 1.0, "grad_norm": 0.08154296875, "kl": 0.022585346596315503, "learning_rate": 7.776399999999999e-06, "loss": 0.0009, "num_tokens": 25911831.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 560 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1035.0, "completions/max_terminated_length": 1035.0, "completions/mean_length": 234.25, "completions/mean_terminated_length": 234.25, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 0.059509918319719954, "frac_reward_zero_std": 1.0, "grad_norm": 0.1220703125, "kl": 0.029487166553735733, "learning_rate": 7.776e-06, "loss": 0.0012, "num_tokens": 25958847.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 561 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1035.0, "completions/max_terminated_length": 1035.0, "completions/mean_length": 212.40625, "completions/mean_terminated_length": 212.40625, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "epoch": 0.05961599660549485, "frac_reward_zero_std": 1.0, "grad_norm": 0.10791015625, "kl": 0.028197331819683313, "learning_rate": 7.775599999999999e-06, "loss": 0.0011, "num_tokens": 26001964.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 562 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1773.0, "completions/max_terminated_length": 1773.0, "completions/mean_length": 312.15625, "completions/mean_terminated_length": 312.15625, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 0.05972207489126976, "frac_reward_zero_std": 0.0, "grad_norm": 1.7578125, "kl": 0.030213134363293648, "learning_rate": 7.7752e-06, "loss": -0.1456, "num_tokens": 26043633.0, "reward": 3.2193734645843506, "reward_std": 0.7501698732376099, "rewards/reward_fn/mean": 3.2193734645843506, "rewards/reward_fn/std": 0.7501698136329651, "step": 563 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 474.0, "completions/max_terminated_length": 474.0, "completions/mean_length": 234.03125, "completions/mean_terminated_length": 234.03125, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 0.05982815317704466, "frac_reward_zero_std": 0.0, "grad_norm": 1.890625, "kl": 0.0262491125613451, "learning_rate": 7.774799999999999e-06, "loss": -0.0273, "num_tokens": 26075090.0, "reward": 3.796114921569824, "reward_std": 0.5917590260505676, "rewards/reward_fn/mean": 3.796114921569824, "rewards/reward_fn/std": 0.5917590260505676, "step": 564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 745.0, "completions/max_terminated_length": 745.0, "completions/mean_length": 260.59375, "completions/mean_terminated_length": 260.59375, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 0.05993423146281956, "frac_reward_zero_std": 0.0, "grad_norm": 2.03125, "kl": 0.02379171922802925, "learning_rate": 7.7744e-06, "loss": 0.0414, "num_tokens": 26121669.0, "reward": 3.966832160949707, "reward_std": 0.18762588500976562, "rewards/reward_fn/mean": 3.966832160949707, "rewards/reward_fn/std": 0.18762588500976562, "step": 565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1673.0, "completions/max_terminated_length": 1673.0, "completions/mean_length": 340.25, "completions/mean_terminated_length": 340.25, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.060040309748594464, "frac_reward_zero_std": 0.0, "grad_norm": 2.109375, "kl": 0.023976900847628713, "learning_rate": 7.774e-06, "loss": 0.1362, "num_tokens": 26176941.0, "reward": 3.870932102203369, "reward_std": 0.3512914180755615, "rewards/reward_fn/mean": 3.870932102203369, "rewards/reward_fn/std": 0.3512914478778839, "step": 566 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1298.0, "completions/mean_length": 401.3125, "completions/mean_terminated_length": 348.19354248046875, "completions/min_length": 223.0, "completions/min_terminated_length": 223.0, "epoch": 0.060146388034369364, "frac_reward_zero_std": 0.0, "grad_norm": 1.7734375, "kl": 0.02290424262173474, "learning_rate": 7.7736e-06, "loss": 0.3008, "num_tokens": 26230135.0, "reward": 2.7015156745910645, "reward_std": 0.5482509732246399, "rewards/reward_fn/mean": 2.7015156745910645, "rewards/reward_fn/std": 0.5482509732246399, "step": 567 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1626.0, "completions/max_terminated_length": 1626.0, "completions/mean_length": 386.40625, "completions/mean_terminated_length": 386.40625, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 0.06025246632014427, "frac_reward_zero_std": 0.0, "grad_norm": 1.578125, "kl": 0.022990726167336106, "learning_rate": 7.7732e-06, "loss": 0.0198, "num_tokens": 26281284.0, "reward": 3.185511589050293, "reward_std": 0.48132088780403137, "rewards/reward_fn/mean": 3.185511589050293, "rewards/reward_fn/std": 0.4813208281993866, "step": 568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 771.0, "completions/max_terminated_length": 771.0, "completions/mean_length": 221.375, "completions/mean_terminated_length": 221.375, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 0.06035854460591917, "frac_reward_zero_std": 0.0, "grad_norm": 2.359375, "kl": 0.02692016214132309, "learning_rate": 7.7728e-06, "loss": -0.0111, "num_tokens": 26331024.0, "reward": 3.2124645709991455, "reward_std": 1.0723460912704468, "rewards/reward_fn/mean": 3.2124645709991455, "rewards/reward_fn/std": 1.0723460912704468, "step": 569 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 434.0, "completions/max_terminated_length": 434.0, "completions/mean_length": 291.9375, "completions/mean_terminated_length": 291.9375, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "epoch": 0.06046462289169407, "frac_reward_zero_std": 1.0, "grad_norm": 0.08349609375, "kl": 0.016868799226358533, "learning_rate": 7.7724e-06, "loss": 0.0007, "num_tokens": 26383566.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 570 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1654.0, "completions/max_terminated_length": 1654.0, "completions/mean_length": 541.6875, "completions/mean_terminated_length": 541.6875, "completions/min_length": 291.0, "completions/min_terminated_length": 291.0, "epoch": 0.060570701177468975, "frac_reward_zero_std": 0.0, "grad_norm": 1.375, "kl": 0.021761097013950348, "learning_rate": 7.772e-06, "loss": 0.0355, "num_tokens": 26440228.0, "reward": 2.8173985481262207, "reward_std": 0.04130704700946808, "rewards/reward_fn/mean": 2.8173985481262207, "rewards/reward_fn/std": 0.04130704328417778, "step": 571 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1903.0, "completions/mean_length": 743.0625, "completions/mean_terminated_length": 656.0667114257812, "completions/min_length": 366.0, "completions/min_terminated_length": 366.0, "epoch": 0.060676779463243874, "frac_reward_zero_std": 0.0, "grad_norm": 1.2734375, "kl": 0.02275507105514407, "learning_rate": 7.7716e-06, "loss": 0.2517, "num_tokens": 26500774.0, "reward": 2.5745859146118164, "reward_std": 0.7353212237358093, "rewards/reward_fn/mean": 2.5745859146118164, "rewards/reward_fn/std": 0.7353212237358093, "step": 572 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 731.0, "completions/max_terminated_length": 731.0, "completions/mean_length": 131.0, "completions/mean_terminated_length": 131.0, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "epoch": 0.06078285774901877, "frac_reward_zero_std": 1.0, "grad_norm": 0.1826171875, "kl": 0.0287727911490947, "learning_rate": 7.7712e-06, "loss": 0.0012, "num_tokens": 26539238.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 573 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 804.0, "completions/max_terminated_length": 804.0, "completions/mean_length": 416.3125, "completions/mean_terminated_length": 416.3125, "completions/min_length": 277.0, "completions/min_terminated_length": 277.0, "epoch": 0.06088893603479368, "frac_reward_zero_std": 0.0, "grad_norm": 1.40625, "kl": 0.01945284497924149, "learning_rate": 7.7708e-06, "loss": 0.0562, "num_tokens": 26596496.0, "reward": 2.86415958404541, "reward_std": 0.027712536975741386, "rewards/reward_fn/mean": 2.86415958404541, "rewards/reward_fn/std": 0.027712490409612656, "step": 574 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 824.0, "completions/max_terminated_length": 824.0, "completions/mean_length": 234.21875, "completions/mean_terminated_length": 234.21875, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.06099501432056858, "frac_reward_zero_std": 1.0, "grad_norm": 0.09765625, "kl": 0.02484210953116417, "learning_rate": 7.7704e-06, "loss": 0.001, "num_tokens": 26638839.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 204.0, "completions/max_terminated_length": 204.0, "completions/mean_length": 148.0625, "completions/mean_terminated_length": 148.0625, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.06110109260634348, "frac_reward_zero_std": 1.0, "grad_norm": 0.1064453125, "kl": 0.02255557058379054, "learning_rate": 7.769999999999998e-06, "loss": 0.0009, "num_tokens": 26686265.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 576 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1047.0, "completions/max_terminated_length": 1047.0, "completions/mean_length": 245.84375, "completions/mean_terminated_length": 245.84375, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "epoch": 0.061207170892118384, "frac_reward_zero_std": 0.0, "grad_norm": 2.03125, "kl": 0.02338914154097438, "learning_rate": 7.7696e-06, "loss": 0.0526, "num_tokens": 26739572.0, "reward": 3.0910239219665527, "reward_std": 0.6024491190910339, "rewards/reward_fn/mean": 3.0910239219665527, "rewards/reward_fn/std": 0.6024490594863892, "step": 577 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 850.0, "completions/max_terminated_length": 850.0, "completions/mean_length": 286.59375, "completions/mean_terminated_length": 286.59375, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 0.061313249177893284, "frac_reward_zero_std": 0.0, "grad_norm": 1.703125, "kl": 0.024446885101497173, "learning_rate": 7.7692e-06, "loss": 0.0939, "num_tokens": 26780743.0, "reward": 3.1179909706115723, "reward_std": 0.3886381685733795, "rewards/reward_fn/mean": 3.1179909706115723, "rewards/reward_fn/std": 0.38863810896873474, "step": 578 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 625.0, "completions/max_terminated_length": 625.0, "completions/mean_length": 163.4375, "completions/mean_terminated_length": 163.4375, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "epoch": 0.06141932746366819, "frac_reward_zero_std": 0.0, "grad_norm": 2.546875, "kl": 0.03373258630745113, "learning_rate": 7.7688e-06, "loss": 0.0155, "num_tokens": 26818709.0, "reward": 3.9367802143096924, "reward_std": 0.24996694922447205, "rewards/reward_fn/mean": 3.9367802143096924, "rewards/reward_fn/std": 0.24996691942214966, "step": 579 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 457.0, "completions/max_terminated_length": 457.0, "completions/mean_length": 298.34375, "completions/mean_terminated_length": 298.34375, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "epoch": 0.06152540574944309, "frac_reward_zero_std": 0.0, "grad_norm": 1.609375, "kl": 0.025001312140375376, "learning_rate": 7.7684e-06, "loss": 0.0323, "num_tokens": 26876544.0, "reward": 2.8969321250915527, "reward_std": 0.41512611508369446, "rewards/reward_fn/mean": 2.8969321250915527, "rewards/reward_fn/std": 0.41512614488601685, "step": 580 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 372.0, "completions/max_terminated_length": 372.0, "completions/mean_length": 207.96875, "completions/mean_terminated_length": 207.96875, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "epoch": 0.06163148403521799, "frac_reward_zero_std": 0.0, "grad_norm": 2.0, "kl": 0.02601949847303331, "learning_rate": 7.767999999999999e-06, "loss": 0.0264, "num_tokens": 26914719.0, "reward": 3.91837477684021, "reward_std": 0.3212246596813202, "rewards/reward_fn/mean": 3.91837477684021, "rewards/reward_fn/std": 0.3212246298789978, "step": 581 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 869.0, "completions/max_terminated_length": 869.0, "completions/mean_length": 251.75, "completions/mean_terminated_length": 251.75, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 0.061737562320992895, "frac_reward_zero_std": 0.0, "grad_norm": 1.8046875, "kl": 0.02899379818700254, "learning_rate": 7.7676e-06, "loss": 0.0021, "num_tokens": 26951543.0, "reward": 1.9556413888931274, "reward_std": 0.4255604147911072, "rewards/reward_fn/mean": 1.9556413888931274, "rewards/reward_fn/std": 0.4255603849887848, "step": 582 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 427.0, "completions/max_terminated_length": 427.0, "completions/mean_length": 161.90625, "completions/mean_terminated_length": 161.90625, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "epoch": 0.061843640606767794, "frac_reward_zero_std": 1.0, "grad_norm": 0.1357421875, "kl": 0.028046605177223682, "learning_rate": 7.767199999999999e-06, "loss": 0.0011, "num_tokens": 26986676.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 583 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1094.0, "completions/max_terminated_length": 1094.0, "completions/mean_length": 329.0625, "completions/mean_terminated_length": 329.0625, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 0.06194971889254269, "frac_reward_zero_std": 0.0, "grad_norm": 1.6875, "kl": 0.02596241678111255, "learning_rate": 7.7668e-06, "loss": 0.0319, "num_tokens": 27038422.0, "reward": 2.7040181159973145, "reward_std": 0.4574950635433197, "rewards/reward_fn/mean": 2.7040181159973145, "rewards/reward_fn/std": 0.4574950933456421, "step": 584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1041.0, "completions/max_terminated_length": 1041.0, "completions/mean_length": 310.34375, "completions/mean_terminated_length": 310.34375, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.0620557971783176, "frac_reward_zero_std": 0.0, "grad_norm": 1.71875, "kl": 0.02197631192393601, "learning_rate": 7.766399999999999e-06, "loss": -0.0528, "num_tokens": 27092321.0, "reward": 3.9704360961914062, "reward_std": 0.16723935306072235, "rewards/reward_fn/mean": 3.9704360961914062, "rewards/reward_fn/std": 0.16723932325839996, "step": 585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1077.0, "completions/max_terminated_length": 1077.0, "completions/mean_length": 256.3125, "completions/mean_terminated_length": 256.3125, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 0.0621618754640925, "frac_reward_zero_std": 0.0, "grad_norm": 2.015625, "kl": 0.025298673193901777, "learning_rate": 7.766e-06, "loss": 0.034, "num_tokens": 27120971.0, "reward": 3.0783987045288086, "reward_std": 0.49813732504844666, "rewards/reward_fn/mean": 3.0783987045288086, "rewards/reward_fn/std": 0.49813729524612427, "step": 586 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1114.0, "completions/max_terminated_length": 1114.0, "completions/mean_length": 472.0625, "completions/mean_terminated_length": 472.0625, "completions/min_length": 228.0, "completions/min_terminated_length": 228.0, "epoch": 0.062267953749867405, "frac_reward_zero_std": 0.0, "grad_norm": 1.09375, "kl": 0.022866470273584127, "learning_rate": 7.765599999999999e-06, "loss": -0.0309, "num_tokens": 27166509.0, "reward": 2.5862374305725098, "reward_std": 0.24781370162963867, "rewards/reward_fn/mean": 2.5862374305725098, "rewards/reward_fn/std": 0.24781371653079987, "step": 587 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1359.0, "completions/max_terminated_length": 1359.0, "completions/mean_length": 334.09375, "completions/mean_terminated_length": 334.09375, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.062374032035642304, "frac_reward_zero_std": 0.0, "grad_norm": 1.65625, "kl": 0.024537658086046576, "learning_rate": 7.7652e-06, "loss": 0.0017, "num_tokens": 27215664.0, "reward": 3.96297287940979, "reward_std": 0.2094566822052002, "rewards/reward_fn/mean": 3.96297287940979, "rewards/reward_fn/std": 0.2094566524028778, "step": 588 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 840.0, "completions/max_terminated_length": 840.0, "completions/mean_length": 236.375, "completions/mean_terminated_length": 236.375, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 0.062480110321417204, "frac_reward_zero_std": 0.0, "grad_norm": 1.921875, "kl": 0.05280748289078474, "learning_rate": 7.7648e-06, "loss": 0.0752, "num_tokens": 27262844.0, "reward": 3.666501998901367, "reward_std": 0.5441361665725708, "rewards/reward_fn/mean": 3.666501998901367, "rewards/reward_fn/std": 0.544136106967926, "step": 589 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 495.0, "completions/max_terminated_length": 495.0, "completions/mean_length": 159.875, "completions/mean_terminated_length": 159.875, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.0625861886071921, "frac_reward_zero_std": 1.0, "grad_norm": 0.12255859375, "kl": 0.029549932572990656, "learning_rate": 7.7644e-06, "loss": 0.0012, "num_tokens": 27303352.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 590 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 570.0, "completions/max_terminated_length": 570.0, "completions/mean_length": 242.8125, "completions/mean_terminated_length": 242.8125, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "epoch": 0.06269226689296702, "frac_reward_zero_std": 0.0, "grad_norm": 2.03125, "kl": 0.026474663987755775, "learning_rate": 7.764e-06, "loss": 0.0806, "num_tokens": 27345042.0, "reward": 3.7683963775634766, "reward_std": 0.4177395701408386, "rewards/reward_fn/mean": 3.7683963775634766, "rewards/reward_fn/std": 0.417739599943161, "step": 591 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 273.0, "completions/max_terminated_length": 273.0, "completions/mean_length": 192.46875, "completions/mean_terminated_length": 192.46875, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 0.06279834517874192, "frac_reward_zero_std": 1.0, "grad_norm": 0.11328125, "kl": 0.02718471735715866, "learning_rate": 7.7636e-06, "loss": 0.0011, "num_tokens": 27389153.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 592 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 829.0, "completions/max_terminated_length": 829.0, "completions/mean_length": 217.40625, "completions/mean_terminated_length": 217.40625, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 0.06290442346451681, "frac_reward_zero_std": 0.0, "grad_norm": 1.828125, "kl": 0.02357006026431918, "learning_rate": 7.7632e-06, "loss": 0.0768, "num_tokens": 27444366.0, "reward": 3.6223254203796387, "reward_std": 0.6039735674858093, "rewards/reward_fn/mean": 3.6223254203796387, "rewards/reward_fn/std": 0.6039735674858093, "step": 593 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 324.0, "completions/max_terminated_length": 324.0, "completions/mean_length": 224.1875, "completions/mean_terminated_length": 224.1875, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 0.06301050175029171, "frac_reward_zero_std": 1.0, "grad_norm": 0.1640625, "kl": 0.032485876930877566, "learning_rate": 7.7628e-06, "loss": 0.0013, "num_tokens": 27489652.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1325.0, "completions/mean_length": 370.84375, "completions/mean_terminated_length": 316.7419128417969, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 0.06311658003606661, "frac_reward_zero_std": 0.0, "grad_norm": 2.015625, "kl": 0.021101591642946005, "learning_rate": 7.7624e-06, "loss": 0.3438, "num_tokens": 27522351.0, "reward": 3.6290016174316406, "reward_std": 0.8651793599128723, "rewards/reward_fn/mean": 3.6290016174316406, "rewards/reward_fn/std": 0.8651794195175171, "step": 595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1481.0, "completions/max_terminated_length": 1481.0, "completions/mean_length": 461.3125, "completions/mean_terminated_length": 461.3125, "completions/min_length": 260.0, "completions/min_terminated_length": 260.0, "epoch": 0.06322265832184151, "frac_reward_zero_std": 0.0, "grad_norm": 1.5, "kl": 0.020340920658782125, "learning_rate": 7.762e-06, "loss": 0.1006, "num_tokens": 27556665.0, "reward": 2.6203527450561523, "reward_std": 0.41824400424957275, "rewards/reward_fn/mean": 2.6203527450561523, "rewards/reward_fn/std": 0.41824400424957275, "step": 596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1311.0, "completions/max_terminated_length": 1311.0, "completions/mean_length": 362.3125, "completions/mean_terminated_length": 362.3125, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "epoch": 0.06332873660761643, "frac_reward_zero_std": 0.0, "grad_norm": 1.6328125, "kl": 0.031615799525752664, "learning_rate": 7.761599999999999e-06, "loss": 0.1534, "num_tokens": 27603619.0, "reward": 3.4667482376098633, "reward_std": 0.7769173979759216, "rewards/reward_fn/mean": 3.4667482376098633, "rewards/reward_fn/std": 0.7769173979759216, "step": 597 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1047.0, "completions/max_terminated_length": 1047.0, "completions/mean_length": 298.5, "completions/mean_terminated_length": 298.5, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "epoch": 0.06343481489339133, "frac_reward_zero_std": 1.0, "grad_norm": 0.0595703125, "kl": 0.01625680283177644, "learning_rate": 7.7612e-06, "loss": 0.0007, "num_tokens": 27658291.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 598 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1009.0, "completions/max_terminated_length": 1009.0, "completions/mean_length": 368.125, "completions/mean_terminated_length": 368.125, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "epoch": 0.06354089317916622, "frac_reward_zero_std": 0.0, "grad_norm": 1.4921875, "kl": 0.022277246927842498, "learning_rate": 7.760799999999999e-06, "loss": 0.1759, "num_tokens": 27705591.0, "reward": 2.9454853534698486, "reward_std": 1.09701406955719, "rewards/reward_fn/mean": 2.9454853534698486, "rewards/reward_fn/std": 1.0970139503479004, "step": 599 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1161.0, "completions/max_terminated_length": 1161.0, "completions/mean_length": 572.25, "completions/mean_terminated_length": 572.25, "completions/min_length": 327.0, "completions/min_terminated_length": 327.0, "epoch": 0.06364697146494112, "frac_reward_zero_std": 0.0, "grad_norm": 1.09375, "kl": 0.01797162415459752, "learning_rate": 7.7604e-06, "loss": 0.0005, "num_tokens": 27773599.0, "reward": 2.691895008087158, "reward_std": 0.3327391445636749, "rewards/reward_fn/mean": 2.691895008087158, "rewards/reward_fn/std": 0.33273911476135254, "step": 600 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1965.0, "completions/max_terminated_length": 1965.0, "completions/mean_length": 387.9375, "completions/mean_terminated_length": 387.9375, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "epoch": 0.06375304975071602, "frac_reward_zero_std": 0.0, "grad_norm": 1.40625, "kl": 0.01835179328918457, "learning_rate": 7.76e-06, "loss": -0.1032, "num_tokens": 27830749.0, "reward": 2.8080360889434814, "reward_std": 0.028673529624938965, "rewards/reward_fn/mean": 2.8080360889434814, "rewards/reward_fn/std": 0.02867353893816471, "step": 601 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1102.0, "completions/max_terminated_length": 1102.0, "completions/mean_length": 308.03125, "completions/mean_terminated_length": 308.03125, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "epoch": 0.06385912803649094, "frac_reward_zero_std": 1.0, "grad_norm": 0.07177734375, "kl": 0.020648242440074682, "learning_rate": 7.7596e-06, "loss": 0.0008, "num_tokens": 27880414.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 602 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 2048.0, "completions/max_terminated_length": 917.0, "completions/mean_length": 640.15625, "completions/mean_terminated_length": 546.300048828125, "completions/min_length": 284.0, "completions/min_terminated_length": 284.0, "epoch": 0.06396520632226584, "frac_reward_zero_std": 0.0, "grad_norm": 1.2890625, "kl": 0.023265723371878266, "learning_rate": 7.7592e-06, "loss": 0.0666, "num_tokens": 27935683.0, "reward": 2.6192939281463623, "reward_std": 0.8203719258308411, "rewards/reward_fn/mean": 2.6192939281463623, "rewards/reward_fn/std": 0.8203719258308411, "step": 603 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 597.0, "completions/max_terminated_length": 597.0, "completions/mean_length": 339.875, "completions/mean_terminated_length": 339.875, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 0.06407128460804073, "frac_reward_zero_std": 0.0, "grad_norm": 1.6640625, "kl": 0.027694092597812414, "learning_rate": 7.7588e-06, "loss": -0.0216, "num_tokens": 27960351.0, "reward": 3.7123684883117676, "reward_std": 0.5121208429336548, "rewards/reward_fn/mean": 3.7123684883117676, "rewards/reward_fn/std": 0.5121208429336548, "step": 604 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 803.0, "completions/max_terminated_length": 803.0, "completions/mean_length": 222.34375, "completions/mean_terminated_length": 222.34375, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.06417736289381563, "frac_reward_zero_std": 0.0, "grad_norm": 2.28125, "kl": 0.02771115326322615, "learning_rate": 7.7584e-06, "loss": -0.0306, "num_tokens": 28002954.0, "reward": 3.9631314277648926, "reward_std": 0.2085607498884201, "rewards/reward_fn/mean": 3.9631314277648926, "rewards/reward_fn/std": 0.2085607349872589, "step": 605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1230.0, "completions/max_terminated_length": 1230.0, "completions/mean_length": 297.0, "completions/mean_terminated_length": 297.0, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 0.06428344117959053, "frac_reward_zero_std": 0.0, "grad_norm": 1.796875, "kl": 0.02708641323260963, "learning_rate": 7.758e-06, "loss": 0.0956, "num_tokens": 28045898.0, "reward": 2.7624735832214355, "reward_std": 0.34915468096733093, "rewards/reward_fn/mean": 2.7624735832214355, "rewards/reward_fn/std": 0.34915465116500854, "step": 606 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 966.0, "completions/max_terminated_length": 966.0, "completions/mean_length": 344.78125, "completions/mean_terminated_length": 344.78125, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "epoch": 0.06438951946536543, "frac_reward_zero_std": 0.0, "grad_norm": 1.5234375, "kl": 0.01575366989709437, "learning_rate": 7.7576e-06, "loss": -0.054, "num_tokens": 28096035.0, "reward": 3.480724811553955, "reward_std": 0.637944221496582, "rewards/reward_fn/mean": 3.480724811553955, "rewards/reward_fn/std": 0.6379441618919373, "step": 607 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1160.0, "completions/max_terminated_length": 1160.0, "completions/mean_length": 303.4375, "completions/mean_terminated_length": 303.4375, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.06449559775114035, "frac_reward_zero_std": 0.0, "grad_norm": 2.046875, "kl": 0.019724218058399856, "learning_rate": 7.7572e-06, "loss": 0.1051, "num_tokens": 28140113.0, "reward": 3.8815999031066895, "reward_std": 0.32930222153663635, "rewards/reward_fn/mean": 3.8815999031066895, "rewards/reward_fn/std": 0.32930222153663635, "step": 608 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1501.0, "completions/max_terminated_length": 1501.0, "completions/mean_length": 279.59375, "completions/mean_terminated_length": 279.59375, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "epoch": 0.06460167603691525, "frac_reward_zero_std": 0.0, "grad_norm": 1.796875, "kl": 0.023657136596739292, "learning_rate": 7.7568e-06, "loss": 0.0655, "num_tokens": 28179076.0, "reward": 3.6588215827941895, "reward_std": 0.5550169944763184, "rewards/reward_fn/mean": 3.6588215827941895, "rewards/reward_fn/std": 0.5550169944763184, "step": 609 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1025.0, "completions/max_terminated_length": 1025.0, "completions/mean_length": 315.9375, "completions/mean_terminated_length": 315.9375, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "epoch": 0.06470775432269014, "frac_reward_zero_std": 0.0, "grad_norm": 1.5234375, "kl": 0.020349433412775397, "learning_rate": 7.756399999999999e-06, "loss": 0.0056, "num_tokens": 28236546.0, "reward": 1.650451898574829, "reward_std": 0.03633672744035721, "rewards/reward_fn/mean": 1.650451898574829, "rewards/reward_fn/std": 0.0363367535173893, "step": 610 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 707.0, "completions/max_terminated_length": 707.0, "completions/mean_length": 203.28125, "completions/mean_terminated_length": 203.28125, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 0.06481383260846504, "frac_reward_zero_std": 1.0, "grad_norm": 0.08447265625, "kl": 0.01983096171170473, "learning_rate": 7.756e-06, "loss": 0.0008, "num_tokens": 28277451.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 611 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1197.0, "completions/max_terminated_length": 1197.0, "completions/mean_length": 290.1875, "completions/mean_terminated_length": 290.1875, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.06491991089423994, "frac_reward_zero_std": 0.0, "grad_norm": 1.8125, "kl": 0.023188997758552432, "learning_rate": 7.7556e-06, "loss": -0.0779, "num_tokens": 28319761.0, "reward": 3.2103209495544434, "reward_std": 0.23490554094314575, "rewards/reward_fn/mean": 3.2103209495544434, "rewards/reward_fn/std": 0.23490552604198456, "step": 612 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1524.0, "completions/max_terminated_length": 1524.0, "completions/mean_length": 269.1875, "completions/mean_terminated_length": 269.1875, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 0.06502598918001486, "frac_reward_zero_std": 0.0, "grad_norm": 0.8046875, "kl": 0.023811078863218427, "learning_rate": 7.7552e-06, "loss": -0.0893, "num_tokens": 28367543.0, "reward": 3.9686954021453857, "reward_std": 0.17708587646484375, "rewards/reward_fn/mean": 3.9686954021453857, "rewards/reward_fn/std": 0.17708587646484375, "step": 613 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 315.0, "completions/max_terminated_length": 315.0, "completions/mean_length": 220.09375, "completions/mean_terminated_length": 220.09375, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 0.06513206746578976, "frac_reward_zero_std": 0.0, "grad_norm": 2.046875, "kl": 0.024617396760731936, "learning_rate": 7.7548e-06, "loss": 0.0609, "num_tokens": 28420698.0, "reward": 3.9674062728881836, "reward_std": 0.1843782216310501, "rewards/reward_fn/mean": 3.9674062728881836, "rewards/reward_fn/std": 0.1843782216310501, "step": 614 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1238.0, "completions/max_terminated_length": 1238.0, "completions/mean_length": 224.09375, "completions/mean_terminated_length": 224.09375, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.06523814575156466, "frac_reward_zero_std": 1.0, "grad_norm": 0.09326171875, "kl": 0.0253062816336751, "learning_rate": 7.7544e-06, "loss": 0.001, "num_tokens": 28457085.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 615 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 414.0, "completions/max_terminated_length": 414.0, "completions/mean_length": 217.34375, "completions/mean_terminated_length": 217.34375, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.06534422403733955, "frac_reward_zero_std": 0.0, "grad_norm": 1.7265625, "kl": 0.02408299339003861, "learning_rate": 7.753999999999999e-06, "loss": 0.0, "num_tokens": 28502504.0, "reward": 3.666853666305542, "reward_std": 0.7510157823562622, "rewards/reward_fn/mean": 3.666853666305542, "rewards/reward_fn/std": 0.7510157823562622, "step": 616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 788.0, "completions/max_terminated_length": 788.0, "completions/mean_length": 194.34375, "completions/mean_terminated_length": 194.34375, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 0.06545030232311445, "frac_reward_zero_std": 1.0, "grad_norm": 0.10498046875, "kl": 0.01842890668194741, "learning_rate": 7.7536e-06, "loss": 0.0007, "num_tokens": 28560915.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 617 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1946.0, "completions/mean_length": 1078.65625, "completions/mean_terminated_length": 978.3793334960938, "completions/min_length": 339.0, "completions/min_terminated_length": 339.0, "epoch": 0.06555638060888937, "frac_reward_zero_std": 0.0, "grad_norm": 1.0, "kl": 0.013798431027680635, "learning_rate": 7.753199999999999e-06, "loss": 0.1245, "num_tokens": 28631080.0, "reward": 2.1499953269958496, "reward_std": 0.8221304416656494, "rewards/reward_fn/mean": 2.1499953269958496, "rewards/reward_fn/std": 0.8221304416656494, "step": 618 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 991.0, "completions/max_terminated_length": 991.0, "completions/mean_length": 279.21875, "completions/mean_terminated_length": 279.21875, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 0.06566245889466427, "frac_reward_zero_std": 0.0, "grad_norm": 2.78125, "kl": 0.03788356087170541, "learning_rate": 7.7528e-06, "loss": 0.1088, "num_tokens": 28659407.0, "reward": 2.988480567932129, "reward_std": 0.23437552154064178, "rewards/reward_fn/mean": 2.988480567932129, "rewards/reward_fn/std": 0.23437556624412537, "step": 619 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 940.0, "completions/max_terminated_length": 940.0, "completions/mean_length": 268.71875, "completions/mean_terminated_length": 268.71875, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 0.06576853718043917, "frac_reward_zero_std": 0.0, "grad_norm": 2.078125, "kl": 0.022709642769768834, "learning_rate": 7.752399999999999e-06, "loss": 0.1208, "num_tokens": 28697734.0, "reward": 3.431100606918335, "reward_std": 0.6189658045768738, "rewards/reward_fn/mean": 3.431100606918335, "rewards/reward_fn/std": 0.6189658641815186, "step": 620 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1274.0, "completions/max_terminated_length": 1274.0, "completions/mean_length": 283.0625, "completions/mean_terminated_length": 283.0625, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 0.06587461546621406, "frac_reward_zero_std": 0.0, "grad_norm": 1.65625, "kl": 0.02149501978419721, "learning_rate": 7.752e-06, "loss": 0.0237, "num_tokens": 28752904.0, "reward": 1.8805391788482666, "reward_std": 0.6935924291610718, "rewards/reward_fn/mean": 1.8805391788482666, "rewards/reward_fn/std": 0.6935924291610718, "step": 621 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 765.0, "completions/max_terminated_length": 765.0, "completions/mean_length": 383.125, "completions/mean_terminated_length": 383.125, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "epoch": 0.06598069375198896, "frac_reward_zero_std": 0.0, "grad_norm": 1.1640625, "kl": 0.01584722870029509, "learning_rate": 7.751599999999999e-06, "loss": -0.006, "num_tokens": 28805388.0, "reward": 2.8024349212646484, "reward_std": 0.36229246854782104, "rewards/reward_fn/mean": 2.8024349212646484, "rewards/reward_fn/std": 0.36229249835014343, "step": 622 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1102.0, "completions/max_terminated_length": 1102.0, "completions/mean_length": 410.21875, "completions/mean_terminated_length": 410.21875, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "epoch": 0.06608677203776386, "frac_reward_zero_std": 0.0, "grad_norm": 1.046875, "kl": 0.020291926339268684, "learning_rate": 7.7512e-06, "loss": -0.0774, "num_tokens": 28855475.0, "reward": 3.961885690689087, "reward_std": 0.2156069576740265, "rewards/reward_fn/mean": 3.961885690689087, "rewards/reward_fn/std": 0.2156069576740265, "step": 623 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 915.0, "completions/max_terminated_length": 915.0, "completions/mean_length": 223.03125, "completions/mean_terminated_length": 223.03125, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "epoch": 0.06619285032353878, "frac_reward_zero_std": 0.0, "grad_norm": 2.40625, "kl": 0.024670890532433987, "learning_rate": 7.7508e-06, "loss": 0.1838, "num_tokens": 28909012.0, "reward": 3.6656932830810547, "reward_std": 0.5034979581832886, "rewards/reward_fn/mean": 3.6656932830810547, "rewards/reward_fn/std": 0.5034979581832886, "step": 624 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1172.0, "completions/max_terminated_length": 1172.0, "completions/mean_length": 375.15625, "completions/mean_terminated_length": 375.15625, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "epoch": 0.06629892860931368, "frac_reward_zero_std": 0.0, "grad_norm": 1.3125, "kl": 0.02088888338766992, "learning_rate": 7.7504e-06, "loss": -0.0578, "num_tokens": 28951257.0, "reward": 2.5856075286865234, "reward_std": 0.31954845786094666, "rewards/reward_fn/mean": 2.5856075286865234, "rewards/reward_fn/std": 0.31954848766326904, "step": 625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 546.0, "completions/max_terminated_length": 546.0, "completions/mean_length": 265.90625, "completions/mean_terminated_length": 265.90625, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 0.06640500689508858, "frac_reward_zero_std": 0.0, "grad_norm": 1.5859375, "kl": 0.01428930729161948, "learning_rate": 7.75e-06, "loss": 0.0585, "num_tokens": 28981846.0, "reward": 3.45589542388916, "reward_std": 0.6938397288322449, "rewards/reward_fn/mean": 3.45589542388916, "rewards/reward_fn/std": 0.6938397288322449, "step": 626 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 955.0, "completions/max_terminated_length": 955.0, "completions/mean_length": 257.3125, "completions/mean_terminated_length": 257.3125, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.06651108518086347, "frac_reward_zero_std": 1.0, "grad_norm": 0.06201171875, "kl": 0.016050009289756417, "learning_rate": 7.7496e-06, "loss": 0.0006, "num_tokens": 29035904.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 627 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 414.0, "completions/max_terminated_length": 414.0, "completions/mean_length": 115.21875, "completions/mean_terminated_length": 115.21875, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "epoch": 0.06661716346663837, "frac_reward_zero_std": 0.0, "grad_norm": 2.75, "kl": 0.02869252348318696, "learning_rate": 7.7492e-06, "loss": -0.0297, "num_tokens": 29077415.0, "reward": 3.737992763519287, "reward_std": 0.5032368302345276, "rewards/reward_fn/mean": 3.737992763519287, "rewards/reward_fn/std": 0.5032367706298828, "step": 628 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 636.0, "completions/max_terminated_length": 636.0, "completions/mean_length": 266.6875, "completions/mean_terminated_length": 266.6875, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.06672324175241329, "frac_reward_zero_std": 0.0, "grad_norm": 1.75, "kl": 0.01821572007611394, "learning_rate": 7.7488e-06, "loss": 0.0338, "num_tokens": 29125021.0, "reward": 2.8474972248077393, "reward_std": 0.2903161644935608, "rewards/reward_fn/mean": 2.8474972248077393, "rewards/reward_fn/std": 0.2903161942958832, "step": 629 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1196.0, "completions/max_terminated_length": 1196.0, "completions/mean_length": 201.375, "completions/mean_terminated_length": 201.375, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "epoch": 0.06682932003818819, "frac_reward_zero_std": 1.0, "grad_norm": 0.07861328125, "kl": 0.02066903980448842, "learning_rate": 7.7484e-06, "loss": 0.0008, "num_tokens": 29171081.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 630 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 223.0, "completions/max_terminated_length": 223.0, "completions/mean_length": 157.15625, "completions/mean_terminated_length": 157.15625, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.06693539832396309, "frac_reward_zero_std": 1.0, "grad_norm": 0.0986328125, "kl": 0.018408002564683557, "learning_rate": 7.748e-06, "loss": 0.0007, "num_tokens": 29208398.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 631 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1817.0, "completions/max_terminated_length": 1817.0, "completions/mean_length": 382.125, "completions/mean_terminated_length": 382.125, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "epoch": 0.06704147660973798, "frac_reward_zero_std": 0.0, "grad_norm": 1.5078125, "kl": 0.016226678737439215, "learning_rate": 7.7476e-06, "loss": 0.1208, "num_tokens": 29263634.0, "reward": 2.820263385772705, "reward_std": 0.028630422428250313, "rewards/reward_fn/mean": 2.820263385772705, "rewards/reward_fn/std": 0.02863038145005703, "step": 632 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1256.0, "completions/max_terminated_length": 1256.0, "completions/mean_length": 324.71875, "completions/mean_terminated_length": 324.71875, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "epoch": 0.06714755489551288, "frac_reward_zero_std": 0.0, "grad_norm": 1.7421875, "kl": 0.02659795875661075, "learning_rate": 7.7472e-06, "loss": -0.0069, "num_tokens": 29305705.0, "reward": 3.5446953773498535, "reward_std": 0.7270760536193848, "rewards/reward_fn/mean": 3.5446953773498535, "rewards/reward_fn/std": 0.7270760536193848, "step": 633 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 270.0, "completions/max_terminated_length": 270.0, "completions/mean_length": 107.5625, "completions/mean_terminated_length": 107.5625, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "epoch": 0.06725363318128778, "frac_reward_zero_std": 0.0, "grad_norm": 3.234375, "kl": 0.026664254954084754, "learning_rate": 7.7468e-06, "loss": 0.0563, "num_tokens": 29356571.0, "reward": 2.822035551071167, "reward_std": 0.03083919733762741, "rewards/reward_fn/mean": 2.822035551071167, "rewards/reward_fn/std": 0.030839232727885246, "step": 634 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 431.0, "completions/max_terminated_length": 431.0, "completions/mean_length": 171.3125, "completions/mean_terminated_length": 171.3125, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.0673597114670627, "frac_reward_zero_std": 1.0, "grad_norm": 0.0927734375, "kl": 0.02076897514052689, "learning_rate": 7.7464e-06, "loss": 0.0008, "num_tokens": 29417093.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 635 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1839.0, "completions/max_terminated_length": 1839.0, "completions/mean_length": 315.9375, "completions/mean_terminated_length": 315.9375, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 0.0674657897528376, "frac_reward_zero_std": 0.0, "grad_norm": 1.9140625, "kl": 0.020625132601708174, "learning_rate": 7.746e-06, "loss": -0.0231, "num_tokens": 29469347.0, "reward": 3.4893736839294434, "reward_std": 0.8306846618652344, "rewards/reward_fn/mean": 3.4893736839294434, "rewards/reward_fn/std": 0.8306846618652344, "step": 636 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1967.0, "completions/max_terminated_length": 1967.0, "completions/mean_length": 430.0625, "completions/mean_terminated_length": 430.0625, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 0.0675718680386125, "frac_reward_zero_std": 0.0, "grad_norm": 1.625, "kl": 0.02193266712129116, "learning_rate": 7.7456e-06, "loss": 0.199, "num_tokens": 29521989.0, "reward": 3.3931772708892822, "reward_std": 0.9524803757667542, "rewards/reward_fn/mean": 3.3931772708892822, "rewards/reward_fn/std": 0.9524803757667542, "step": 637 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 637.0, "completions/max_terminated_length": 637.0, "completions/mean_length": 218.59375, "completions/mean_terminated_length": 218.59375, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "epoch": 0.0676779463243874, "frac_reward_zero_std": 0.0, "grad_norm": 1.7421875, "kl": 0.01927727018482983, "learning_rate": 7.7452e-06, "loss": 0.0266, "num_tokens": 29557464.0, "reward": 3.8538365364074707, "reward_std": 0.3451085090637207, "rewards/reward_fn/mean": 3.8538365364074707, "rewards/reward_fn/std": 0.3451085090637207, "step": 638 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1125.0, "completions/max_terminated_length": 1125.0, "completions/mean_length": 226.625, "completions/mean_terminated_length": 226.625, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "epoch": 0.0677840246101623, "frac_reward_zero_std": 1.0, "grad_norm": 0.09912109375, "kl": 0.023490537889301777, "learning_rate": 7.744799999999999e-06, "loss": 0.0009, "num_tokens": 29601612.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 639 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 420.0, "completions/max_terminated_length": 420.0, "completions/mean_length": 169.15625, "completions/mean_terminated_length": 169.15625, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "epoch": 0.06789010289593721, "frac_reward_zero_std": 1.0, "grad_norm": 0.1044921875, "kl": 0.022257780889049172, "learning_rate": 7.7444e-06, "loss": 0.0009, "num_tokens": 29650641.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 640 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 749.0, "completions/max_terminated_length": 749.0, "completions/mean_length": 170.46875, "completions/mean_terminated_length": 170.46875, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 0.0679961811817121, "frac_reward_zero_std": 1.0, "grad_norm": 0.07666015625, "kl": 0.01819098659325391, "learning_rate": 7.743999999999999e-06, "loss": 0.0007, "num_tokens": 29711392.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 641 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1487.0, "completions/max_terminated_length": 1487.0, "completions/mean_length": 571.28125, "completions/mean_terminated_length": 571.28125, "completions/min_length": 351.0, "completions/min_terminated_length": 351.0, "epoch": 0.068102259467487, "frac_reward_zero_std": 0.0, "grad_norm": 1.25, "kl": 0.014475560979917645, "learning_rate": 7.7436e-06, "loss": 0.0193, "num_tokens": 29767849.0, "reward": 2.6820812225341797, "reward_std": 0.33843210339546204, "rewards/reward_fn/mean": 2.6820812225341797, "rewards/reward_fn/std": 0.3384321331977844, "step": 642 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1318.0, "completions/max_terminated_length": 1318.0, "completions/mean_length": 672.3125, "completions/mean_terminated_length": 672.3125, "completions/min_length": 290.0, "completions/min_terminated_length": 290.0, "epoch": 0.0682083377532619, "frac_reward_zero_std": 0.0, "grad_norm": 1.265625, "kl": 0.017669666092842817, "learning_rate": 7.743199999999999e-06, "loss": -0.0573, "num_tokens": 29826579.0, "reward": 2.787215232849121, "reward_std": 0.32044708728790283, "rewards/reward_fn/mean": 2.787215232849121, "rewards/reward_fn/std": 0.3204471170902252, "step": 643 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 482.0, "completions/max_terminated_length": 482.0, "completions/mean_length": 122.84375, "completions/mean_terminated_length": 122.84375, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "epoch": 0.0683144160390368, "frac_reward_zero_std": 1.0, "grad_norm": 0.1015625, "kl": 0.019987852778285742, "learning_rate": 7.7428e-06, "loss": 0.0008, "num_tokens": 29863278.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 644 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 679.0, "completions/max_terminated_length": 679.0, "completions/mean_length": 273.875, "completions/mean_terminated_length": 273.875, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 0.06842049432481172, "frac_reward_zero_std": 0.0, "grad_norm": 1.8515625, "kl": 0.026723440503701568, "learning_rate": 7.742399999999999e-06, "loss": -0.045, "num_tokens": 29911210.0, "reward": 3.9678614139556885, "reward_std": 0.18180328607559204, "rewards/reward_fn/mean": 3.9678614139556885, "rewards/reward_fn/std": 0.18180328607559204, "step": 645 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 875.0, "completions/max_terminated_length": 875.0, "completions/mean_length": 346.90625, "completions/mean_terminated_length": 346.90625, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.06852657261058662, "frac_reward_zero_std": 0.0, "grad_norm": 1.4140625, "kl": 0.018849076703190804, "learning_rate": 7.742e-06, "loss": -0.017, "num_tokens": 29950791.0, "reward": 3.6496143341064453, "reward_std": 0.6005396842956543, "rewards/reward_fn/mean": 3.6496143341064453, "rewards/reward_fn/std": 0.6005396842956543, "step": 646 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 776.0, "completions/max_terminated_length": 776.0, "completions/mean_length": 181.40625, "completions/mean_terminated_length": 181.40625, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.06863265089636152, "frac_reward_zero_std": 1.0, "grad_norm": 0.09912109375, "kl": 0.01960382249671966, "learning_rate": 7.741599999999999e-06, "loss": 0.0008, "num_tokens": 29989428.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 647 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1647.0, "completions/max_terminated_length": 1647.0, "completions/mean_length": 381.21875, "completions/mean_terminated_length": 381.21875, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.06873872918213642, "frac_reward_zero_std": 0.0, "grad_norm": 1.5390625, "kl": 0.02327621285803616, "learning_rate": 7.7412e-06, "loss": 0.1426, "num_tokens": 30037851.0, "reward": 3.4980902671813965, "reward_std": 0.7049679756164551, "rewards/reward_fn/mean": 3.4980902671813965, "rewards/reward_fn/std": 0.7049679160118103, "step": 648 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 682.0, "completions/max_terminated_length": 682.0, "completions/mean_length": 299.90625, "completions/mean_terminated_length": 299.90625, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.06884480746791131, "frac_reward_zero_std": 0.0, "grad_norm": 1.6953125, "kl": 0.018094201455824077, "learning_rate": 7.7408e-06, "loss": -0.0152, "num_tokens": 30095672.0, "reward": 3.6386489868164062, "reward_std": 0.8532451391220093, "rewards/reward_fn/mean": 3.6386489868164062, "rewards/reward_fn/std": 0.8532451391220093, "step": 649 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 549.0, "completions/max_terminated_length": 549.0, "completions/mean_length": 153.125, "completions/mean_terminated_length": 153.125, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.06895088575368621, "frac_reward_zero_std": 1.0, "grad_norm": 0.11962890625, "kl": 0.023316435981541872, "learning_rate": 7.7404e-06, "loss": 0.0009, "num_tokens": 30131132.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 650 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 545.0, "completions/max_terminated_length": 545.0, "completions/mean_length": 145.53125, "completions/mean_terminated_length": 145.53125, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "epoch": 0.06905696403946113, "frac_reward_zero_std": 0.0, "grad_norm": 3.03125, "kl": 0.026125010568648577, "learning_rate": 7.74e-06, "loss": -0.0946, "num_tokens": 30154061.0, "reward": 3.0254387855529785, "reward_std": 0.07992041856050491, "rewards/reward_fn/mean": 3.0254387855529785, "rewards/reward_fn/std": 0.07992040365934372, "step": 651 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1189.0, "completions/max_terminated_length": 1189.0, "completions/mean_length": 458.0625, "completions/mean_terminated_length": 458.0625, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 0.06916304232523603, "frac_reward_zero_std": 0.0, "grad_norm": 1.40625, "kl": 0.01891616778448224, "learning_rate": 7.7396e-06, "loss": 0.0773, "num_tokens": 30224079.0, "reward": 2.673313617706299, "reward_std": 0.30429506301879883, "rewards/reward_fn/mean": 2.673313617706299, "rewards/reward_fn/std": 0.30429503321647644, "step": 652 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1321.0, "completions/max_terminated_length": 1321.0, "completions/mean_length": 351.46875, "completions/mean_terminated_length": 351.46875, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "epoch": 0.06926912061101093, "frac_reward_zero_std": 0.0, "grad_norm": 1.7109375, "kl": 0.023535766871646047, "learning_rate": 7.7392e-06, "loss": -0.027, "num_tokens": 30265534.0, "reward": 3.135115623474121, "reward_std": 0.47822919487953186, "rewards/reward_fn/mean": 3.135115623474121, "rewards/reward_fn/std": 0.47822922468185425, "step": 653 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1862.0, "completions/mean_length": 461.40625, "completions/mean_terminated_length": 410.2257995605469, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.06937519889678583, "frac_reward_zero_std": 0.0, "grad_norm": 1.578125, "kl": 0.018667538883164525, "learning_rate": 7.7388e-06, "loss": 0.1329, "num_tokens": 30315243.0, "reward": 3.3491392135620117, "reward_std": 1.0377370119094849, "rewards/reward_fn/mean": 3.3491392135620117, "rewards/reward_fn/std": 1.0377370119094849, "step": 654 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1697.0, "completions/max_terminated_length": 1697.0, "completions/mean_length": 685.59375, "completions/mean_terminated_length": 685.59375, "completions/min_length": 289.0, "completions/min_terminated_length": 289.0, "epoch": 0.06948127718256072, "frac_reward_zero_std": 0.0, "grad_norm": 1.125, "kl": 0.01727759197819978, "learning_rate": 7.7384e-06, "loss": 0.0192, "num_tokens": 30373502.0, "reward": 3.753196954727173, "reward_std": 0.5557073950767517, "rewards/reward_fn/mean": 3.753196954727173, "rewards/reward_fn/std": 0.5557073950767517, "step": 655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 603.0, "completions/max_terminated_length": 603.0, "completions/mean_length": 181.21875, "completions/mean_terminated_length": 181.21875, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.06958735546833564, "frac_reward_zero_std": 0.0, "grad_norm": 1.78125, "kl": 0.018722419743426144, "learning_rate": 7.738e-06, "loss": 0.1536, "num_tokens": 30412837.0, "reward": 2.847813606262207, "reward_std": 0.032345082610845566, "rewards/reward_fn/mean": 2.847813606262207, "rewards/reward_fn/std": 0.03234507888555527, "step": 656 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1239.0, "completions/max_terminated_length": 1239.0, "completions/mean_length": 343.125, "completions/mean_terminated_length": 343.125, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "epoch": 0.06969343375411054, "frac_reward_zero_std": 0.0, "grad_norm": 1.03125, "kl": 0.01867408840917051, "learning_rate": 7.737599999999999e-06, "loss": 0.0086, "num_tokens": 30456777.0, "reward": 2.8930726051330566, "reward_std": 0.20554324984550476, "rewards/reward_fn/mean": 2.8930726051330566, "rewards/reward_fn/std": 0.20554324984550476, "step": 657 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1573.0, "completions/max_terminated_length": 1573.0, "completions/mean_length": 438.90625, "completions/mean_terminated_length": 438.90625, "completions/min_length": 253.0, "completions/min_terminated_length": 253.0, "epoch": 0.06979951203988544, "frac_reward_zero_std": 0.0, "grad_norm": 1.328125, "kl": 0.013406037352979183, "learning_rate": 7.7372e-06, "loss": 0.0336, "num_tokens": 30491014.0, "reward": 2.6613709926605225, "reward_std": 0.055673278868198395, "rewards/reward_fn/mean": 2.6613709926605225, "rewards/reward_fn/std": 0.055673304945230484, "step": 658 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1197.0, "completions/max_terminated_length": 1197.0, "completions/mean_length": 352.9375, "completions/mean_terminated_length": 352.9375, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "epoch": 0.06990559032566034, "frac_reward_zero_std": 0.0, "grad_norm": 1.71875, "kl": 0.02186316321603954, "learning_rate": 7.736799999999998e-06, "loss": 0.0747, "num_tokens": 30519428.0, "reward": 3.3577191829681396, "reward_std": 0.6026961803436279, "rewards/reward_fn/mean": 3.3577191829681396, "rewards/reward_fn/std": 0.6026961803436279, "step": 659 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1346.0, "completions/max_terminated_length": 1346.0, "completions/mean_length": 477.875, "completions/mean_terminated_length": 477.875, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "epoch": 0.07001166861143523, "frac_reward_zero_std": 0.0, "grad_norm": 0.953125, "kl": 0.018300026771612465, "learning_rate": 7.7364e-06, "loss": -0.0899, "num_tokens": 30571616.0, "reward": 3.9199166297912598, "reward_std": 0.3151904344558716, "rewards/reward_fn/mean": 3.9199166297912598, "rewards/reward_fn/std": 0.3151904046535492, "step": 660 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1196.0, "completions/max_terminated_length": 1196.0, "completions/mean_length": 323.34375, "completions/mean_terminated_length": 323.34375, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.07011774689721013, "frac_reward_zero_std": 0.0, "grad_norm": 2.15625, "kl": 0.022360553964972496, "learning_rate": 7.736e-06, "loss": -0.0118, "num_tokens": 30612235.0, "reward": 2.710085391998291, "reward_std": 0.1893293261528015, "rewards/reward_fn/mean": 2.710085391998291, "rewards/reward_fn/std": 0.1893293410539627, "step": 661 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 816.0, "completions/max_terminated_length": 816.0, "completions/mean_length": 146.6875, "completions/mean_terminated_length": 146.6875, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "epoch": 0.07022382518298505, "frac_reward_zero_std": 1.0, "grad_norm": 0.10009765625, "kl": 0.021579281659796834, "learning_rate": 7.7356e-06, "loss": 0.0009, "num_tokens": 30645281.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 662 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 427.0, "completions/max_terminated_length": 427.0, "completions/mean_length": 137.09375, "completions/mean_terminated_length": 137.09375, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "epoch": 0.07032990346875995, "frac_reward_zero_std": 0.0, "grad_norm": 3.171875, "kl": 0.02280471404083073, "learning_rate": 7.7352e-06, "loss": 0.0127, "num_tokens": 30693796.0, "reward": 3.1744942665100098, "reward_std": 0.06619588285684586, "rewards/reward_fn/mean": 3.1744942665100098, "rewards/reward_fn/std": 0.06619583815336227, "step": 663 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1954.0, "completions/max_terminated_length": 1954.0, "completions/mean_length": 472.78125, "completions/mean_terminated_length": 472.78125, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "epoch": 0.07043598175453485, "frac_reward_zero_std": 0.0, "grad_norm": 1.4453125, "kl": 0.019758876063860953, "learning_rate": 7.7348e-06, "loss": 0.1493, "num_tokens": 30727741.0, "reward": 3.272916316986084, "reward_std": 0.8355657458305359, "rewards/reward_fn/mean": 3.272916316986084, "rewards/reward_fn/std": 0.8355657458305359, "step": 664 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 586.0, "completions/max_terminated_length": 586.0, "completions/mean_length": 184.9375, "completions/mean_terminated_length": 184.9375, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 0.07054206004030975, "frac_reward_zero_std": 1.0, "grad_norm": 0.099609375, "kl": 0.020734936697408557, "learning_rate": 7.7344e-06, "loss": 0.0008, "num_tokens": 30782395.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 665 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 318.0, "completions/max_terminated_length": 318.0, "completions/mean_length": 227.5625, "completions/mean_terminated_length": 227.5625, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 0.07064813832608464, "frac_reward_zero_std": 1.0, "grad_norm": 0.1640625, "kl": 0.029407049994915724, "learning_rate": 7.733999999999999e-06, "loss": 0.0012, "num_tokens": 30830157.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 666 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1116.0, "completions/max_terminated_length": 1116.0, "completions/mean_length": 342.46875, "completions/mean_terminated_length": 342.46875, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.07075421661185956, "frac_reward_zero_std": 0.0, "grad_norm": 1.703125, "kl": 0.023776059737429023, "learning_rate": 7.7336e-06, "loss": 0.1323, "num_tokens": 30857724.0, "reward": 3.5412867069244385, "reward_std": 0.5026692152023315, "rewards/reward_fn/mean": 3.5412867069244385, "rewards/reward_fn/std": 0.5026691555976868, "step": 667 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1322.0, "completions/max_terminated_length": 1322.0, "completions/mean_length": 213.90625, "completions/mean_terminated_length": 213.90625, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "epoch": 0.07086029489763446, "frac_reward_zero_std": 0.0, "grad_norm": 2.09375, "kl": 0.026352980406954885, "learning_rate": 7.733199999999999e-06, "loss": -0.1159, "num_tokens": 30882425.0, "reward": 3.824970245361328, "reward_std": 0.41358694434165955, "rewards/reward_fn/mean": 3.824970245361328, "rewards/reward_fn/std": 0.41358694434165955, "step": 668 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1241.0, "completions/max_terminated_length": 1241.0, "completions/mean_length": 550.3125, "completions/mean_terminated_length": 550.3125, "completions/min_length": 332.0, "completions/min_terminated_length": 332.0, "epoch": 0.07096637318340936, "frac_reward_zero_std": 0.0, "grad_norm": 0.83203125, "kl": 0.017197473789565265, "learning_rate": 7.7328e-06, "loss": -0.086, "num_tokens": 30943363.0, "reward": 3.852973699569702, "reward_std": 0.4963712990283966, "rewards/reward_fn/mean": 3.852973699569702, "rewards/reward_fn/std": 0.4963712692260742, "step": 669 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 694.0, "completions/max_terminated_length": 694.0, "completions/mean_length": 223.71875, "completions/mean_terminated_length": 223.71875, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 0.07107245146918426, "frac_reward_zero_std": 0.0, "grad_norm": 1.75, "kl": 0.022278335178270936, "learning_rate": 7.732399999999999e-06, "loss": 0.0117, "num_tokens": 30996954.0, "reward": 3.4236133098602295, "reward_std": 0.5188043713569641, "rewards/reward_fn/mean": 3.4236133098602295, "rewards/reward_fn/std": 0.5188043713569641, "step": 670 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1981.0, "completions/mean_length": 473.53125, "completions/mean_terminated_length": 422.7419128417969, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.07117852975495916, "frac_reward_zero_std": 0.0, "grad_norm": 2.03125, "kl": 0.027860619127750397, "learning_rate": 7.732e-06, "loss": 0.4003, "num_tokens": 31042411.0, "reward": 2.8622326850891113, "reward_std": 0.21642757952213287, "rewards/reward_fn/mean": 2.8622326850891113, "rewards/reward_fn/std": 0.21642759442329407, "step": 671 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1963.0, "completions/mean_length": 1063.59375, "completions/mean_terminated_length": 961.7586059570312, "completions/min_length": 436.0, "completions/min_terminated_length": 436.0, "epoch": 0.07128460804073407, "frac_reward_zero_std": 0.0, "grad_norm": 0.8828125, "kl": 0.015642878832295537, "learning_rate": 7.7316e-06, "loss": 0.1365, "num_tokens": 31111838.0, "reward": 2.18656063079834, "reward_std": 0.8373485207557678, "rewards/reward_fn/mean": 2.18656063079834, "rewards/reward_fn/std": 0.8373485207557678, "step": 672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 854.0, "completions/max_terminated_length": 854.0, "completions/mean_length": 370.34375, "completions/mean_terminated_length": 370.34375, "completions/min_length": 225.0, "completions/min_terminated_length": 225.0, "epoch": 0.07139068632650897, "frac_reward_zero_std": 0.0, "grad_norm": 1.453125, "kl": 0.017991895554587245, "learning_rate": 7.7312e-06, "loss": 0.0152, "num_tokens": 31155849.0, "reward": 3.9679219722747803, "reward_std": 0.18146038055419922, "rewards/reward_fn/mean": 3.9679219722747803, "rewards/reward_fn/std": 0.18146035075187683, "step": 673 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 550.0, "completions/max_terminated_length": 550.0, "completions/mean_length": 234.96875, "completions/mean_terminated_length": 234.96875, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "epoch": 0.07149676461228387, "frac_reward_zero_std": 0.0, "grad_norm": 1.953125, "kl": 0.02120826207101345, "learning_rate": 7.7308e-06, "loss": 0.0147, "num_tokens": 31203592.0, "reward": 3.9284887313842773, "reward_std": 0.4045286774635315, "rewards/reward_fn/mean": 3.9284887313842773, "rewards/reward_fn/std": 0.4045286476612091, "step": 674 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1212.0, "completions/max_terminated_length": 1212.0, "completions/mean_length": 323.625, "completions/mean_terminated_length": 323.625, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 0.07160284289805877, "frac_reward_zero_std": 0.0, "grad_norm": 1.6171875, "kl": 0.022802867693826556, "learning_rate": 7.7304e-06, "loss": -0.0868, "num_tokens": 31253724.0, "reward": 3.25797963142395, "reward_std": 0.7901930809020996, "rewards/reward_fn/mean": 3.25797963142395, "rewards/reward_fn/std": 0.7901931405067444, "step": 675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1011.0, "completions/max_terminated_length": 1011.0, "completions/mean_length": 302.8125, "completions/mean_terminated_length": 302.8125, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "epoch": 0.07170892118383367, "frac_reward_zero_std": 0.0, "grad_norm": 1.203125, "kl": 0.014244599267840385, "learning_rate": 7.73e-06, "loss": -0.0201, "num_tokens": 31296598.0, "reward": 2.8737645149230957, "reward_std": 0.03616030886769295, "rewards/reward_fn/mean": 2.8737645149230957, "rewards/reward_fn/std": 0.03616032004356384, "step": 676 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1115.0, "completions/mean_length": 601.28125, "completions/mean_terminated_length": 554.6129150390625, "completions/min_length": 290.0, "completions/min_terminated_length": 290.0, "epoch": 0.07181499946960856, "frac_reward_zero_std": 0.0, "grad_norm": 1.4453125, "kl": 0.01887105731293559, "learning_rate": 7.7296e-06, "loss": 0.1873, "num_tokens": 31352735.0, "reward": 2.5296192169189453, "reward_std": 0.8162726759910583, "rewards/reward_fn/mean": 2.5296192169189453, "rewards/reward_fn/std": 0.8162726163864136, "step": 677 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 971.0, "completions/max_terminated_length": 971.0, "completions/mean_length": 349.75, "completions/mean_terminated_length": 349.75, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "epoch": 0.07192107775538348, "frac_reward_zero_std": 0.0, "grad_norm": 1.6953125, "kl": 0.024308583000674844, "learning_rate": 7.729199999999999e-06, "loss": 0.0147, "num_tokens": 31393367.0, "reward": 3.064675807952881, "reward_std": 1.1492177248001099, "rewards/reward_fn/mean": 3.064675807952881, "rewards/reward_fn/std": 1.1492178440093994, "step": 678 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 509.0, "completions/max_terminated_length": 509.0, "completions/mean_length": 190.21875, "completions/mean_terminated_length": 190.21875, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 0.07202715604115838, "frac_reward_zero_std": 0.0, "grad_norm": 1.59375, "kl": 0.02310982788912952, "learning_rate": 7.7288e-06, "loss": -0.0445, "num_tokens": 31450398.0, "reward": 3.965233325958252, "reward_std": 0.1966707557439804, "rewards/reward_fn/mean": 3.965233325958252, "rewards/reward_fn/std": 0.1966707557439804, "step": 679 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 923.0, "completions/max_terminated_length": 923.0, "completions/mean_length": 331.125, "completions/mean_terminated_length": 331.125, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "epoch": 0.07213323432693328, "frac_reward_zero_std": 0.0, "grad_norm": 1.3359375, "kl": 0.015066134044900537, "learning_rate": 7.728399999999999e-06, "loss": 0.087, "num_tokens": 31503330.0, "reward": 3.8951284885406494, "reward_std": 0.4410572648048401, "rewards/reward_fn/mean": 3.8951284885406494, "rewards/reward_fn/std": 0.4410572946071625, "step": 680 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 931.0, "completions/max_terminated_length": 931.0, "completions/mean_length": 253.625, "completions/mean_terminated_length": 253.625, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.07223931261270818, "frac_reward_zero_std": 0.0, "grad_norm": 1.78125, "kl": 0.022291683591902256, "learning_rate": 7.728e-06, "loss": -0.0688, "num_tokens": 31565494.0, "reward": 3.6002941131591797, "reward_std": 0.5299660563468933, "rewards/reward_fn/mean": 3.6002941131591797, "rewards/reward_fn/std": 0.5299659967422485, "step": 681 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 555.0, "completions/max_terminated_length": 555.0, "completions/mean_length": 215.59375, "completions/mean_terminated_length": 215.59375, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 0.07234539089848308, "frac_reward_zero_std": 1.0, "grad_norm": 0.140625, "kl": 0.02722454722970724, "learning_rate": 7.727599999999999e-06, "loss": 0.0011, "num_tokens": 31621225.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 682 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 654.0, "completions/max_terminated_length": 654.0, "completions/mean_length": 236.9375, "completions/mean_terminated_length": 236.9375, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.07245146918425799, "frac_reward_zero_std": 0.0, "grad_norm": 1.84375, "kl": 0.0214088864158839, "learning_rate": 7.7272e-06, "loss": 0.0853, "num_tokens": 31663399.0, "reward": 3.0963635444641113, "reward_std": 0.06334761530160904, "rewards/reward_fn/mean": 3.0963635444641113, "rewards/reward_fn/std": 0.06334759294986725, "step": 683 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1901.0, "completions/mean_length": 519.125, "completions/mean_terminated_length": 469.8064270019531, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.07255754747003289, "frac_reward_zero_std": 0.0, "grad_norm": 1.8203125, "kl": 0.023146436316892505, "learning_rate": 7.7268e-06, "loss": 0.2416, "num_tokens": 31720203.0, "reward": 2.755568742752075, "reward_std": 0.5547494292259216, "rewards/reward_fn/mean": 2.755568742752075, "rewards/reward_fn/std": 0.5547494888305664, "step": 684 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1290.0, "completions/max_terminated_length": 1290.0, "completions/mean_length": 560.8125, "completions/mean_terminated_length": 560.8125, "completions/min_length": 315.0, "completions/min_terminated_length": 315.0, "epoch": 0.07266362575580779, "frac_reward_zero_std": 0.0, "grad_norm": 1.390625, "kl": 0.022556508192792535, "learning_rate": 7.7264e-06, "loss": 0.1027, "num_tokens": 31787141.0, "reward": 2.8888206481933594, "reward_std": 0.36274033784866333, "rewards/reward_fn/mean": 2.8888206481933594, "rewards/reward_fn/std": 0.3627403676509857, "step": 685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 2048.0, "completions/max_terminated_length": 1970.0, "completions/mean_length": 1064.65625, "completions/mean_terminated_length": 736.875, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "epoch": 0.07276970404158269, "frac_reward_zero_std": 0.0, "grad_norm": 1.1015625, "kl": 0.013749472331255674, "learning_rate": 7.726e-06, "loss": 0.3949, "num_tokens": 31860570.0, "reward": 2.409977436065674, "reward_std": 1.4189443588256836, "rewards/reward_fn/mean": 2.409977436065674, "rewards/reward_fn/std": 1.4189443588256836, "step": 686 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 690.0, "completions/max_terminated_length": 690.0, "completions/mean_length": 227.40625, "completions/mean_terminated_length": 227.40625, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.07287578232735759, "frac_reward_zero_std": 1.0, "grad_norm": 0.08056640625, "kl": 0.021269621676765382, "learning_rate": 7.7256e-06, "loss": 0.0009, "num_tokens": 31896007.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 687 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1312.0, "completions/max_terminated_length": 1312.0, "completions/mean_length": 328.40625, "completions/mean_terminated_length": 328.40625, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.07298186061313248, "frac_reward_zero_std": 0.0, "grad_norm": 1.7890625, "kl": 0.03248383407481015, "learning_rate": 7.7252e-06, "loss": 0.2071, "num_tokens": 31945428.0, "reward": 3.6226577758789062, "reward_std": 0.5695524215698242, "rewards/reward_fn/mean": 3.6226577758789062, "rewards/reward_fn/std": 0.5695523619651794, "step": 688 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1452.0, "completions/max_terminated_length": 1452.0, "completions/mean_length": 275.75, "completions/mean_terminated_length": 275.75, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.0730879388989074, "frac_reward_zero_std": 0.0, "grad_norm": 1.5703125, "kl": 0.024565639439970255, "learning_rate": 7.7248e-06, "loss": -0.0209, "num_tokens": 32000332.0, "reward": 3.962454319000244, "reward_std": 0.2123897820711136, "rewards/reward_fn/mean": 3.962454319000244, "rewards/reward_fn/std": 0.21238979697227478, "step": 689 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 833.0, "completions/max_terminated_length": 833.0, "completions/mean_length": 306.09375, "completions/mean_terminated_length": 306.09375, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.0731940171846823, "frac_reward_zero_std": 0.0, "grad_norm": 1.3359375, "kl": 0.019663402810692787, "learning_rate": 7.7244e-06, "loss": 0.0733, "num_tokens": 32058191.0, "reward": 3.1262402534484863, "reward_std": 0.28920337557792664, "rewards/reward_fn/mean": 3.1262402534484863, "rewards/reward_fn/std": 0.289203405380249, "step": 690 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 753.0, "completions/max_terminated_length": 753.0, "completions/mean_length": 489.0625, "completions/mean_terminated_length": 489.0625, "completions/min_length": 250.0, "completions/min_terminated_length": 250.0, "epoch": 0.0733000954704572, "frac_reward_zero_std": 0.0, "grad_norm": 1.4375, "kl": 0.022890981985256076, "learning_rate": 7.724e-06, "loss": -0.002, "num_tokens": 32106673.0, "reward": 3.150282144546509, "reward_std": 0.45853471755981445, "rewards/reward_fn/mean": 3.150282144546509, "rewards/reward_fn/std": 0.4585346579551697, "step": 691 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1772.0, "completions/mean_length": 514.90625, "completions/mean_terminated_length": 412.70001220703125, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "epoch": 0.0734061737562321, "frac_reward_zero_std": 0.0, "grad_norm": 1.421875, "kl": 0.021816590800881386, "learning_rate": 7.7236e-06, "loss": 0.1898, "num_tokens": 32143982.0, "reward": 2.987020969390869, "reward_std": 1.2113451957702637, "rewards/reward_fn/mean": 2.987020969390869, "rewards/reward_fn/std": 1.2113451957702637, "step": 692 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1178.0, "completions/mean_length": 559.5, "completions/mean_terminated_length": 460.2666931152344, "completions/min_length": 269.0, "completions/min_terminated_length": 269.0, "epoch": 0.073512252042007, "frac_reward_zero_std": 0.0, "grad_norm": 1.609375, "kl": 0.02110014483332634, "learning_rate": 7.7232e-06, "loss": 0.2636, "num_tokens": 32183230.0, "reward": 2.5472910404205322, "reward_std": 0.872420072555542, "rewards/reward_fn/mean": 2.5472910404205322, "rewards/reward_fn/std": 0.872420072555542, "step": 693 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1967.0, "completions/max_terminated_length": 1967.0, "completions/mean_length": 708.1875, "completions/mean_terminated_length": 708.1875, "completions/min_length": 316.0, "completions/min_terminated_length": 316.0, "epoch": 0.07361833032778191, "frac_reward_zero_std": 0.0, "grad_norm": 1.0859375, "kl": 0.015690243802964687, "learning_rate": 7.7228e-06, "loss": 0.1217, "num_tokens": 32219076.0, "reward": 2.446472644805908, "reward_std": 0.4943988025188446, "rewards/reward_fn/mean": 2.446472644805908, "rewards/reward_fn/std": 0.4943988025188446, "step": 694 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1022.0, "completions/max_terminated_length": 1022.0, "completions/mean_length": 326.84375, "completions/mean_terminated_length": 326.84375, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 0.07372440861355681, "frac_reward_zero_std": 0.0, "grad_norm": 1.71875, "kl": 0.02663695439696312, "learning_rate": 7.7224e-06, "loss": 0.0019, "num_tokens": 32266015.0, "reward": 3.3688626289367676, "reward_std": 0.5659106373786926, "rewards/reward_fn/mean": 3.3688626289367676, "rewards/reward_fn/std": 0.5659106373786926, "step": 695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 408.0, "completions/max_terminated_length": 408.0, "completions/mean_length": 169.5625, "completions/mean_terminated_length": 169.5625, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 0.07383048689933171, "frac_reward_zero_std": 1.0, "grad_norm": 0.10791015625, "kl": 0.025914974743500352, "learning_rate": 7.722e-06, "loss": 0.001, "num_tokens": 32321521.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 696 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 870.0, "completions/max_terminated_length": 870.0, "completions/mean_length": 366.65625, "completions/mean_terminated_length": 366.65625, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "epoch": 0.0739365651851066, "frac_reward_zero_std": 0.0, "grad_norm": 1.4765625, "kl": 0.022461338434368372, "learning_rate": 7.721599999999999e-06, "loss": 0.0393, "num_tokens": 32353830.0, "reward": 2.6947999000549316, "reward_std": 0.4774615168571472, "rewards/reward_fn/mean": 2.6947999000549316, "rewards/reward_fn/std": 0.47746148705482483, "step": 697 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 580.0, "completions/max_terminated_length": 580.0, "completions/mean_length": 201.5625, "completions/mean_terminated_length": 201.5625, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 0.0740426434708815, "frac_reward_zero_std": 0.0, "grad_norm": 2.03125, "kl": 0.030215767910704017, "learning_rate": 7.7212e-06, "loss": 0.0134, "num_tokens": 32404856.0, "reward": 3.9019250869750977, "reward_std": 0.41585031151771545, "rewards/reward_fn/mean": 3.9019250869750977, "rewards/reward_fn/std": 0.41585028171539307, "step": 698 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 628.0, "completions/max_terminated_length": 628.0, "completions/mean_length": 203.59375, "completions/mean_terminated_length": 203.59375, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 0.07414872175665642, "frac_reward_zero_std": 0.0, "grad_norm": 1.9296875, "kl": 0.022973356302827597, "learning_rate": 7.720799999999999e-06, "loss": -0.0069, "num_tokens": 32430059.0, "reward": 3.9313559532165527, "reward_std": 0.38830989599227905, "rewards/reward_fn/mean": 3.9313559532165527, "rewards/reward_fn/std": 0.38830992579460144, "step": 699 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1273.0, "completions/max_terminated_length": 1273.0, "completions/mean_length": 319.5, "completions/mean_terminated_length": 319.5, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "epoch": 0.07425480004243132, "frac_reward_zero_std": 0.0, "grad_norm": 1.96875, "kl": 0.02324189874343574, "learning_rate": 7.7204e-06, "loss": 0.0154, "num_tokens": 32483995.0, "reward": 2.4519386291503906, "reward_std": 0.3796447217464447, "rewards/reward_fn/mean": 2.4519386291503906, "rewards/reward_fn/std": 0.3796447217464447, "step": 700 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 149.0, "completions/max_terminated_length": 149.0, "completions/mean_length": 108.46875, "completions/mean_terminated_length": 108.46875, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "epoch": 0.07436087832820622, "frac_reward_zero_std": 1.0, "grad_norm": 0.1513671875, "kl": 0.02412488660775125, "learning_rate": 7.719999999999999e-06, "loss": 0.001, "num_tokens": 32525034.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 701 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 502.0, "completions/max_terminated_length": 502.0, "completions/mean_length": 168.1875, "completions/mean_terminated_length": 168.1875, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "epoch": 0.07446695661398112, "frac_reward_zero_std": 1.0, "grad_norm": 0.12255859375, "kl": 0.026652783853933215, "learning_rate": 7.7196e-06, "loss": 0.0011, "num_tokens": 32562288.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 702 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 817.0, "completions/max_terminated_length": 817.0, "completions/mean_length": 288.71875, "completions/mean_terminated_length": 288.71875, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.07457303489975602, "frac_reward_zero_std": 0.0, "grad_norm": 1.5078125, "kl": 0.023433730704709888, "learning_rate": 7.719199999999999e-06, "loss": -0.1515, "num_tokens": 32607111.0, "reward": 2.775975227355957, "reward_std": 0.7966500520706177, "rewards/reward_fn/mean": 2.775975227355957, "rewards/reward_fn/std": 0.7966500520706177, "step": 703 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1670.0, "completions/max_terminated_length": 1670.0, "completions/mean_length": 549.375, "completions/mean_terminated_length": 549.375, "completions/min_length": 307.0, "completions/min_terminated_length": 307.0, "epoch": 0.07467911318553092, "frac_reward_zero_std": 0.0, "grad_norm": 1.3046875, "kl": 0.02032529003918171, "learning_rate": 7.7188e-06, "loss": 0.0651, "num_tokens": 32661107.0, "reward": 2.5700185298919678, "reward_std": 0.8670512437820435, "rewards/reward_fn/mean": 2.5700185298919678, "rewards/reward_fn/std": 0.8670512437820435, "step": 704 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1800.0, "completions/max_terminated_length": 1800.0, "completions/mean_length": 446.375, "completions/mean_terminated_length": 446.375, "completions/min_length": 244.0, "completions/min_terminated_length": 244.0, "epoch": 0.07478519147130583, "frac_reward_zero_std": 0.0, "grad_norm": 1.140625, "kl": 0.015976089402101934, "learning_rate": 7.718399999999999e-06, "loss": -0.0692, "num_tokens": 32709343.0, "reward": 3.776334285736084, "reward_std": 0.7065488696098328, "rewards/reward_fn/mean": 3.776334285736084, "rewards/reward_fn/std": 0.706548810005188, "step": 705 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 781.0, "completions/max_terminated_length": 781.0, "completions/mean_length": 250.6875, "completions/mean_terminated_length": 250.6875, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 0.07489126975708073, "frac_reward_zero_std": 0.0, "grad_norm": 1.921875, "kl": 0.02156626316718757, "learning_rate": 7.718e-06, "loss": 0.0282, "num_tokens": 32751189.0, "reward": 3.279935359954834, "reward_std": 0.8665456771850586, "rewards/reward_fn/mean": 3.279935359954834, "rewards/reward_fn/std": 0.8665456175804138, "step": 706 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 932.0, "completions/max_terminated_length": 932.0, "completions/mean_length": 203.4375, "completions/mean_terminated_length": 203.4375, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 0.07499734804285563, "frac_reward_zero_std": 1.0, "grad_norm": 0.1279296875, "kl": 0.031126373447477818, "learning_rate": 7.7176e-06, "loss": 0.0012, "num_tokens": 32808963.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 707 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 810.0, "completions/max_terminated_length": 810.0, "completions/mean_length": 283.96875, "completions/mean_terminated_length": 283.96875, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 0.07510342632863053, "frac_reward_zero_std": 1.0, "grad_norm": 0.0947265625, "kl": 0.02349319658242166, "learning_rate": 7.7172e-06, "loss": 0.0009, "num_tokens": 32850274.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 708 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 478.0, "completions/max_terminated_length": 478.0, "completions/mean_length": 175.03125, "completions/mean_terminated_length": 175.03125, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "epoch": 0.07520950461440543, "frac_reward_zero_std": 1.0, "grad_norm": 0.0927734375, "kl": 0.021260776673443615, "learning_rate": 7.7168e-06, "loss": 0.0009, "num_tokens": 32886243.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 709 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 775.0, "completions/max_terminated_length": 775.0, "completions/mean_length": 252.46875, "completions/mean_terminated_length": 252.46875, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.07531558290018034, "frac_reward_zero_std": 0.0, "grad_norm": 2.078125, "kl": 0.025537249632179737, "learning_rate": 7.7164e-06, "loss": 0.0669, "num_tokens": 32930514.0, "reward": 2.982074737548828, "reward_std": 0.7072264552116394, "rewards/reward_fn/mean": 2.982074737548828, "rewards/reward_fn/std": 0.7072264552116394, "step": 710 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1563.0, "completions/max_terminated_length": 1563.0, "completions/mean_length": 413.5, "completions/mean_terminated_length": 413.5, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.07542166118595524, "frac_reward_zero_std": 0.0, "grad_norm": 1.46875, "kl": 0.0188356313155964, "learning_rate": 7.716e-06, "loss": 0.1318, "num_tokens": 32981026.0, "reward": 2.7349185943603516, "reward_std": 0.4819990396499634, "rewards/reward_fn/mean": 2.7349185943603516, "rewards/reward_fn/std": 0.48199906945228577, "step": 711 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 471.0, "completions/max_terminated_length": 471.0, "completions/mean_length": 225.03125, "completions/mean_terminated_length": 225.03125, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 0.07552773947173014, "frac_reward_zero_std": 0.0, "grad_norm": 1.5546875, "kl": 0.011965643963776529, "learning_rate": 7.7156e-06, "loss": 0.0195, "num_tokens": 33032067.0, "reward": 3.974524974822998, "reward_std": 0.10026301443576813, "rewards/reward_fn/mean": 3.974524974822998, "rewards/reward_fn/std": 0.10026300698518753, "step": 712 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 961.0, "completions/max_terminated_length": 961.0, "completions/mean_length": 248.59375, "completions/mean_terminated_length": 248.59375, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 0.07563381775750504, "frac_reward_zero_std": 0.0, "grad_norm": 1.9453125, "kl": 0.022213632240891457, "learning_rate": 7.7152e-06, "loss": 0.0989, "num_tokens": 33063670.0, "reward": 3.016145944595337, "reward_std": 0.5643318295478821, "rewards/reward_fn/mean": 3.016145944595337, "rewards/reward_fn/std": 0.5643318295478821, "step": 713 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1154.0, "completions/max_terminated_length": 1154.0, "completions/mean_length": 259.625, "completions/mean_terminated_length": 259.625, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 0.07573989604327994, "frac_reward_zero_std": 0.0, "grad_norm": 1.8125, "kl": 0.032502480084076524, "learning_rate": 7.7148e-06, "loss": -0.0117, "num_tokens": 33104906.0, "reward": 3.8165884017944336, "reward_std": 0.3915080428123474, "rewards/reward_fn/mean": 3.8165884017944336, "rewards/reward_fn/std": 0.3915080726146698, "step": 714 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 889.0, "completions/max_terminated_length": 889.0, "completions/mean_length": 203.03125, "completions/mean_terminated_length": 203.03125, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.07584597432905485, "frac_reward_zero_std": 1.0, "grad_norm": 0.078125, "kl": 0.01976885157637298, "learning_rate": 7.7144e-06, "loss": 0.0008, "num_tokens": 33132939.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 579.0, "completions/max_terminated_length": 579.0, "completions/mean_length": 146.34375, "completions/mean_terminated_length": 146.34375, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.07595205261482975, "frac_reward_zero_std": 1.0, "grad_norm": 0.1494140625, "kl": 0.0265513202175498, "learning_rate": 7.714e-06, "loss": 0.0011, "num_tokens": 33170582.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 716 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 926.0, "completions/max_terminated_length": 926.0, "completions/mean_length": 281.71875, "completions/mean_terminated_length": 281.71875, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 0.07605813090060465, "frac_reward_zero_std": 0.0, "grad_norm": 1.5078125, "kl": 0.023149944143369794, "learning_rate": 7.713599999999998e-06, "loss": -0.0468, "num_tokens": 33219533.0, "reward": 3.8506064414978027, "reward_std": 0.5878912210464478, "rewards/reward_fn/mean": 3.8506064414978027, "rewards/reward_fn/std": 0.5878912210464478, "step": 717 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1528.0, "completions/max_terminated_length": 1528.0, "completions/mean_length": 408.9375, "completions/mean_terminated_length": 408.9375, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 0.07616420918637955, "frac_reward_zero_std": 0.0, "grad_norm": 1.65625, "kl": 0.020193473668769002, "learning_rate": 7.7132e-06, "loss": 0.1427, "num_tokens": 33275499.0, "reward": 3.861990451812744, "reward_std": 0.3711107075214386, "rewards/reward_fn/mean": 3.861990451812744, "rewards/reward_fn/std": 0.371110737323761, "step": 718 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 873.0, "completions/max_terminated_length": 873.0, "completions/mean_length": 311.1875, "completions/mean_terminated_length": 311.1875, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 0.07627028747215445, "frac_reward_zero_std": 0.0, "grad_norm": 2.3125, "kl": 0.01826619717758149, "learning_rate": 7.7128e-06, "loss": 0.0951, "num_tokens": 33316369.0, "reward": 3.852308511734009, "reward_std": 0.5811760425567627, "rewards/reward_fn/mean": 3.852308511734009, "rewards/reward_fn/std": 0.5811761021614075, "step": 719 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 175.0, "completions/max_terminated_length": 175.0, "completions/mean_length": 93.78125, "completions/mean_terminated_length": 93.78125, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.07637636575792935, "frac_reward_zero_std": 1.0, "grad_norm": 0.13671875, "kl": 0.025791630847379565, "learning_rate": 7.7124e-06, "loss": 0.001, "num_tokens": 33357226.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 720 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1839.0, "completions/mean_length": 664.59375, "completions/mean_terminated_length": 619.9677124023438, "completions/min_length": 287.0, "completions/min_terminated_length": 287.0, "epoch": 0.07648244404370426, "frac_reward_zero_std": 0.0, "grad_norm": 1.1171875, "kl": 0.014834558707661927, "learning_rate": 7.712e-06, "loss": 0.1671, "num_tokens": 33426845.0, "reward": 3.423288345336914, "reward_std": 0.9050231575965881, "rewards/reward_fn/mean": 3.423288345336914, "rewards/reward_fn/std": 0.9050231575965881, "step": 721 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1371.0, "completions/max_terminated_length": 1371.0, "completions/mean_length": 492.0625, "completions/mean_terminated_length": 492.0625, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "epoch": 0.07658852232947916, "frac_reward_zero_std": 0.0, "grad_norm": 1.2109375, "kl": 0.01824855396989733, "learning_rate": 7.711599999999999e-06, "loss": 0.0541, "num_tokens": 33483199.0, "reward": 2.606292724609375, "reward_std": 0.2552023231983185, "rewards/reward_fn/mean": 2.606292724609375, "rewards/reward_fn/std": 0.2552023231983185, "step": 722 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1811.0, "completions/max_terminated_length": 1811.0, "completions/mean_length": 370.625, "completions/mean_terminated_length": 370.625, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "epoch": 0.07669460061525406, "frac_reward_zero_std": 0.0, "grad_norm": 1.3984375, "kl": 0.021258773282170296, "learning_rate": 7.7112e-06, "loss": -0.0289, "num_tokens": 33531699.0, "reward": 1.7273921966552734, "reward_std": 0.20154969394207, "rewards/reward_fn/mean": 1.7273921966552734, "rewards/reward_fn/std": 0.2015496790409088, "step": 723 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1172.0, "completions/max_terminated_length": 1172.0, "completions/mean_length": 275.5625, "completions/mean_terminated_length": 275.5625, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "epoch": 0.07680067890102896, "frac_reward_zero_std": 0.0, "grad_norm": 1.5, "kl": 0.018158360850065947, "learning_rate": 7.710799999999999e-06, "loss": -0.0268, "num_tokens": 33571397.0, "reward": 2.812833547592163, "reward_std": 0.2651287913322449, "rewards/reward_fn/mean": 2.812833547592163, "rewards/reward_fn/std": 0.2651287913322449, "step": 724 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1742.0, "completions/mean_length": 508.9375, "completions/mean_terminated_length": 459.2903137207031, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 0.07690675718680386, "frac_reward_zero_std": 0.0, "grad_norm": 1.75, "kl": 0.02326313848607242, "learning_rate": 7.7104e-06, "loss": 0.2781, "num_tokens": 33636419.0, "reward": 2.658267021179199, "reward_std": 0.5596959590911865, "rewards/reward_fn/mean": 2.658267021179199, "rewards/reward_fn/std": 0.5596958994865417, "step": 725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 775.0, "completions/max_terminated_length": 775.0, "completions/mean_length": 272.8125, "completions/mean_terminated_length": 272.8125, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "epoch": 0.07701283547257877, "frac_reward_zero_std": 1.0, "grad_norm": 0.0634765625, "kl": 0.015897757722996175, "learning_rate": 7.709999999999999e-06, "loss": 0.0006, "num_tokens": 33686141.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 726 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1789.0, "completions/max_terminated_length": 1789.0, "completions/mean_length": 474.3125, "completions/mean_terminated_length": 474.3125, "completions/min_length": 270.0, "completions/min_terminated_length": 270.0, "epoch": 0.07711891375835367, "frac_reward_zero_std": 0.0, "grad_norm": 1.3671875, "kl": 0.016974500380456448, "learning_rate": 7.7096e-06, "loss": -0.0312, "num_tokens": 33745319.0, "reward": 3.0468087196350098, "reward_std": 0.5193257927894592, "rewards/reward_fn/mean": 3.0468087196350098, "rewards/reward_fn/std": 0.5193257331848145, "step": 727 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1416.0, "completions/max_terminated_length": 1416.0, "completions/mean_length": 348.0, "completions/mean_terminated_length": 348.0, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 0.07722499204412857, "frac_reward_zero_std": 0.0, "grad_norm": 1.65625, "kl": 0.01765015278942883, "learning_rate": 7.709199999999999e-06, "loss": -0.0524, "num_tokens": 33797127.0, "reward": 3.139650821685791, "reward_std": 0.5897535681724548, "rewards/reward_fn/mean": 3.139650821685791, "rewards/reward_fn/std": 0.5897536277770996, "step": 728 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1791.0, "completions/mean_length": 532.4375, "completions/mean_terminated_length": 483.5483703613281, "completions/min_length": 236.0, "completions/min_terminated_length": 236.0, "epoch": 0.07733107032990347, "frac_reward_zero_std": 0.0, "grad_norm": 1.6328125, "kl": 0.017708882922306657, "learning_rate": 7.7088e-06, "loss": 0.2641, "num_tokens": 33851317.0, "reward": 2.595705986022949, "reward_std": 0.5950980186462402, "rewards/reward_fn/mean": 2.595705986022949, "rewards/reward_fn/std": 0.595098078250885, "step": 729 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1520.0, "completions/max_terminated_length": 1520.0, "completions/mean_length": 321.4375, "completions/mean_terminated_length": 321.4375, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.07743714861567837, "frac_reward_zero_std": 0.0, "grad_norm": 0.80078125, "kl": 0.022806552005931735, "learning_rate": 7.7084e-06, "loss": -0.1221, "num_tokens": 33880131.0, "reward": 3.959925651550293, "reward_std": 0.226694256067276, "rewards/reward_fn/mean": 3.959925651550293, "rewards/reward_fn/std": 0.2266942858695984, "step": 730 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1690.0, "completions/max_terminated_length": 1690.0, "completions/mean_length": 309.5625, "completions/mean_terminated_length": 309.5625, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.07754322690145327, "frac_reward_zero_std": 0.0, "grad_norm": 2.390625, "kl": 0.018843404366634786, "learning_rate": 7.708e-06, "loss": 0.2778, "num_tokens": 33933077.0, "reward": 3.921745777130127, "reward_std": 0.30792757868766785, "rewards/reward_fn/mean": 3.921745777130127, "rewards/reward_fn/std": 0.30792760848999023, "step": 731 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 172.0, "completions/max_terminated_length": 172.0, "completions/mean_length": 102.9375, "completions/mean_terminated_length": 102.9375, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "epoch": 0.07764930518722818, "frac_reward_zero_std": 1.0, "grad_norm": 0.16796875, "kl": 0.023415476083755493, "learning_rate": 7.7076e-06, "loss": 0.0009, "num_tokens": 33957107.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 732 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 865.0, "completions/max_terminated_length": 865.0, "completions/mean_length": 468.9375, "completions/mean_terminated_length": 468.9375, "completions/min_length": 252.0, "completions/min_terminated_length": 252.0, "epoch": 0.07775538347300308, "frac_reward_zero_std": 0.0, "grad_norm": 1.5078125, "kl": 0.019135850947350264, "learning_rate": 7.7072e-06, "loss": 0.0913, "num_tokens": 34012753.0, "reward": 3.1053218841552734, "reward_std": 0.46024301648139954, "rewards/reward_fn/mean": 3.1053218841552734, "rewards/reward_fn/std": 0.46024298667907715, "step": 733 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1717.0, "completions/mean_length": 418.90625, "completions/mean_terminated_length": 366.3548278808594, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "epoch": 0.07786146175877798, "frac_reward_zero_std": 0.0, "grad_norm": 1.796875, "kl": 0.017279536346904933, "learning_rate": 7.7068e-06, "loss": 0.3037, "num_tokens": 34061870.0, "reward": 3.875, "reward_std": 0.7071067690849304, "rewards/reward_fn/mean": 3.875, "rewards/reward_fn/std": 0.7071067690849304, "step": 734 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1323.0, "completions/mean_length": 606.53125, "completions/mean_terminated_length": 560.0322265625, "completions/min_length": 280.0, "completions/min_terminated_length": 280.0, "epoch": 0.07796754004455288, "frac_reward_zero_std": 0.0, "grad_norm": 1.390625, "kl": 0.014971327618695796, "learning_rate": 7.7064e-06, "loss": 0.1449, "num_tokens": 34126623.0, "reward": 2.847194194793701, "reward_std": 0.6151965260505676, "rewards/reward_fn/mean": 2.847194194793701, "rewards/reward_fn/std": 0.6151964664459229, "step": 735 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 320.0, "completions/max_terminated_length": 320.0, "completions/mean_length": 108.0, "completions/mean_terminated_length": 108.0, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "epoch": 0.07807361833032778, "frac_reward_zero_std": 1.0, "grad_norm": 0.07958984375, "kl": 0.013533458928577602, "learning_rate": 7.706e-06, "loss": 0.0005, "num_tokens": 34173343.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 736 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 490.0, "completions/max_terminated_length": 490.0, "completions/mean_length": 296.53125, "completions/mean_terminated_length": 296.53125, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 0.07817969661610269, "frac_reward_zero_std": 1.0, "grad_norm": 0.0625, "kl": 0.018062757910229266, "learning_rate": 7.7056e-06, "loss": 0.0007, "num_tokens": 34217328.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 737 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1122.0, "completions/max_terminated_length": 1122.0, "completions/mean_length": 395.125, "completions/mean_terminated_length": 395.125, "completions/min_length": 260.0, "completions/min_terminated_length": 260.0, "epoch": 0.07828577490187759, "frac_reward_zero_std": 0.0, "grad_norm": 1.2890625, "kl": 0.014009194332174957, "learning_rate": 7.705199999999999e-06, "loss": -0.0219, "num_tokens": 34247092.0, "reward": 2.814393997192383, "reward_std": 0.20588137209415436, "rewards/reward_fn/mean": 2.814393997192383, "rewards/reward_fn/std": 0.20588135719299316, "step": 738 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1819.0, "completions/max_terminated_length": 1819.0, "completions/mean_length": 479.1875, "completions/mean_terminated_length": 479.1875, "completions/min_length": 269.0, "completions/min_terminated_length": 269.0, "epoch": 0.07839185318765249, "frac_reward_zero_std": 0.0, "grad_norm": 1.125, "kl": 0.016489942325279117, "learning_rate": 7.7048e-06, "loss": 0.0784, "num_tokens": 34301050.0, "reward": 2.9646849632263184, "reward_std": 0.4474869966506958, "rewards/reward_fn/mean": 2.9646849632263184, "rewards/reward_fn/std": 0.4474869966506958, "step": 739 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1874.0, "completions/mean_length": 460.28125, "completions/mean_terminated_length": 409.06451416015625, "completions/min_length": 255.0, "completions/min_terminated_length": 255.0, "epoch": 0.07849793147342739, "frac_reward_zero_std": 0.0, "grad_norm": 1.6328125, "kl": 0.021060561761260033, "learning_rate": 7.704399999999999e-06, "loss": 0.2307, "num_tokens": 34379779.0, "reward": 3.593684673309326, "reward_std": 0.8480998873710632, "rewards/reward_fn/mean": 3.593684673309326, "rewards/reward_fn/std": 0.8480998873710632, "step": 740 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1563.0, "completions/max_terminated_length": 1563.0, "completions/mean_length": 234.625, "completions/mean_terminated_length": 234.625, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 0.07860400975920229, "frac_reward_zero_std": 0.0, "grad_norm": 2.734375, "kl": 0.026116218185052276, "learning_rate": 7.704e-06, "loss": -0.3654, "num_tokens": 34420247.0, "reward": 3.2040047645568848, "reward_std": 0.2753956615924835, "rewards/reward_fn/mean": 3.2040047645568848, "rewards/reward_fn/std": 0.27539563179016113, "step": 741 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1012.0, "completions/max_terminated_length": 1012.0, "completions/mean_length": 304.15625, "completions/mean_terminated_length": 304.15625, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 0.0787100880449772, "frac_reward_zero_std": 0.0, "grad_norm": 1.7109375, "kl": 0.0184671861352399, "learning_rate": 7.7036e-06, "loss": -0.0289, "num_tokens": 34460348.0, "reward": 3.8210489749908447, "reward_std": 0.423511803150177, "rewards/reward_fn/mean": 3.8210489749908447, "rewards/reward_fn/std": 0.4235118627548218, "step": 742 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1977.0, "completions/max_terminated_length": 1977.0, "completions/mean_length": 362.90625, "completions/mean_terminated_length": 362.90625, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 0.0788161663307521, "frac_reward_zero_std": 0.0, "grad_norm": 2.15625, "kl": 0.02415016118902713, "learning_rate": 7.7032e-06, "loss": 0.0105, "num_tokens": 34512857.0, "reward": 3.9323811531066895, "reward_std": 0.2660841941833496, "rewards/reward_fn/mean": 3.9323811531066895, "rewards/reward_fn/std": 0.2660841643810272, "step": 743 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1125.0, "completions/max_terminated_length": 1125.0, "completions/mean_length": 401.65625, "completions/mean_terminated_length": 401.65625, "completions/min_length": 245.0, "completions/min_terminated_length": 245.0, "epoch": 0.078922244616527, "frac_reward_zero_std": 0.0, "grad_norm": 1.140625, "kl": 0.022820353973656893, "learning_rate": 7.7028e-06, "loss": 0.0953, "num_tokens": 34545998.0, "reward": 2.9798271656036377, "reward_std": 0.3881252706050873, "rewards/reward_fn/mean": 2.9798271656036377, "rewards/reward_fn/std": 0.38812533020973206, "step": 744 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 685.0, "completions/max_terminated_length": 685.0, "completions/mean_length": 161.46875, "completions/mean_terminated_length": 161.46875, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "epoch": 0.0790283229023019, "frac_reward_zero_std": 1.0, "grad_norm": 0.091796875, "kl": 0.018742251209914684, "learning_rate": 7.7024e-06, "loss": 0.0007, "num_tokens": 34589949.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 745 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1595.0, "completions/max_terminated_length": 1595.0, "completions/mean_length": 347.75, "completions/mean_terminated_length": 347.75, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 0.0791344011880768, "frac_reward_zero_std": 1.0, "grad_norm": 0.05224609375, "kl": 0.016581954550929368, "learning_rate": 7.702e-06, "loss": 0.0007, "num_tokens": 34637141.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 746 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1318.0, "completions/mean_length": 413.0625, "completions/mean_terminated_length": 360.32257080078125, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "epoch": 0.0792404794738517, "frac_reward_zero_std": 0.0, "grad_norm": 1.4453125, "kl": 0.01734267408028245, "learning_rate": 7.7016e-06, "loss": 0.1899, "num_tokens": 34671959.0, "reward": 3.5228824615478516, "reward_std": 0.6560260057449341, "rewards/reward_fn/mean": 3.5228824615478516, "rewards/reward_fn/std": 0.6560259461402893, "step": 747 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1482.0, "completions/mean_length": 469.125, "completions/mean_terminated_length": 363.86669921875, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 0.07934655775962661, "frac_reward_zero_std": 0.0, "grad_norm": 1.828125, "kl": 0.016482632607221603, "learning_rate": 7.7012e-06, "loss": 0.3411, "num_tokens": 34707547.0, "reward": 2.742053508758545, "reward_std": 0.8046372532844543, "rewards/reward_fn/mean": 2.742053508758545, "rewards/reward_fn/std": 0.8046371936798096, "step": 748 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 823.0, "completions/max_terminated_length": 823.0, "completions/mean_length": 248.625, "completions/mean_terminated_length": 248.625, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.07945263604540151, "frac_reward_zero_std": 0.0, "grad_norm": 2.0, "kl": 0.02026499854400754, "learning_rate": 7.7008e-06, "loss": 0.0099, "num_tokens": 34761615.0, "reward": 3.801016330718994, "reward_std": 0.42593979835510254, "rewards/reward_fn/mean": 3.801016330718994, "rewards/reward_fn/std": 0.42593976855278015, "step": 749 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 628.0, "completions/max_terminated_length": 628.0, "completions/mean_length": 186.3125, "completions/mean_terminated_length": 186.3125, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.07955871433117641, "frac_reward_zero_std": 0.0, "grad_norm": 0.98046875, "kl": 0.015264550573192537, "learning_rate": 7.7004e-06, "loss": -0.0742, "num_tokens": 34811801.0, "reward": 3.933756113052368, "reward_std": 0.3747324049472809, "rewards/reward_fn/mean": 3.933756113052368, "rewards/reward_fn/std": 0.37473243474960327, "step": 750 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1936.0, "completions/max_terminated_length": 1936.0, "completions/mean_length": 328.8125, "completions/mean_terminated_length": 328.8125, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 0.07966479261695131, "frac_reward_zero_std": 1.0, "grad_norm": 0.11279296875, "kl": 0.01918186468537897, "learning_rate": 7.699999999999999e-06, "loss": 0.0008, "num_tokens": 34858419.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 751 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1659.0, "completions/max_terminated_length": 1659.0, "completions/mean_length": 412.8125, "completions/mean_terminated_length": 412.8125, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "epoch": 0.07977087090272621, "frac_reward_zero_std": 0.0, "grad_norm": 1.4375, "kl": 0.02011913899332285, "learning_rate": 7.6996e-06, "loss": -0.0163, "num_tokens": 34923021.0, "reward": 3.248586893081665, "reward_std": 0.7823631763458252, "rewards/reward_fn/mean": 3.248586893081665, "rewards/reward_fn/std": 0.7823631763458252, "step": 752 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1150.0, "completions/max_terminated_length": 1150.0, "completions/mean_length": 188.375, "completions/mean_terminated_length": 188.375, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.07987694918850112, "frac_reward_zero_std": 0.0, "grad_norm": 2.65625, "kl": 0.02701102104038, "learning_rate": 7.6992e-06, "loss": 0.1138, "num_tokens": 34963833.0, "reward": 3.7283644676208496, "reward_std": 0.5220069885253906, "rewards/reward_fn/mean": 3.7283644676208496, "rewards/reward_fn/std": 0.5220070481300354, "step": 753 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1733.0, "completions/max_terminated_length": 1733.0, "completions/mean_length": 799.8125, "completions/mean_terminated_length": 799.8125, "completions/min_length": 306.0, "completions/min_terminated_length": 306.0, "epoch": 0.07998302747427602, "frac_reward_zero_std": 0.0, "grad_norm": 1.0859375, "kl": 0.013593915617093444, "learning_rate": 7.6988e-06, "loss": 0.0106, "num_tokens": 35034419.0, "reward": 2.5703787803649902, "reward_std": 0.6685802340507507, "rewards/reward_fn/mean": 2.5703787803649902, "rewards/reward_fn/std": 0.668580174446106, "step": 754 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1257.0, "completions/max_terminated_length": 1257.0, "completions/mean_length": 214.875, "completions/mean_terminated_length": 214.875, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "epoch": 0.08008910576005092, "frac_reward_zero_std": 0.0, "grad_norm": 2.34375, "kl": 0.017514656181447208, "learning_rate": 7.6984e-06, "loss": 0.3551, "num_tokens": 35078991.0, "reward": 3.92777681350708, "reward_std": 0.4085560739040375, "rewards/reward_fn/mean": 3.92777681350708, "rewards/reward_fn/std": 0.4085560142993927, "step": 755 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 935.0, "completions/max_terminated_length": 935.0, "completions/mean_length": 200.375, "completions/mean_terminated_length": 200.375, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "epoch": 0.08019518404582582, "frac_reward_zero_std": 1.0, "grad_norm": 0.10009765625, "kl": 0.022400660207495093, "learning_rate": 7.698e-06, "loss": 0.0009, "num_tokens": 35115003.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 756 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 990.0, "completions/max_terminated_length": 990.0, "completions/mean_length": 245.65625, "completions/mean_terminated_length": 245.65625, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 0.08030126233160072, "frac_reward_zero_std": 0.0, "grad_norm": 1.7265625, "kl": 0.01861444814130664, "learning_rate": 7.6976e-06, "loss": 0.0078, "num_tokens": 35159408.0, "reward": 2.909533977508545, "reward_std": 0.039463140070438385, "rewards/reward_fn/mean": 2.909533977508545, "rewards/reward_fn/std": 0.03946312144398689, "step": 757 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 918.0, "completions/max_terminated_length": 918.0, "completions/mean_length": 254.03125, "completions/mean_terminated_length": 254.03125, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 0.08040734061737562, "frac_reward_zero_std": 0.0, "grad_norm": 1.5859375, "kl": 0.018796015181578696, "learning_rate": 7.6972e-06, "loss": 0.0522, "num_tokens": 35185681.0, "reward": 3.875, "reward_std": 0.7071067690849304, "rewards/reward_fn/mean": 3.875, "rewards/reward_fn/std": 0.7071067690849304, "step": 758 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1630.0, "completions/max_terminated_length": 1630.0, "completions/mean_length": 317.6875, "completions/mean_terminated_length": 317.6875, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.08051341890315053, "frac_reward_zero_std": 0.0, "grad_norm": 1.9609375, "kl": 0.015750034246593714, "learning_rate": 7.696799999999999e-06, "loss": 0.0456, "num_tokens": 35228807.0, "reward": 3.8852972984313965, "reward_std": 0.3084171712398529, "rewards/reward_fn/mean": 3.8852972984313965, "rewards/reward_fn/std": 0.3084172010421753, "step": 759 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1313.0, "completions/max_terminated_length": 1313.0, "completions/mean_length": 429.1875, "completions/mean_terminated_length": 429.1875, "completions/min_length": 240.0, "completions/min_terminated_length": 240.0, "epoch": 0.08061949718892543, "frac_reward_zero_std": 0.0, "grad_norm": 1.4296875, "kl": 0.016051248530857265, "learning_rate": 7.6964e-06, "loss": 0.0066, "num_tokens": 35264141.0, "reward": 3.350024700164795, "reward_std": 0.5828197002410889, "rewards/reward_fn/mean": 3.350024700164795, "rewards/reward_fn/std": 0.5828196406364441, "step": 760 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 2048.0, "completions/max_terminated_length": 849.0, "completions/mean_length": 641.0, "completions/mean_terminated_length": 495.4482727050781, "completions/min_length": 318.0, "completions/min_terminated_length": 318.0, "epoch": 0.08072557547470033, "frac_reward_zero_std": 0.0, "grad_norm": 1.34375, "kl": 0.01636024343315512, "learning_rate": 7.695999999999999e-06, "loss": 0.2756, "num_tokens": 35323661.0, "reward": 3.2801287174224854, "reward_std": 1.179065227508545, "rewards/reward_fn/mean": 3.2801287174224854, "rewards/reward_fn/std": 1.179065227508545, "step": 761 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1244.0, "completions/mean_length": 338.78125, "completions/mean_terminated_length": 283.6451416015625, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 0.08083165376047523, "frac_reward_zero_std": 0.0, "grad_norm": 2.203125, "kl": 0.023242034018039703, "learning_rate": 7.6956e-06, "loss": 0.3512, "num_tokens": 35355654.0, "reward": 3.727604866027832, "reward_std": 0.7867724895477295, "rewards/reward_fn/mean": 3.727604866027832, "rewards/reward_fn/std": 0.7867724895477295, "step": 762 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1400.0, "completions/mean_length": 434.65625, "completions/mean_terminated_length": 382.6128845214844, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 0.08093773204625013, "frac_reward_zero_std": 0.0, "grad_norm": 1.734375, "kl": 0.020363064482808113, "learning_rate": 7.695199999999999e-06, "loss": 0.166, "num_tokens": 35397915.0, "reward": 2.0777950286865234, "reward_std": 0.674912691116333, "rewards/reward_fn/mean": 2.0777950286865234, "rewards/reward_fn/std": 0.6749126315116882, "step": 763 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1564.0, "completions/max_terminated_length": 1564.0, "completions/mean_length": 552.4375, "completions/mean_terminated_length": 552.4375, "completions/min_length": 257.0, "completions/min_terminated_length": 257.0, "epoch": 0.08104381033202504, "frac_reward_zero_std": 0.0, "grad_norm": 1.359375, "kl": 0.01603428611997515, "learning_rate": 7.6948e-06, "loss": 0.0247, "num_tokens": 35448105.0, "reward": 2.9090466499328613, "reward_std": 0.07430879026651382, "rewards/reward_fn/mean": 2.9090466499328613, "rewards/reward_fn/std": 0.07430876046419144, "step": 764 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 534.0, "completions/mean_length": 366.125, "completions/mean_terminated_length": 311.8709716796875, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 0.08114988861779994, "frac_reward_zero_std": 0.0, "grad_norm": 1.9140625, "kl": 0.019333304720930755, "learning_rate": 7.6944e-06, "loss": 0.2851, "num_tokens": 35477549.0, "reward": 3.4079909324645996, "reward_std": 0.7734149098396301, "rewards/reward_fn/mean": 3.4079909324645996, "rewards/reward_fn/std": 0.7734148502349854, "step": 765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1036.0, "completions/max_terminated_length": 1036.0, "completions/mean_length": 350.1875, "completions/mean_terminated_length": 350.1875, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 0.08125596690357484, "frac_reward_zero_std": 0.0, "grad_norm": 1.7109375, "kl": 0.020013973116874695, "learning_rate": 7.694e-06, "loss": 0.0958, "num_tokens": 35539795.0, "reward": 3.3708925247192383, "reward_std": 0.7116749286651611, "rewards/reward_fn/mean": 3.3708925247192383, "rewards/reward_fn/std": 0.7116749286651611, "step": 766 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1595.0, "completions/mean_length": 510.46875, "completions/mean_terminated_length": 460.8709411621094, "completions/min_length": 285.0, "completions/min_terminated_length": 285.0, "epoch": 0.08136204518934974, "frac_reward_zero_std": 0.0, "grad_norm": 1.921875, "kl": 0.013919757795520127, "learning_rate": 7.6936e-06, "loss": 0.2956, "num_tokens": 35605730.0, "reward": 3.875, "reward_std": 0.7071067690849304, "rewards/reward_fn/mean": 3.875, "rewards/reward_fn/std": 0.7071067690849304, "step": 767 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 926.0, "completions/mean_length": 571.15625, "completions/mean_terminated_length": 523.51611328125, "completions/min_length": 273.0, "completions/min_terminated_length": 273.0, "epoch": 0.08146812347512464, "frac_reward_zero_std": 0.0, "grad_norm": 1.4140625, "kl": 0.019678838085383177, "learning_rate": 7.6932e-06, "loss": 0.0894, "num_tokens": 35654567.0, "reward": 2.3275630474090576, "reward_std": 0.6938040256500244, "rewards/reward_fn/mean": 2.3275630474090576, "rewards/reward_fn/std": 0.6938039660453796, "step": 768 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 661.0, "completions/max_terminated_length": 661.0, "completions/mean_length": 202.84375, "completions/mean_terminated_length": 202.84375, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.08157420176089955, "frac_reward_zero_std": 0.0, "grad_norm": 2.546875, "kl": 0.019846069626510143, "learning_rate": 7.6928e-06, "loss": 0.0203, "num_tokens": 35690050.0, "reward": 3.797757625579834, "reward_std": 0.4278443157672882, "rewards/reward_fn/mean": 3.797757625579834, "rewards/reward_fn/std": 0.4278443455696106, "step": 769 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1466.0, "completions/max_terminated_length": 1466.0, "completions/mean_length": 375.28125, "completions/mean_terminated_length": 375.28125, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.08168028004667445, "frac_reward_zero_std": 0.0, "grad_norm": 2.0, "kl": 0.022206757916137576, "learning_rate": 7.6924e-06, "loss": -0.0462, "num_tokens": 35735115.0, "reward": 2.7710344791412354, "reward_std": 0.07037150859832764, "rewards/reward_fn/mean": 2.7710344791412354, "rewards/reward_fn/std": 0.07037156820297241, "step": 770 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 358.0, "completions/max_terminated_length": 358.0, "completions/mean_length": 173.28125, "completions/mean_terminated_length": 173.28125, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "epoch": 0.08178635833244935, "frac_reward_zero_std": 1.0, "grad_norm": 0.1748046875, "kl": 0.02803934703115374, "learning_rate": 7.692e-06, "loss": 0.0011, "num_tokens": 35761108.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 771 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 718.0, "completions/max_terminated_length": 718.0, "completions/mean_length": 257.5625, "completions/mean_terminated_length": 257.5625, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 0.08189243661822425, "frac_reward_zero_std": 0.0, "grad_norm": 1.609375, "kl": 0.015047145425342023, "learning_rate": 7.6916e-06, "loss": -0.0131, "num_tokens": 35800998.0, "reward": 3.958484649658203, "reward_std": 0.23484668135643005, "rewards/reward_fn/mean": 3.958484649658203, "rewards/reward_fn/std": 0.23484671115875244, "step": 772 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1469.0, "completions/max_terminated_length": 1469.0, "completions/mean_length": 406.8125, "completions/mean_terminated_length": 406.8125, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "epoch": 0.08199851490399915, "frac_reward_zero_std": 1.0, "grad_norm": 0.05859375, "kl": 0.015709312865510583, "learning_rate": 7.6912e-06, "loss": 0.0006, "num_tokens": 35847232.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 773 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1601.0, "completions/max_terminated_length": 1601.0, "completions/mean_length": 267.65625, "completions/mean_terminated_length": 267.65625, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 0.08210459318977405, "frac_reward_zero_std": 0.0, "grad_norm": 1.671875, "kl": 0.017633047536946833, "learning_rate": 7.6908e-06, "loss": 0.1638, "num_tokens": 35907957.0, "reward": 2.982841730117798, "reward_std": 0.028663719072937965, "rewards/reward_fn/mean": 2.982841730117798, "rewards/reward_fn/std": 0.028663722798228264, "step": 774 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 933.0, "completions/max_terminated_length": 933.0, "completions/mean_length": 294.3125, "completions/mean_terminated_length": 294.3125, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 0.08221067147554896, "frac_reward_zero_std": 0.0, "grad_norm": 1.53125, "kl": 0.015748307458125055, "learning_rate": 7.6904e-06, "loss": -0.0098, "num_tokens": 35951743.0, "reward": 3.4863507747650146, "reward_std": 0.9863615036010742, "rewards/reward_fn/mean": 3.4863507747650146, "rewards/reward_fn/std": 0.9863614439964294, "step": 775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1481.0, "completions/max_terminated_length": 1481.0, "completions/mean_length": 532.0625, "completions/mean_terminated_length": 532.0625, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "epoch": 0.08231674976132386, "frac_reward_zero_std": 0.0, "grad_norm": 1.2109375, "kl": 0.01679909380618483, "learning_rate": 7.69e-06, "loss": 0.0363, "num_tokens": 36001249.0, "reward": 3.2947301864624023, "reward_std": 0.632847249507904, "rewards/reward_fn/mean": 3.2947301864624023, "rewards/reward_fn/std": 0.6328471899032593, "step": 776 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1166.0, "completions/mean_length": 421.5, "completions/mean_terminated_length": 369.0322570800781, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "epoch": 0.08242282804709876, "frac_reward_zero_std": 0.0, "grad_norm": 1.8203125, "kl": 0.020656271022744477, "learning_rate": 7.6896e-06, "loss": 0.2965, "num_tokens": 36050065.0, "reward": 2.78275203704834, "reward_std": 0.5125721096992493, "rewards/reward_fn/mean": 2.78275203704834, "rewards/reward_fn/std": 0.5125721096992493, "step": 777 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1732.0, "completions/max_terminated_length": 1732.0, "completions/mean_length": 440.625, "completions/mean_terminated_length": 440.625, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "epoch": 0.08252890633287366, "frac_reward_zero_std": 0.0, "grad_norm": 1.3984375, "kl": 0.01858210703358054, "learning_rate": 7.6892e-06, "loss": 0.1469, "num_tokens": 36102053.0, "reward": 3.6386284828186035, "reward_std": 0.5598682761192322, "rewards/reward_fn/mean": 3.6386284828186035, "rewards/reward_fn/std": 0.559868335723877, "step": 778 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 628.0, "completions/max_terminated_length": 628.0, "completions/mean_length": 107.40625, "completions/mean_terminated_length": 107.40625, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "epoch": 0.08263498461864856, "frac_reward_zero_std": 1.0, "grad_norm": 0.111328125, "kl": 0.017615402408409864, "learning_rate": 7.6888e-06, "loss": 0.0007, "num_tokens": 36141522.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 779 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 888.0, "completions/max_terminated_length": 888.0, "completions/mean_length": 498.15625, "completions/mean_terminated_length": 498.15625, "completions/min_length": 309.0, "completions/min_terminated_length": 309.0, "epoch": 0.08274106290442347, "frac_reward_zero_std": 0.0, "grad_norm": 1.0234375, "kl": 0.020500344457104802, "learning_rate": 7.688399999999999e-06, "loss": 0.0002, "num_tokens": 36191415.0, "reward": 2.5993824005126953, "reward_std": 0.1720532327890396, "rewards/reward_fn/mean": 2.5993824005126953, "rewards/reward_fn/std": 0.1720532774925232, "step": 780 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 467.0, "completions/max_terminated_length": 467.0, "completions/mean_length": 277.96875, "completions/mean_terminated_length": 277.96875, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 0.08284714119019837, "frac_reward_zero_std": 0.0, "grad_norm": 1.6796875, "kl": 0.01927991397678852, "learning_rate": 7.688e-06, "loss": 0.0236, "num_tokens": 36230550.0, "reward": 2.7684693336486816, "reward_std": 0.029289107769727707, "rewards/reward_fn/mean": 2.7684693336486816, "rewards/reward_fn/std": 0.029289091005921364, "step": 781 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 390.0, "completions/max_terminated_length": 390.0, "completions/mean_length": 131.03125, "completions/mean_terminated_length": 131.03125, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.08295321947597327, "frac_reward_zero_std": 0.0, "grad_norm": 1.765625, "kl": 0.010917730629444122, "learning_rate": 7.687599999999999e-06, "loss": 0.0665, "num_tokens": 36268119.0, "reward": 3.0935535430908203, "reward_std": 0.016159607097506523, "rewards/reward_fn/mean": 3.0935535430908203, "rewards/reward_fn/std": 0.016159581020474434, "step": 782 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1797.0, "completions/max_terminated_length": 1797.0, "completions/mean_length": 546.78125, "completions/mean_terminated_length": 546.78125, "completions/min_length": 343.0, "completions/min_terminated_length": 343.0, "epoch": 0.08305929776174817, "frac_reward_zero_std": 0.0, "grad_norm": 1.0546875, "kl": 0.010865212534554303, "learning_rate": 7.6872e-06, "loss": 0.0287, "num_tokens": 36342576.0, "reward": 3.487700939178467, "reward_std": 0.5917753577232361, "rewards/reward_fn/mean": 3.487700939178467, "rewards/reward_fn/std": 0.5917754173278809, "step": 783 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1078.0, "completions/max_terminated_length": 1078.0, "completions/mean_length": 360.71875, "completions/mean_terminated_length": 360.71875, "completions/min_length": 218.0, "completions/min_terminated_length": 218.0, "epoch": 0.08316537604752307, "frac_reward_zero_std": 0.0, "grad_norm": 1.640625, "kl": 0.024502220330759883, "learning_rate": 7.686799999999999e-06, "loss": 0.0588, "num_tokens": 36373287.0, "reward": 3.8351099491119385, "reward_std": 0.3897174298763275, "rewards/reward_fn/mean": 3.8351099491119385, "rewards/reward_fn/std": 0.38971734046936035, "step": 784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1505.0, "completions/mean_length": 666.5625, "completions/mean_terminated_length": 574.4666748046875, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "epoch": 0.08327145433329797, "frac_reward_zero_std": 0.0, "grad_norm": 1.5546875, "kl": 0.01942377514205873, "learning_rate": 7.6864e-06, "loss": 0.3233, "num_tokens": 36432665.0, "reward": 2.4874258041381836, "reward_std": 0.7904840111732483, "rewards/reward_fn/mean": 2.4874258041381836, "rewards/reward_fn/std": 0.7904840111732483, "step": 785 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 562.0, "completions/max_terminated_length": 562.0, "completions/mean_length": 242.3125, "completions/mean_terminated_length": 242.3125, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.08337753261907288, "frac_reward_zero_std": 0.0, "grad_norm": 1.8046875, "kl": 0.031069141812622547, "learning_rate": 7.685999999999999e-06, "loss": 0.0068, "num_tokens": 36487043.0, "reward": 3.757603168487549, "reward_std": 0.4073527753353119, "rewards/reward_fn/mean": 3.757603168487549, "rewards/reward_fn/std": 0.4073527753353119, "step": 786 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 897.0, "completions/max_terminated_length": 897.0, "completions/mean_length": 190.9375, "completions/mean_terminated_length": 190.9375, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 0.08348361090484778, "frac_reward_zero_std": 0.0, "grad_norm": 1.3984375, "kl": 0.023711836896836758, "learning_rate": 7.6856e-06, "loss": 0.0631, "num_tokens": 36527585.0, "reward": 2.923456907272339, "reward_std": 0.28942155838012695, "rewards/reward_fn/mean": 2.923456907272339, "rewards/reward_fn/std": 0.28942152857780457, "step": 787 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 724.0, "completions/max_terminated_length": 724.0, "completions/mean_length": 271.09375, "completions/mean_terminated_length": 271.09375, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "epoch": 0.08358968919062268, "frac_reward_zero_std": 0.0, "grad_norm": 2.015625, "kl": 0.020150523632764816, "learning_rate": 7.685199999999999e-06, "loss": 0.0728, "num_tokens": 36567908.0, "reward": 3.884533405303955, "reward_std": 0.47682517766952515, "rewards/reward_fn/mean": 3.884533405303955, "rewards/reward_fn/std": 0.47682514786720276, "step": 788 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 603.0, "completions/max_terminated_length": 603.0, "completions/mean_length": 202.4375, "completions/mean_terminated_length": 202.4375, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.08369576747639758, "frac_reward_zero_std": 0.0, "grad_norm": 2.046875, "kl": 0.03790642227977514, "learning_rate": 7.6848e-06, "loss": 0.0211, "num_tokens": 36607410.0, "reward": 3.8924498558044434, "reward_std": 0.3397402763366699, "rewards/reward_fn/mean": 3.8924498558044434, "rewards/reward_fn/std": 0.3397402763366699, "step": 789 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 991.0, "completions/max_terminated_length": 991.0, "completions/mean_length": 501.4375, "completions/mean_terminated_length": 501.4375, "completions/min_length": 324.0, "completions/min_terminated_length": 324.0, "epoch": 0.08380184576217248, "frac_reward_zero_std": 0.0, "grad_norm": 1.53125, "kl": 0.02125004376284778, "learning_rate": 7.6844e-06, "loss": 0.0061, "num_tokens": 36672960.0, "reward": 2.670154571533203, "reward_std": 0.5752670168876648, "rewards/reward_fn/mean": 2.670154571533203, "rewards/reward_fn/std": 0.57526695728302, "step": 790 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 717.0, "completions/max_terminated_length": 717.0, "completions/mean_length": 455.21875, "completions/mean_terminated_length": 455.21875, "completions/min_length": 310.0, "completions/min_terminated_length": 310.0, "epoch": 0.08390792404794739, "frac_reward_zero_std": 0.0, "grad_norm": 1.1640625, "kl": 0.020999554311856627, "learning_rate": 7.684e-06, "loss": 0.0642, "num_tokens": 36732487.0, "reward": 3.457927942276001, "reward_std": 0.5872803330421448, "rewards/reward_fn/mean": 3.457927942276001, "rewards/reward_fn/std": 0.5872803330421448, "step": 791 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 982.0, "completions/max_terminated_length": 982.0, "completions/mean_length": 186.1875, "completions/mean_terminated_length": 186.1875, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 0.08401400233372229, "frac_reward_zero_std": 1.0, "grad_norm": 0.1826171875, "kl": 0.032360953744500875, "learning_rate": 7.6836e-06, "loss": 0.0013, "num_tokens": 36765805.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 792 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 218.0, "completions/max_terminated_length": 218.0, "completions/mean_length": 151.46875, "completions/mean_terminated_length": 151.46875, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 0.08412008061949719, "frac_reward_zero_std": 1.0, "grad_norm": 0.1279296875, "kl": 0.021889066323637962, "learning_rate": 7.6832e-06, "loss": 0.0009, "num_tokens": 36804956.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 793 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 992.0, "completions/max_terminated_length": 992.0, "completions/mean_length": 199.96875, "completions/mean_terminated_length": 199.96875, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.08422615890527209, "frac_reward_zero_std": 1.0, "grad_norm": 0.09814453125, "kl": 0.018962694564834237, "learning_rate": 7.6828e-06, "loss": 0.0008, "num_tokens": 36844315.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 794 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1186.0, "completions/max_terminated_length": 1186.0, "completions/mean_length": 337.875, "completions/mean_terminated_length": 337.875, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "epoch": 0.08433223719104699, "frac_reward_zero_std": 0.0, "grad_norm": 1.703125, "kl": 0.021812525810673833, "learning_rate": 7.6824e-06, "loss": 0.1178, "num_tokens": 36890487.0, "reward": 3.9017152786254883, "reward_std": 0.3109425902366638, "rewards/reward_fn/mean": 3.9017152786254883, "rewards/reward_fn/std": 0.3109425902366638, "step": 795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1777.0, "completions/mean_length": 704.53125, "completions/mean_terminated_length": 614.9666748046875, "completions/min_length": 258.0, "completions/min_terminated_length": 258.0, "epoch": 0.0844383154768219, "frac_reward_zero_std": 0.0, "grad_norm": 1.6484375, "kl": 0.02236817847006023, "learning_rate": 7.682e-06, "loss": 0.3403, "num_tokens": 36943112.0, "reward": 2.4587063789367676, "reward_std": 0.5394301414489746, "rewards/reward_fn/mean": 2.4587063789367676, "rewards/reward_fn/std": 0.5394301414489746, "step": 796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1273.0, "completions/max_terminated_length": 1273.0, "completions/mean_length": 536.34375, "completions/mean_terminated_length": 536.34375, "completions/min_length": 246.0, "completions/min_terminated_length": 246.0, "epoch": 0.0845443937625968, "frac_reward_zero_std": 0.0, "grad_norm": 1.375, "kl": 0.02385992626659572, "learning_rate": 7.6816e-06, "loss": 0.0694, "num_tokens": 36997363.0, "reward": 2.4099903106689453, "reward_std": 0.5640437602996826, "rewards/reward_fn/mean": 2.4099903106689453, "rewards/reward_fn/std": 0.5640437602996826, "step": 797 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 213.0, "completions/max_terminated_length": 213.0, "completions/mean_length": 149.15625, "completions/mean_terminated_length": 149.15625, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 0.0846504720483717, "frac_reward_zero_std": 1.0, "grad_norm": 0.091796875, "kl": 0.017519668908789754, "learning_rate": 7.681199999999999e-06, "loss": 0.0007, "num_tokens": 37028024.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 798 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1117.0, "completions/max_terminated_length": 1117.0, "completions/mean_length": 313.40625, "completions/mean_terminated_length": 313.40625, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 0.0847565503341466, "frac_reward_zero_std": 0.0, "grad_norm": 1.9765625, "kl": 0.022777023958042264, "learning_rate": 7.6808e-06, "loss": 0.0037, "num_tokens": 37076261.0, "reward": 3.26804256439209, "reward_std": 0.7380927801132202, "rewards/reward_fn/mean": 3.26804256439209, "rewards/reward_fn/std": 0.7380927801132202, "step": 799 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1002.0, "completions/max_terminated_length": 1002.0, "completions/mean_length": 220.0, "completions/mean_terminated_length": 220.0, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 0.0848626286199215, "frac_reward_zero_std": 0.0, "grad_norm": 2.234375, "kl": 0.04452996770851314, "learning_rate": 7.680399999999998e-06, "loss": 0.074, "num_tokens": 37114917.0, "reward": 3.9180896282196045, "reward_std": 0.26771649718284607, "rewards/reward_fn/mean": 3.9180896282196045, "rewards/reward_fn/std": 0.26771649718284607, "step": 800 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 929.0, "completions/max_terminated_length": 929.0, "completions/mean_length": 374.96875, "completions/mean_terminated_length": 374.96875, "completions/min_length": 232.0, "completions/min_terminated_length": 232.0, "epoch": 0.0849687069056964, "frac_reward_zero_std": 0.0, "grad_norm": 1.3515625, "kl": 0.01646363304462284, "learning_rate": 7.68e-06, "loss": 0.0849, "num_tokens": 37163716.0, "reward": 2.890430450439453, "reward_std": 0.055527813732624054, "rewards/reward_fn/mean": 2.890430450439453, "rewards/reward_fn/std": 0.05552782118320465, "step": 801 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 709.0, "completions/max_terminated_length": 709.0, "completions/mean_length": 156.5, "completions/mean_terminated_length": 156.5, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "epoch": 0.08507478519147131, "frac_reward_zero_std": 1.0, "grad_norm": 0.16796875, "kl": 0.03423686744645238, "learning_rate": 7.6796e-06, "loss": 0.0014, "num_tokens": 37213556.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 802 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 926.0, "completions/max_terminated_length": 926.0, "completions/mean_length": 260.8125, "completions/mean_terminated_length": 260.8125, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 0.08518086347724621, "frac_reward_zero_std": 0.0, "grad_norm": 1.6875, "kl": 0.01638385117985308, "learning_rate": 7.6792e-06, "loss": -0.076, "num_tokens": 37258478.0, "reward": 3.609133243560791, "reward_std": 0.5494344234466553, "rewards/reward_fn/mean": 3.609133243560791, "rewards/reward_fn/std": 0.5494344234466553, "step": 803 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1347.0, "completions/max_terminated_length": 1347.0, "completions/mean_length": 448.46875, "completions/mean_terminated_length": 448.46875, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "epoch": 0.08528694176302111, "frac_reward_zero_std": 0.0, "grad_norm": 1.40625, "kl": 0.020947554847225547, "learning_rate": 7.6788e-06, "loss": -0.1078, "num_tokens": 37296669.0, "reward": 3.074223518371582, "reward_std": 0.5923977494239807, "rewards/reward_fn/mean": 3.074223518371582, "rewards/reward_fn/std": 0.5923976898193359, "step": 804 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 421.0, "completions/max_terminated_length": 421.0, "completions/mean_length": 110.90625, "completions/mean_terminated_length": 110.90625, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "epoch": 0.08539302004879601, "frac_reward_zero_std": 1.0, "grad_norm": 0.1103515625, "kl": 0.017318370169959962, "learning_rate": 7.6784e-06, "loss": 0.0007, "num_tokens": 37333658.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 306.0, "completions/max_terminated_length": 306.0, "completions/mean_length": 197.4375, "completions/mean_terminated_length": 197.4375, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.08549909833457091, "frac_reward_zero_std": 0.0, "grad_norm": 2.4375, "kl": 0.02576698805205524, "learning_rate": 7.678e-06, "loss": 0.0374, "num_tokens": 37370536.0, "reward": 3.7869386672973633, "reward_std": 0.6730425357818604, "rewards/reward_fn/mean": 3.7869386672973633, "rewards/reward_fn/std": 0.6730424761772156, "step": 806 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 586.0, "completions/max_terminated_length": 586.0, "completions/mean_length": 171.59375, "completions/mean_terminated_length": 171.59375, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 0.08560517662034582, "frac_reward_zero_std": 0.0, "grad_norm": 2.234375, "kl": 0.022562434431165457, "learning_rate": 7.677599999999999e-06, "loss": 0.017, "num_tokens": 37412859.0, "reward": 3.9462358951568604, "reward_std": 0.21158860623836517, "rewards/reward_fn/mean": 3.9462358951568604, "rewards/reward_fn/std": 0.21158860623836517, "step": 807 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 796.0, "completions/max_terminated_length": 796.0, "completions/mean_length": 177.53125, "completions/mean_terminated_length": 177.53125, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "epoch": 0.08571125490612072, "frac_reward_zero_std": 1.0, "grad_norm": 0.19921875, "kl": 0.029298200272023678, "learning_rate": 7.6772e-06, "loss": 0.0012, "num_tokens": 37452204.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 808 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 640.0, "completions/max_terminated_length": 640.0, "completions/mean_length": 171.09375, "completions/mean_terminated_length": 171.09375, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 0.08581733319189562, "frac_reward_zero_std": 1.0, "grad_norm": 0.130859375, "kl": 0.028025910491123796, "learning_rate": 7.676799999999999e-06, "loss": 0.0011, "num_tokens": 37508335.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 809 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 811.0, "completions/max_terminated_length": 811.0, "completions/mean_length": 263.25, "completions/mean_terminated_length": 263.25, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 0.08592341147767052, "frac_reward_zero_std": 0.0, "grad_norm": 2.203125, "kl": 0.020565056474879384, "learning_rate": 7.6764e-06, "loss": -0.0028, "num_tokens": 37538775.0, "reward": 3.859889507293701, "reward_std": 0.5514804124832153, "rewards/reward_fn/mean": 3.859889507293701, "rewards/reward_fn/std": 0.5514804124832153, "step": 810 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1669.0, "completions/max_terminated_length": 1669.0, "completions/mean_length": 508.625, "completions/mean_terminated_length": 508.625, "completions/min_length": 228.0, "completions/min_terminated_length": 228.0, "epoch": 0.08602948976344542, "frac_reward_zero_std": 0.0, "grad_norm": 1.21875, "kl": 0.016198519384488463, "learning_rate": 7.675999999999999e-06, "loss": 0.1003, "num_tokens": 37572555.0, "reward": 3.0730252265930176, "reward_std": 0.4106665253639221, "rewards/reward_fn/mean": 3.0730252265930176, "rewards/reward_fn/std": 0.4106665253639221, "step": 811 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1107.0, "completions/max_terminated_length": 1107.0, "completions/mean_length": 251.84375, "completions/mean_terminated_length": 251.84375, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 0.08613556804922032, "frac_reward_zero_std": 0.0, "grad_norm": 2.40625, "kl": 0.024606948718428612, "learning_rate": 7.6756e-06, "loss": 0.2776, "num_tokens": 37617158.0, "reward": 3.967801332473755, "reward_std": 0.18214285373687744, "rewards/reward_fn/mean": 3.967801332473755, "rewards/reward_fn/std": 0.18214282393455505, "step": 812 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1282.0, "completions/max_terminated_length": 1282.0, "completions/mean_length": 566.5, "completions/mean_terminated_length": 566.5, "completions/min_length": 353.0, "completions/min_terminated_length": 353.0, "epoch": 0.08624164633499523, "frac_reward_zero_std": 0.0, "grad_norm": 1.2890625, "kl": 0.028156861662864685, "learning_rate": 7.6752e-06, "loss": 0.0301, "num_tokens": 37669654.0, "reward": 2.7869529724121094, "reward_std": 0.3411664366722107, "rewards/reward_fn/mean": 2.7869529724121094, "rewards/reward_fn/std": 0.3411664366722107, "step": 813 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 397.0, "completions/max_terminated_length": 397.0, "completions/mean_length": 137.625, "completions/mean_terminated_length": 137.625, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "epoch": 0.08634772462077013, "frac_reward_zero_std": 1.0, "grad_norm": 0.12890625, "kl": 0.021783202653750777, "learning_rate": 7.6748e-06, "loss": 0.0009, "num_tokens": 37707338.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 814 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 776.0, "completions/max_terminated_length": 776.0, "completions/mean_length": 268.15625, "completions/mean_terminated_length": 268.15625, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.08645380290654503, "frac_reward_zero_std": 0.0, "grad_norm": 1.5078125, "kl": 0.020849171094596386, "learning_rate": 7.6744e-06, "loss": -0.0546, "num_tokens": 37751279.0, "reward": 3.7785048484802246, "reward_std": 0.6255349516868591, "rewards/reward_fn/mean": 3.7785048484802246, "rewards/reward_fn/std": 0.6255349516868591, "step": 815 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 97.0, "completions/max_terminated_length": 97.0, "completions/mean_length": 66.34375, "completions/mean_terminated_length": 66.34375, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 0.08655988119231993, "frac_reward_zero_std": 1.0, "grad_norm": 0.115234375, "kl": 0.01903400884475559, "learning_rate": 7.674e-06, "loss": 0.0008, "num_tokens": 37785402.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 816 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 609.0, "completions/max_terminated_length": 609.0, "completions/mean_length": 249.0625, "completions/mean_terminated_length": 249.0625, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "epoch": 0.08666595947809483, "frac_reward_zero_std": 1.0, "grad_norm": 0.0751953125, "kl": 0.018705508206039667, "learning_rate": 7.6736e-06, "loss": 0.0007, "num_tokens": 37835932.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 817 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1291.0, "completions/max_terminated_length": 1291.0, "completions/mean_length": 434.9375, "completions/mean_terminated_length": 434.9375, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 0.08677203776386974, "frac_reward_zero_std": 0.0, "grad_norm": 1.4609375, "kl": 0.021974525414407253, "learning_rate": 7.6732e-06, "loss": -0.028, "num_tokens": 37887770.0, "reward": 3.078002452850342, "reward_std": 0.9280657172203064, "rewards/reward_fn/mean": 3.078002452850342, "rewards/reward_fn/std": 0.9280656576156616, "step": 818 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 588.0, "completions/max_terminated_length": 588.0, "completions/mean_length": 151.3125, "completions/mean_terminated_length": 151.3125, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 0.08687811604964464, "frac_reward_zero_std": 0.0, "grad_norm": 2.8125, "kl": 0.02121459529735148, "learning_rate": 7.672799999999999e-06, "loss": 0.0208, "num_tokens": 37912644.0, "reward": 3.916912317276001, "reward_std": 0.2673051953315735, "rewards/reward_fn/mean": 3.916912317276001, "rewards/reward_fn/std": 0.2673051655292511, "step": 819 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1782.0, "completions/max_terminated_length": 1782.0, "completions/mean_length": 596.375, "completions/mean_terminated_length": 596.375, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "epoch": 0.08698419433541954, "frac_reward_zero_std": 0.0, "grad_norm": 1.0703125, "kl": 0.01679900661110878, "learning_rate": 7.6724e-06, "loss": 0.211, "num_tokens": 37971696.0, "reward": 2.634601593017578, "reward_std": 0.8135941624641418, "rewards/reward_fn/mean": 2.634601593017578, "rewards/reward_fn/std": 0.8135941028594971, "step": 820 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 248.0, "completions/max_terminated_length": 248.0, "completions/mean_length": 80.15625, "completions/mean_terminated_length": 80.15625, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 0.08709027262119444, "frac_reward_zero_std": 1.0, "grad_norm": 0.1142578125, "kl": 0.014007492223754525, "learning_rate": 7.671999999999999e-06, "loss": 0.0006, "num_tokens": 37994645.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 821 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1119.0, "completions/max_terminated_length": 1119.0, "completions/mean_length": 587.875, "completions/mean_terminated_length": 587.875, "completions/min_length": 276.0, "completions/min_terminated_length": 276.0, "epoch": 0.08719635090696934, "frac_reward_zero_std": 0.0, "grad_norm": 1.1484375, "kl": 0.016050471109338105, "learning_rate": 7.6716e-06, "loss": 0.0154, "num_tokens": 38054577.0, "reward": 2.849022388458252, "reward_std": 0.03270436450839043, "rewards/reward_fn/mean": 2.849022388458252, "rewards/reward_fn/std": 0.03270438313484192, "step": 822 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 423.0, "completions/max_terminated_length": 423.0, "completions/mean_length": 134.9375, "completions/mean_terminated_length": 134.9375, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "epoch": 0.08730242919274425, "frac_reward_zero_std": 1.0, "grad_norm": 0.10302734375, "kl": 0.021432526409626007, "learning_rate": 7.671199999999999e-06, "loss": 0.0009, "num_tokens": 38093487.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 823 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1193.0, "completions/max_terminated_length": 1193.0, "completions/mean_length": 314.75, "completions/mean_terminated_length": 314.75, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 0.08740850747851915, "frac_reward_zero_std": 1.0, "grad_norm": 0.06103515625, "kl": 0.012500970042310655, "learning_rate": 7.6708e-06, "loss": 0.0005, "num_tokens": 38138407.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1915.0, "completions/mean_length": 789.125, "completions/mean_terminated_length": 748.51611328125, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 0.08751458576429405, "frac_reward_zero_std": 0.0, "grad_norm": 0.9375, "kl": 0.014306797063909471, "learning_rate": 7.6704e-06, "loss": -0.0029, "num_tokens": 38237739.0, "reward": 2.5165700912475586, "reward_std": 0.8159104585647583, "rewards/reward_fn/mean": 2.5165700912475586, "rewards/reward_fn/std": 0.8159104585647583, "step": 825 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1498.0, "completions/max_terminated_length": 1498.0, "completions/mean_length": 484.15625, "completions/mean_terminated_length": 484.15625, "completions/min_length": 313.0, "completions/min_terminated_length": 313.0, "epoch": 0.08762066405006895, "frac_reward_zero_std": 0.0, "grad_norm": 1.1796875, "kl": 0.015709159546531737, "learning_rate": 7.67e-06, "loss": 0.1323, "num_tokens": 38306224.0, "reward": 2.799229860305786, "reward_std": 0.059028059244155884, "rewards/reward_fn/mean": 2.799229860305786, "rewards/reward_fn/std": 0.059028033167123795, "step": 826 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1373.0, "completions/max_terminated_length": 1373.0, "completions/mean_length": 325.90625, "completions/mean_terminated_length": 325.90625, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "epoch": 0.08772674233584385, "frac_reward_zero_std": 0.0, "grad_norm": 1.28125, "kl": 0.021479285322129726, "learning_rate": 7.6696e-06, "loss": -0.0611, "num_tokens": 38359309.0, "reward": 3.9237232208251953, "reward_std": 0.30016380548477173, "rewards/reward_fn/mean": 3.9237232208251953, "rewards/reward_fn/std": 0.3001638352870941, "step": 827 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1149.0, "completions/max_terminated_length": 1149.0, "completions/mean_length": 188.53125, "completions/mean_terminated_length": 188.53125, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "epoch": 0.08783282062161875, "frac_reward_zero_std": 0.0, "grad_norm": 2.578125, "kl": 0.02761186519637704, "learning_rate": 7.6692e-06, "loss": -0.0688, "num_tokens": 38397758.0, "reward": 3.613966464996338, "reward_std": 0.5824852585792542, "rewards/reward_fn/mean": 3.613966464996338, "rewards/reward_fn/std": 0.5824853181838989, "step": 828 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 565.0, "completions/max_terminated_length": 565.0, "completions/mean_length": 178.53125, "completions/mean_terminated_length": 178.53125, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.08793889890739366, "frac_reward_zero_std": 1.0, "grad_norm": 0.0732421875, "kl": 0.01873406278900802, "learning_rate": 7.6688e-06, "loss": 0.0007, "num_tokens": 38436335.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 829 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1652.0, "completions/max_terminated_length": 1652.0, "completions/mean_length": 360.3125, "completions/mean_terminated_length": 360.3125, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "epoch": 0.08804497719316856, "frac_reward_zero_std": 0.0, "grad_norm": 2.015625, "kl": 0.02481435379013419, "learning_rate": 7.6684e-06, "loss": -0.0879, "num_tokens": 38459545.0, "reward": 2.863328456878662, "reward_std": 0.4318251311779022, "rewards/reward_fn/mean": 2.863328456878662, "rewards/reward_fn/std": 0.43182510137557983, "step": 830 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 536.0, "completions/max_terminated_length": 536.0, "completions/mean_length": 154.5, "completions/mean_terminated_length": 154.5, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.08815105547894346, "frac_reward_zero_std": 0.0, "grad_norm": 3.265625, "kl": 0.024363160831853747, "learning_rate": 7.668e-06, "loss": 0.125, "num_tokens": 38510345.0, "reward": 3.92673659324646, "reward_std": 0.4144406020641327, "rewards/reward_fn/mean": 3.92673659324646, "rewards/reward_fn/std": 0.4144406318664551, "step": 831 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 394.0, "completions/max_terminated_length": 394.0, "completions/mean_length": 254.53125, "completions/mean_terminated_length": 254.53125, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 0.08825713376471836, "frac_reward_zero_std": 0.0, "grad_norm": 1.234375, "kl": 0.021779751870781183, "learning_rate": 7.6676e-06, "loss": -0.0541, "num_tokens": 38568538.0, "reward": 3.9609336853027344, "reward_std": 0.2209915965795517, "rewards/reward_fn/mean": 3.9609336853027344, "rewards/reward_fn/std": 0.2209915667772293, "step": 832 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 838.0, "completions/mean_length": 539.09375, "completions/mean_terminated_length": 490.4193420410156, "completions/min_length": 223.0, "completions/min_terminated_length": 223.0, "epoch": 0.08836321205049326, "frac_reward_zero_std": 0.0, "grad_norm": 1.28125, "kl": 0.01555003086104989, "learning_rate": 7.6672e-06, "loss": 0.1733, "num_tokens": 38624541.0, "reward": 2.5395612716674805, "reward_std": 0.5905151963233948, "rewards/reward_fn/mean": 2.5395612716674805, "rewards/reward_fn/std": 0.5905151963233948, "step": 833 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 798.0, "completions/mean_length": 550.09375, "completions/mean_terminated_length": 501.774169921875, "completions/min_length": 289.0, "completions/min_terminated_length": 289.0, "epoch": 0.08846929033626817, "frac_reward_zero_std": 0.0, "grad_norm": 1.1484375, "kl": 0.011983388103544712, "learning_rate": 7.6668e-06, "loss": 0.1678, "num_tokens": 38679872.0, "reward": 2.6343441009521484, "reward_std": 0.5172379016876221, "rewards/reward_fn/mean": 2.6343441009521484, "rewards/reward_fn/std": 0.5172379016876221, "step": 834 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1709.0, "completions/max_terminated_length": 1709.0, "completions/mean_length": 458.5, "completions/mean_terminated_length": 458.5, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 0.08857536862204307, "frac_reward_zero_std": 1.0, "grad_norm": 0.0458984375, "kl": 0.015996047877706587, "learning_rate": 7.6664e-06, "loss": 0.0006, "num_tokens": 38727152.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 835 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1978.0, "completions/max_terminated_length": 1978.0, "completions/mean_length": 492.3125, "completions/mean_terminated_length": 492.3125, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "epoch": 0.08868144690781797, "frac_reward_zero_std": 0.0, "grad_norm": 1.359375, "kl": 0.017040529986843467, "learning_rate": 7.666e-06, "loss": 0.1084, "num_tokens": 38779738.0, "reward": 3.5246658325195312, "reward_std": 0.5842354893684387, "rewards/reward_fn/mean": 3.5246658325195312, "rewards/reward_fn/std": 0.584235429763794, "step": 836 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 922.0, "completions/max_terminated_length": 922.0, "completions/mean_length": 243.46875, "completions/mean_terminated_length": 243.46875, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 0.08878752519359287, "frac_reward_zero_std": 1.0, "grad_norm": 0.076171875, "kl": 0.01812937785871327, "learning_rate": 7.6656e-06, "loss": 0.0007, "num_tokens": 38807785.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 837 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 907.0, "completions/max_terminated_length": 907.0, "completions/mean_length": 233.59375, "completions/mean_terminated_length": 233.59375, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 0.08889360347936777, "frac_reward_zero_std": 1.0, "grad_norm": 0.11181640625, "kl": 0.02154145378153771, "learning_rate": 7.665199999999999e-06, "loss": 0.0009, "num_tokens": 38860764.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 838 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1264.0, "completions/max_terminated_length": 1264.0, "completions/mean_length": 482.875, "completions/mean_terminated_length": 482.875, "completions/min_length": 297.0, "completions/min_terminated_length": 297.0, "epoch": 0.08899968176514267, "frac_reward_zero_std": 0.0, "grad_norm": 1.2890625, "kl": 0.02117840899154544, "learning_rate": 7.6648e-06, "loss": 0.0261, "num_tokens": 38892632.0, "reward": 2.9815337657928467, "reward_std": 0.5712193846702576, "rewards/reward_fn/mean": 2.9815337657928467, "rewards/reward_fn/std": 0.5712193250656128, "step": 839 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 407.0, "completions/max_terminated_length": 407.0, "completions/mean_length": 96.15625, "completions/mean_terminated_length": 96.15625, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "epoch": 0.08910576005091758, "frac_reward_zero_std": 1.0, "grad_norm": 0.119140625, "kl": 0.012993088574148715, "learning_rate": 7.664399999999999e-06, "loss": 0.0005, "num_tokens": 38921501.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 840 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1709.0, "completions/max_terminated_length": 1709.0, "completions/mean_length": 292.4375, "completions/mean_terminated_length": 292.4375, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 0.08921183833669248, "frac_reward_zero_std": 1.0, "grad_norm": 0.10400390625, "kl": 0.023782695876434445, "learning_rate": 7.664e-06, "loss": 0.001, "num_tokens": 38963435.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 841 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 355.0, "completions/max_terminated_length": 355.0, "completions/mean_length": 251.90625, "completions/mean_terminated_length": 251.90625, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "epoch": 0.08931791662246738, "frac_reward_zero_std": 0.0, "grad_norm": 1.6328125, "kl": 0.021294391248375177, "learning_rate": 7.663599999999999e-06, "loss": -0.0354, "num_tokens": 39018280.0, "reward": 3.854649305343628, "reward_std": 0.3912486135959625, "rewards/reward_fn/mean": 3.854649305343628, "rewards/reward_fn/std": 0.39124858379364014, "step": 842 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1967.0, "completions/max_terminated_length": 1967.0, "completions/mean_length": 521.6875, "completions/mean_terminated_length": 521.6875, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "epoch": 0.08942399490824228, "frac_reward_zero_std": 0.0, "grad_norm": 1.421875, "kl": 0.02067981311120093, "learning_rate": 7.6632e-06, "loss": 0.0552, "num_tokens": 39071550.0, "reward": 2.960012674331665, "reward_std": 0.3984401524066925, "rewards/reward_fn/mean": 2.960012674331665, "rewards/reward_fn/std": 0.3984401226043701, "step": 843 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1737.0, "completions/max_terminated_length": 1737.0, "completions/mean_length": 478.0, "completions/mean_terminated_length": 478.0, "completions/min_length": 233.0, "completions/min_terminated_length": 233.0, "epoch": 0.08953007319401718, "frac_reward_zero_std": 0.0, "grad_norm": 1.3046875, "kl": 0.019556208280846477, "learning_rate": 7.662799999999999e-06, "loss": 0.054, "num_tokens": 39119614.0, "reward": 1.653461217880249, "reward_std": 0.041531752794981, "rewards/reward_fn/mean": 1.653461217880249, "rewards/reward_fn/std": 0.041531749069690704, "step": 844 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1977.0, "completions/mean_length": 608.5625, "completions/mean_terminated_length": 562.1290283203125, "completions/min_length": 295.0, "completions/min_terminated_length": 295.0, "epoch": 0.08963615147979209, "frac_reward_zero_std": 0.0, "grad_norm": 1.1484375, "kl": 0.017408509156666696, "learning_rate": 7.6624e-06, "loss": 0.1428, "num_tokens": 39176944.0, "reward": 2.5016188621520996, "reward_std": 0.5439700484275818, "rewards/reward_fn/mean": 2.5016188621520996, "rewards/reward_fn/std": 0.5439700484275818, "step": 845 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 523.0, "completions/max_terminated_length": 523.0, "completions/mean_length": 98.9375, "completions/mean_terminated_length": 98.9375, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "epoch": 0.08974222976556699, "frac_reward_zero_std": 0.0, "grad_norm": 2.671875, "kl": 0.015259308856911957, "learning_rate": 7.661999999999999e-06, "loss": 0.0593, "num_tokens": 39214222.0, "reward": 2.9853861331939697, "reward_std": 0.0448482483625412, "rewards/reward_fn/mean": 2.9853861331939697, "rewards/reward_fn/std": 0.04484826698899269, "step": 846 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1274.0, "completions/max_terminated_length": 1274.0, "completions/mean_length": 305.65625, "completions/mean_terminated_length": 305.65625, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.08984830805134189, "frac_reward_zero_std": 1.0, "grad_norm": 0.267578125, "kl": 0.020439033047296107, "learning_rate": 7.6616e-06, "loss": 0.0008, "num_tokens": 39262947.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 847 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 603.0, "completions/max_terminated_length": 603.0, "completions/mean_length": 272.21875, "completions/mean_terminated_length": 272.21875, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "epoch": 0.08995438633711679, "frac_reward_zero_std": 1.0, "grad_norm": 0.07470703125, "kl": 0.016530890949070454, "learning_rate": 7.6612e-06, "loss": 0.0007, "num_tokens": 39315562.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 848 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1454.0, "completions/mean_length": 934.9375, "completions/mean_terminated_length": 860.7333984375, "completions/min_length": 502.0, "completions/min_terminated_length": 502.0, "epoch": 0.09006046462289169, "frac_reward_zero_std": 0.0, "grad_norm": 1.0234375, "kl": 0.011996885878033936, "learning_rate": 7.6608e-06, "loss": 0.1388, "num_tokens": 39382120.0, "reward": 2.5481839179992676, "reward_std": 0.7321963906288147, "rewards/reward_fn/mean": 2.5481839179992676, "rewards/reward_fn/std": 0.7321963906288147, "step": 849 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1143.0, "completions/max_terminated_length": 1143.0, "completions/mean_length": 247.875, "completions/mean_terminated_length": 247.875, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 0.0901665429086666, "frac_reward_zero_std": 0.0, "grad_norm": 1.9921875, "kl": 0.025268358644098043, "learning_rate": 7.6604e-06, "loss": -0.0004, "num_tokens": 39427172.0, "reward": 3.895744800567627, "reward_std": 0.4322659969329834, "rewards/reward_fn/mean": 3.895744800567627, "rewards/reward_fn/std": 0.43226608633995056, "step": 850 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1147.0, "completions/max_terminated_length": 1147.0, "completions/mean_length": 307.03125, "completions/mean_terminated_length": 307.03125, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "epoch": 0.0902726211944415, "frac_reward_zero_std": 1.0, "grad_norm": 0.06494140625, "kl": 0.018672091653570533, "learning_rate": 7.66e-06, "loss": 0.0007, "num_tokens": 39471077.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 851 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1563.0, "completions/max_terminated_length": 1563.0, "completions/mean_length": 302.4375, "completions/mean_terminated_length": 302.4375, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "epoch": 0.0903786994802164, "frac_reward_zero_std": 0.0, "grad_norm": 1.78125, "kl": 0.016184502048417926, "learning_rate": 7.6596e-06, "loss": -0.0154, "num_tokens": 39516083.0, "reward": 3.964545249938965, "reward_std": 0.2005615234375, "rewards/reward_fn/mean": 3.964545249938965, "rewards/reward_fn/std": 0.2005615085363388, "step": 852 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1340.0, "completions/mean_length": 324.40625, "completions/mean_terminated_length": 268.80645751953125, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "epoch": 0.0904847777659913, "frac_reward_zero_std": 0.0, "grad_norm": 1.40625, "kl": 0.01840786065440625, "learning_rate": 7.6592e-06, "loss": 0.3014, "num_tokens": 39581696.0, "reward": 3.629117012023926, "reward_std": 0.9064881205558777, "rewards/reward_fn/mean": 3.629117012023926, "rewards/reward_fn/std": 0.9064880609512329, "step": 853 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1126.0, "completions/max_terminated_length": 1126.0, "completions/mean_length": 262.0, "completions/mean_terminated_length": 262.0, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 0.0905908560517662, "frac_reward_zero_std": 0.0, "grad_norm": 2.0625, "kl": 0.02637373749166727, "learning_rate": 7.6588e-06, "loss": 0.1283, "num_tokens": 39620544.0, "reward": 2.9511616230010986, "reward_std": 0.08586955070495605, "rewards/reward_fn/mean": 2.9511616230010986, "rewards/reward_fn/std": 0.08586958050727844, "step": 854 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1835.0, "completions/max_terminated_length": 1835.0, "completions/mean_length": 448.1875, "completions/mean_terminated_length": 448.1875, "completions/min_length": 278.0, "completions/min_terminated_length": 278.0, "epoch": 0.0906969343375411, "frac_reward_zero_std": 0.0, "grad_norm": 1.28125, "kl": 0.01755295612383634, "learning_rate": 7.6584e-06, "loss": 0.0497, "num_tokens": 39678278.0, "reward": 3.100637197494507, "reward_std": 0.4405517578125, "rewards/reward_fn/mean": 3.100637197494507, "rewards/reward_fn/std": 0.4405516982078552, "step": 855 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1361.0, "completions/max_terminated_length": 1361.0, "completions/mean_length": 370.0625, "completions/mean_terminated_length": 370.0625, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "epoch": 0.09080301262331601, "frac_reward_zero_std": 0.0, "grad_norm": 1.3359375, "kl": 0.02147255139425397, "learning_rate": 7.658e-06, "loss": 0.0513, "num_tokens": 39723720.0, "reward": 2.9578869342803955, "reward_std": 0.45630425214767456, "rewards/reward_fn/mean": 2.9578869342803955, "rewards/reward_fn/std": 0.45630425214767456, "step": 856 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 507.0, "completions/max_terminated_length": 507.0, "completions/mean_length": 160.34375, "completions/mean_terminated_length": 160.34375, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.09090909090909091, "frac_reward_zero_std": 1.0, "grad_norm": 0.134765625, "kl": 0.02221720013767481, "learning_rate": 7.6576e-06, "loss": 0.0009, "num_tokens": 39767827.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 857 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1939.0, "completions/mean_length": 474.875, "completions/mean_terminated_length": 370.0000305175781, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "epoch": 0.09101516919486581, "frac_reward_zero_std": 0.0, "grad_norm": 1.46875, "kl": 0.02296722703613341, "learning_rate": 7.657199999999998e-06, "loss": 0.1096, "num_tokens": 39814287.0, "reward": 2.8872292041778564, "reward_std": 0.912183940410614, "rewards/reward_fn/mean": 2.8872292041778564, "rewards/reward_fn/std": 0.9121840596199036, "step": 858 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1292.0, "completions/max_terminated_length": 1292.0, "completions/mean_length": 396.125, "completions/mean_terminated_length": 396.125, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.09112124748064071, "frac_reward_zero_std": 0.0, "grad_norm": 1.0546875, "kl": 0.019654609728604555, "learning_rate": 7.6568e-06, "loss": -0.0741, "num_tokens": 39861555.0, "reward": 3.9295010566711426, "reward_std": 0.39880141615867615, "rewards/reward_fn/mean": 3.9295010566711426, "rewards/reward_fn/std": 0.39880138635635376, "step": 859 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 589.0, "completions/max_terminated_length": 589.0, "completions/mean_length": 378.1875, "completions/mean_terminated_length": 378.1875, "completions/min_length": 243.0, "completions/min_terminated_length": 243.0, "epoch": 0.09122732576641561, "frac_reward_zero_std": 0.0, "grad_norm": 1.90625, "kl": 0.023176385322585702, "learning_rate": 7.6564e-06, "loss": 0.029, "num_tokens": 39908185.0, "reward": 2.712822437286377, "reward_std": 0.5235727429389954, "rewards/reward_fn/mean": 2.712822437286377, "rewards/reward_fn/std": 0.5235726833343506, "step": 860 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1054.0, "completions/max_terminated_length": 1054.0, "completions/mean_length": 582.9375, "completions/mean_terminated_length": 582.9375, "completions/min_length": 300.0, "completions/min_terminated_length": 300.0, "epoch": 0.09133340405219052, "frac_reward_zero_std": 0.0, "grad_norm": 1.1953125, "kl": 0.02485931245610118, "learning_rate": 7.656e-06, "loss": 0.0291, "num_tokens": 39970263.0, "reward": 2.0070159435272217, "reward_std": 0.5709453821182251, "rewards/reward_fn/mean": 2.0070159435272217, "rewards/reward_fn/std": 0.5709454417228699, "step": 861 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1708.0, "completions/mean_length": 727.5, "completions/mean_terminated_length": 684.9031982421875, "completions/min_length": 253.0, "completions/min_terminated_length": 253.0, "epoch": 0.09143948233796542, "frac_reward_zero_std": 0.0, "grad_norm": 1.3984375, "kl": 0.021499027032405138, "learning_rate": 7.6556e-06, "loss": 0.1757, "num_tokens": 40033191.0, "reward": 2.741589069366455, "reward_std": 0.7133889198303223, "rewards/reward_fn/mean": 2.741589069366455, "rewards/reward_fn/std": 0.713388979434967, "step": 862 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 847.0, "completions/max_terminated_length": 847.0, "completions/mean_length": 166.59375, "completions/mean_terminated_length": 166.59375, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "epoch": 0.09154556062374032, "frac_reward_zero_std": 1.0, "grad_norm": 0.1376953125, "kl": 0.01900175167247653, "learning_rate": 7.655199999999999e-06, "loss": 0.0008, "num_tokens": 40071258.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 863 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 138.9375, "completions/mean_terminated_length": 138.9375, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "epoch": 0.09165163890951522, "frac_reward_zero_std": 0.0, "grad_norm": 2.640625, "kl": 0.02012345683760941, "learning_rate": 7.6548e-06, "loss": 0.0304, "num_tokens": 40103192.0, "reward": 3.9747822284698486, "reward_std": 0.14265310764312744, "rewards/reward_fn/mean": 3.9747822284698486, "rewards/reward_fn/std": 0.14265307784080505, "step": 864 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 748.0, "completions/max_terminated_length": 748.0, "completions/mean_length": 239.40625, "completions/mean_terminated_length": 239.40625, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 0.09175771719529012, "frac_reward_zero_std": 0.0, "grad_norm": 1.9140625, "kl": 0.023207159945741296, "learning_rate": 7.654399999999999e-06, "loss": 0.0592, "num_tokens": 40141413.0, "reward": 3.481161117553711, "reward_std": 0.5994711518287659, "rewards/reward_fn/mean": 3.481161117553711, "rewards/reward_fn/std": 0.5994711518287659, "step": 865 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 539.0, "completions/max_terminated_length": 539.0, "completions/mean_length": 184.46875, "completions/mean_terminated_length": 184.46875, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 0.09186379548106502, "frac_reward_zero_std": 1.0, "grad_norm": 0.1416015625, "kl": 0.025038019753992558, "learning_rate": 7.654e-06, "loss": 0.001, "num_tokens": 40193268.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 866 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 972.0, "completions/max_terminated_length": 972.0, "completions/mean_length": 258.125, "completions/mean_terminated_length": 258.125, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 0.09196987376683993, "frac_reward_zero_std": 0.0, "grad_norm": 1.84375, "kl": 0.019463328178972006, "learning_rate": 7.653599999999999e-06, "loss": 0.0152, "num_tokens": 40232248.0, "reward": 2.827080249786377, "reward_std": 0.38547733426094055, "rewards/reward_fn/mean": 2.827080249786377, "rewards/reward_fn/std": 0.38547733426094055, "step": 867 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1029.0, "completions/max_terminated_length": 1029.0, "completions/mean_length": 183.78125, "completions/mean_terminated_length": 183.78125, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.09207595205261483, "frac_reward_zero_std": 0.0, "grad_norm": 2.6875, "kl": 0.0231759175658226, "learning_rate": 7.6532e-06, "loss": 0.3201, "num_tokens": 40283857.0, "reward": 3.9275426864624023, "reward_std": 0.40988099575042725, "rewards/reward_fn/mean": 3.9275426864624023, "rewards/reward_fn/std": 0.40988096594810486, "step": 868 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1254.0, "completions/max_terminated_length": 1254.0, "completions/mean_length": 375.5625, "completions/mean_terminated_length": 375.5625, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "epoch": 0.09218203033838973, "frac_reward_zero_std": 0.0, "grad_norm": 1.7109375, "kl": 0.02289043739438057, "learning_rate": 7.652799999999999e-06, "loss": 0.0872, "num_tokens": 40329763.0, "reward": 3.6777396202087402, "reward_std": 0.4553356170654297, "rewards/reward_fn/mean": 3.6777396202087402, "rewards/reward_fn/std": 0.4553355872631073, "step": 869 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1893.0, "completions/max_terminated_length": 1893.0, "completions/mean_length": 473.125, "completions/mean_terminated_length": 473.125, "completions/min_length": 240.0, "completions/min_terminated_length": 240.0, "epoch": 0.09228810862416463, "frac_reward_zero_std": 0.0, "grad_norm": 1.296875, "kl": 0.01598887878935784, "learning_rate": 7.6524e-06, "loss": 0.0179, "num_tokens": 40403239.0, "reward": 2.7556214332580566, "reward_std": 0.2854246497154236, "rewards/reward_fn/mean": 2.7556214332580566, "rewards/reward_fn/std": 0.28542467951774597, "step": 870 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 294.0, "completions/max_terminated_length": 294.0, "completions/mean_length": 192.84375, "completions/mean_terminated_length": 192.84375, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.09239418690993953, "frac_reward_zero_std": 0.0, "grad_norm": 2.234375, "kl": 0.022107360186055303, "learning_rate": 7.652e-06, "loss": 0.0033, "num_tokens": 40428162.0, "reward": 3.720233201980591, "reward_std": 0.4924321472644806, "rewards/reward_fn/mean": 3.720233201980591, "rewards/reward_fn/std": 0.492432177066803, "step": 871 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1444.0, "completions/max_terminated_length": 1444.0, "completions/mean_length": 297.59375, "completions/mean_terminated_length": 297.59375, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "epoch": 0.09250026519571444, "frac_reward_zero_std": 0.0, "grad_norm": 1.9609375, "kl": 0.018545285565778613, "learning_rate": 7.6516e-06, "loss": 0.0992, "num_tokens": 40486805.0, "reward": 3.9627604484558105, "reward_std": 0.21065910160541534, "rewards/reward_fn/mean": 3.9627604484558105, "rewards/reward_fn/std": 0.21065913140773773, "step": 872 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 407.0, "completions/max_terminated_length": 407.0, "completions/mean_length": 246.0625, "completions/mean_terminated_length": 246.0625, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 0.09260634348148934, "frac_reward_zero_std": 0.0, "grad_norm": 1.390625, "kl": 0.02543105883523822, "learning_rate": 7.6512e-06, "loss": -0.0205, "num_tokens": 40540087.0, "reward": 3.9671568870544434, "reward_std": 0.18578803539276123, "rewards/reward_fn/mean": 3.9671568870544434, "rewards/reward_fn/std": 0.18578806519508362, "step": 873 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 353.0, "completions/max_terminated_length": 353.0, "completions/mean_length": 224.21875, "completions/mean_terminated_length": 224.21875, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.09271242176726424, "frac_reward_zero_std": 0.0, "grad_norm": 1.8515625, "kl": 0.018198609352111816, "learning_rate": 7.6508e-06, "loss": 0.0684, "num_tokens": 40569118.0, "reward": 3.9687681198120117, "reward_std": 0.17667338252067566, "rewards/reward_fn/mean": 3.9687681198120117, "rewards/reward_fn/std": 0.17667338252067566, "step": 874 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1499.0, "completions/max_terminated_length": 1499.0, "completions/mean_length": 536.28125, "completions/mean_terminated_length": 536.28125, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "epoch": 0.09281850005303914, "frac_reward_zero_std": 0.0, "grad_norm": 1.3046875, "kl": 0.01982128922827542, "learning_rate": 7.6504e-06, "loss": -0.0039, "num_tokens": 40620615.0, "reward": 2.6173102855682373, "reward_std": 0.29557812213897705, "rewards/reward_fn/mean": 2.6173102855682373, "rewards/reward_fn/std": 0.29557812213897705, "step": 875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1640.0, "completions/max_terminated_length": 1640.0, "completions/mean_length": 477.5, "completions/mean_terminated_length": 477.5, "completions/min_length": 225.0, "completions/min_terminated_length": 225.0, "epoch": 0.09292457833881404, "frac_reward_zero_std": 0.0, "grad_norm": 1.5078125, "kl": 0.02005974156782031, "learning_rate": 7.65e-06, "loss": 0.0006, "num_tokens": 40666359.0, "reward": 2.715482473373413, "reward_std": 0.26461300253868103, "rewards/reward_fn/mean": 2.715482473373413, "rewards/reward_fn/std": 0.2646130323410034, "step": 876 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 593.0, "completions/max_terminated_length": 593.0, "completions/mean_length": 324.375, "completions/mean_terminated_length": 324.375, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "epoch": 0.09303065662458895, "frac_reward_zero_std": 0.0, "grad_norm": 1.6640625, "kl": 0.023724806495010853, "learning_rate": 7.6496e-06, "loss": -0.0251, "num_tokens": 40712195.0, "reward": 2.6783430576324463, "reward_std": 0.43827834725379944, "rewards/reward_fn/mean": 2.6783430576324463, "rewards/reward_fn/std": 0.43827834725379944, "step": 877 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1622.0, "completions/max_terminated_length": 1622.0, "completions/mean_length": 609.21875, "completions/mean_terminated_length": 609.21875, "completions/min_length": 248.0, "completions/min_terminated_length": 248.0, "epoch": 0.09313673491036385, "frac_reward_zero_std": 0.0, "grad_norm": 1.2109375, "kl": 0.015918764751404524, "learning_rate": 7.6492e-06, "loss": 0.2026, "num_tokens": 40765418.0, "reward": 2.542436122894287, "reward_std": 0.3645319640636444, "rewards/reward_fn/mean": 2.542436122894287, "rewards/reward_fn/std": 0.3645319640636444, "step": 878 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 599.0, "completions/max_terminated_length": 599.0, "completions/mean_length": 329.6875, "completions/mean_terminated_length": 329.6875, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "epoch": 0.09324281319613875, "frac_reward_zero_std": 0.0, "grad_norm": 1.3359375, "kl": 0.020563342375680804, "learning_rate": 7.648799999999999e-06, "loss": -0.0037, "num_tokens": 40812096.0, "reward": 3.659517526626587, "reward_std": 0.78047776222229, "rewards/reward_fn/mean": 3.659517526626587, "rewards/reward_fn/std": 0.7804778218269348, "step": 879 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1857.0, "completions/max_terminated_length": 1857.0, "completions/mean_length": 410.0, "completions/mean_terminated_length": 410.0, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 0.09334889148191365, "frac_reward_zero_std": 0.0, "grad_norm": 1.5859375, "kl": 0.01993569522164762, "learning_rate": 7.6484e-06, "loss": 0.2059, "num_tokens": 40862816.0, "reward": 3.4595108032226562, "reward_std": 0.9185887575149536, "rewards/reward_fn/mean": 3.4595108032226562, "rewards/reward_fn/std": 0.9185887575149536, "step": 880 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 581.0, "completions/max_terminated_length": 581.0, "completions/mean_length": 205.53125, "completions/mean_terminated_length": 205.53125, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 0.09345496976768855, "frac_reward_zero_std": 0.0, "grad_norm": 2.140625, "kl": 0.02375972643494606, "learning_rate": 7.647999999999999e-06, "loss": -0.0053, "num_tokens": 40906737.0, "reward": 3.0442795753479004, "reward_std": 0.4683447778224945, "rewards/reward_fn/mean": 3.0442795753479004, "rewards/reward_fn/std": 0.4683447480201721, "step": 881 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1041.0, "completions/max_terminated_length": 1041.0, "completions/mean_length": 371.21875, "completions/mean_terminated_length": 371.21875, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "epoch": 0.09356104805346345, "frac_reward_zero_std": 0.0, "grad_norm": 1.578125, "kl": 0.02078143460676074, "learning_rate": 7.6476e-06, "loss": 0.0293, "num_tokens": 40958680.0, "reward": 2.791837692260742, "reward_std": 0.0601964108645916, "rewards/reward_fn/mean": 2.791837692260742, "rewards/reward_fn/std": 0.06019642949104309, "step": 882 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 789.0, "completions/max_terminated_length": 789.0, "completions/mean_length": 220.625, "completions/mean_terminated_length": 220.625, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 0.09366712633923836, "frac_reward_zero_std": 0.0, "grad_norm": 1.96875, "kl": 0.014467753586359322, "learning_rate": 7.6472e-06, "loss": 0.0468, "num_tokens": 41006028.0, "reward": 3.193929672241211, "reward_std": 0.0326162613928318, "rewards/reward_fn/mean": 3.193929672241211, "rewards/reward_fn/std": 0.03261625021696091, "step": 883 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1583.0, "completions/mean_length": 310.75, "completions/mean_terminated_length": 254.7096710205078, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 0.09377320462501326, "frac_reward_zero_std": 0.0, "grad_norm": 2.15625, "kl": 0.026728518772870302, "learning_rate": 7.6468e-06, "loss": 0.3628, "num_tokens": 41045572.0, "reward": 3.0327024459838867, "reward_std": 0.8356220722198486, "rewards/reward_fn/mean": 3.0327024459838867, "rewards/reward_fn/std": 0.8356220126152039, "step": 884 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 858.0, "completions/max_terminated_length": 858.0, "completions/mean_length": 200.5, "completions/mean_terminated_length": 200.5, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 0.09387928291078816, "frac_reward_zero_std": 0.0, "grad_norm": 2.015625, "kl": 0.020390488440170884, "learning_rate": 7.6464e-06, "loss": 0.0299, "num_tokens": 41089652.0, "reward": 3.96085786819458, "reward_std": 0.15403839945793152, "rewards/reward_fn/mean": 3.96085786819458, "rewards/reward_fn/std": 0.1540384292602539, "step": 885 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1878.0, "completions/mean_length": 977.59375, "completions/mean_terminated_length": 943.0645141601562, "completions/min_length": 375.0, "completions/min_terminated_length": 375.0, "epoch": 0.09398536119656306, "frac_reward_zero_std": 0.0, "grad_norm": 1.078125, "kl": 0.017107663094066083, "learning_rate": 7.646e-06, "loss": 0.0785, "num_tokens": 41176263.0, "reward": 2.5095417499542236, "reward_std": 0.5657038688659668, "rewards/reward_fn/mean": 2.5095417499542236, "rewards/reward_fn/std": 0.5657038688659668, "step": 886 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1121.0, "completions/max_terminated_length": 1121.0, "completions/mean_length": 222.34375, "completions/mean_terminated_length": 222.34375, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 0.09409143948233796, "frac_reward_zero_std": 0.0, "grad_norm": 1.5703125, "kl": 0.01801448129117489, "learning_rate": 7.6456e-06, "loss": -0.0125, "num_tokens": 41209234.0, "reward": 2.8958828449249268, "reward_std": 0.20669691264629364, "rewards/reward_fn/mean": 2.8958828449249268, "rewards/reward_fn/std": 0.20669691264629364, "step": 887 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1028.0, "completions/max_terminated_length": 1028.0, "completions/mean_length": 300.0, "completions/mean_terminated_length": 300.0, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "epoch": 0.09419751776811287, "frac_reward_zero_std": 0.0, "grad_norm": 1.875, "kl": 0.015553548815660179, "learning_rate": 7.6452e-06, "loss": 0.0764, "num_tokens": 41254418.0, "reward": 3.924589157104492, "reward_std": 0.4265884757041931, "rewards/reward_fn/mean": 3.924589157104492, "rewards/reward_fn/std": 0.4265885055065155, "step": 888 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 517.0, "completions/max_terminated_length": 517.0, "completions/mean_length": 157.1875, "completions/mean_terminated_length": 157.1875, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.09430359605388777, "frac_reward_zero_std": 1.0, "grad_norm": 0.1552734375, "kl": 0.03170281834900379, "learning_rate": 7.6448e-06, "loss": 0.0013, "num_tokens": 41298296.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 889 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 674.0, "completions/max_terminated_length": 674.0, "completions/mean_length": 204.5, "completions/mean_terminated_length": 204.5, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.09440967433966267, "frac_reward_zero_std": 0.0, "grad_norm": 2.359375, "kl": 0.03371644695289433, "learning_rate": 7.6444e-06, "loss": 0.0301, "num_tokens": 41342216.0, "reward": 3.099053382873535, "reward_std": 0.3049076199531555, "rewards/reward_fn/mean": 3.099053382873535, "rewards/reward_fn/std": 0.3049076497554779, "step": 890 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 386.0, "completions/max_terminated_length": 386.0, "completions/mean_length": 184.71875, "completions/mean_terminated_length": 184.71875, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.09451575262543757, "frac_reward_zero_std": 0.0, "grad_norm": 2.625, "kl": 0.02772973943501711, "learning_rate": 7.644e-06, "loss": 0.1017, "num_tokens": 41380127.0, "reward": 3.9672179222106934, "reward_std": 0.18544383347034454, "rewards/reward_fn/mean": 3.9672179222106934, "rewards/reward_fn/std": 0.18544386327266693, "step": 891 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 377.0, "completions/max_terminated_length": 377.0, "completions/mean_length": 252.875, "completions/mean_terminated_length": 252.875, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "epoch": 0.09462183091121247, "frac_reward_zero_std": 1.0, "grad_norm": 0.09423828125, "kl": 0.022775678196921945, "learning_rate": 7.643599999999999e-06, "loss": 0.0009, "num_tokens": 41434459.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 892 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 743.0, "completions/mean_length": 463.28125, "completions/mean_terminated_length": 412.1612854003906, "completions/min_length": 224.0, "completions/min_terminated_length": 224.0, "epoch": 0.09472790919698737, "frac_reward_zero_std": 0.0, "grad_norm": 1.359375, "kl": 0.022876902716234326, "learning_rate": 7.6432e-06, "loss": 0.2348, "num_tokens": 41483044.0, "reward": 3.818485736846924, "reward_std": 0.7323809266090393, "rewards/reward_fn/mean": 3.818485736846924, "rewards/reward_fn/std": 0.7323809266090393, "step": 893 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1393.0, "completions/max_terminated_length": 1393.0, "completions/mean_length": 417.28125, "completions/mean_terminated_length": 417.28125, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "epoch": 0.09483398748276228, "frac_reward_zero_std": 0.0, "grad_norm": 1.453125, "kl": 0.02479085512459278, "learning_rate": 7.6428e-06, "loss": -0.0023, "num_tokens": 41526541.0, "reward": 3.401907444000244, "reward_std": 0.5439134240150452, "rewards/reward_fn/mean": 3.401907444000244, "rewards/reward_fn/std": 0.5439134240150452, "step": 894 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 489.0, "completions/max_terminated_length": 489.0, "completions/mean_length": 137.875, "completions/mean_terminated_length": 137.875, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.09494006576853718, "frac_reward_zero_std": 1.0, "grad_norm": 0.27734375, "kl": 0.03966691764071584, "learning_rate": 7.6424e-06, "loss": 0.0016, "num_tokens": 41574281.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 895 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 770.0, "completions/max_terminated_length": 770.0, "completions/mean_length": 280.8125, "completions/mean_terminated_length": 280.8125, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "epoch": 0.09504614405431208, "frac_reward_zero_std": 0.0, "grad_norm": 1.65625, "kl": 0.022205424029380083, "learning_rate": 7.642e-06, "loss": -0.0289, "num_tokens": 41623075.0, "reward": 3.148716688156128, "reward_std": 0.39955171942710876, "rewards/reward_fn/mean": 3.148716688156128, "rewards/reward_fn/std": 0.3995516896247864, "step": 896 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1307.0, "completions/max_terminated_length": 1307.0, "completions/mean_length": 313.78125, "completions/mean_terminated_length": 313.78125, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 0.09515222234008698, "frac_reward_zero_std": 0.0, "grad_norm": 1.7109375, "kl": 0.02836911752820015, "learning_rate": 7.6416e-06, "loss": 0.0118, "num_tokens": 41669948.0, "reward": 3.4885735511779785, "reward_std": 0.6286227107048035, "rewards/reward_fn/mean": 3.4885735511779785, "rewards/reward_fn/std": 0.6286226511001587, "step": 897 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 870.0, "completions/max_terminated_length": 870.0, "completions/mean_length": 356.21875, "completions/mean_terminated_length": 356.21875, "completions/min_length": 248.0, "completions/min_terminated_length": 248.0, "epoch": 0.09525830062586188, "frac_reward_zero_std": 0.0, "grad_norm": 1.5, "kl": 0.018277009017765522, "learning_rate": 7.6412e-06, "loss": 0.0616, "num_tokens": 41715267.0, "reward": 3.8610944747924805, "reward_std": 0.3736887574195862, "rewards/reward_fn/mean": 3.8610944747924805, "rewards/reward_fn/std": 0.3736887574195862, "step": 898 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 187.0, "completions/max_terminated_length": 187.0, "completions/mean_length": 137.5625, "completions/mean_terminated_length": 137.5625, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.0953643789116368, "frac_reward_zero_std": 0.0, "grad_norm": 1.859375, "kl": 0.023039878346025944, "learning_rate": 7.6408e-06, "loss": -0.0044, "num_tokens": 41749525.0, "reward": 2.80351185798645, "reward_std": 0.03616482764482498, "rewards/reward_fn/mean": 2.80351185798645, "rewards/reward_fn/std": 0.036164846271276474, "step": 899 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1680.0, "completions/mean_length": 790.65625, "completions/mean_terminated_length": 706.8333740234375, "completions/min_length": 371.0, "completions/min_terminated_length": 371.0, "epoch": 0.09547045719741169, "frac_reward_zero_std": 0.0, "grad_norm": 1.171875, "kl": 0.02036685636267066, "learning_rate": 7.640399999999999e-06, "loss": 0.1461, "num_tokens": 41817514.0, "reward": 1.921440839767456, "reward_std": 0.7092944383621216, "rewards/reward_fn/mean": 1.921440839767456, "rewards/reward_fn/std": 0.7092943787574768, "step": 900 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 669.0, "completions/max_terminated_length": 669.0, "completions/mean_length": 280.15625, "completions/mean_terminated_length": 280.15625, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.09557653548318659, "frac_reward_zero_std": 0.0, "grad_norm": 1.6015625, "kl": 0.01943918946199119, "learning_rate": 7.64e-06, "loss": 0.0304, "num_tokens": 41864623.0, "reward": 3.3483524322509766, "reward_std": 1.0586309432983398, "rewards/reward_fn/mean": 3.3483524322509766, "rewards/reward_fn/std": 1.0586309432983398, "step": 901 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1026.0, "completions/max_terminated_length": 1026.0, "completions/mean_length": 226.78125, "completions/mean_terminated_length": 226.78125, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.09568261376896149, "frac_reward_zero_std": 1.0, "grad_norm": 0.11669921875, "kl": 0.02719919686205685, "learning_rate": 7.639599999999999e-06, "loss": 0.0011, "num_tokens": 41918024.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 902 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 511.0, "completions/max_terminated_length": 511.0, "completions/mean_length": 323.78125, "completions/mean_terminated_length": 323.78125, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "epoch": 0.09578869205473639, "frac_reward_zero_std": 0.0, "grad_norm": 1.203125, "kl": 0.028861571103334427, "learning_rate": 7.6392e-06, "loss": -0.0134, "num_tokens": 41965441.0, "reward": 2.796419143676758, "reward_std": 0.3470582664012909, "rewards/reward_fn/mean": 2.796419143676758, "rewards/reward_fn/std": 0.3470582365989685, "step": 903 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 899.0, "completions/max_terminated_length": 899.0, "completions/mean_length": 253.96875, "completions/mean_terminated_length": 253.96875, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 0.0958947703405113, "frac_reward_zero_std": 0.0, "grad_norm": 1.90625, "kl": 0.022272631525993347, "learning_rate": 7.638799999999999e-06, "loss": -0.0688, "num_tokens": 42010112.0, "reward": 3.690798282623291, "reward_std": 0.5035431981086731, "rewards/reward_fn/mean": 3.690798282623291, "rewards/reward_fn/std": 0.5035431981086731, "step": 904 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1235.0, "completions/max_terminated_length": 1235.0, "completions/mean_length": 385.0625, "completions/mean_terminated_length": 385.0625, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "epoch": 0.0960008486262862, "frac_reward_zero_std": 0.0, "grad_norm": 1.3359375, "kl": 0.031112836906686425, "learning_rate": 7.6384e-06, "loss": 0.0479, "num_tokens": 42074306.0, "reward": 2.775426149368286, "reward_std": 0.4809582829475403, "rewards/reward_fn/mean": 2.775426149368286, "rewards/reward_fn/std": 0.48095831274986267, "step": 905 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1885.0, "completions/max_terminated_length": 1885.0, "completions/mean_length": 313.6875, "completions/mean_terminated_length": 313.6875, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 0.0961069269120611, "frac_reward_zero_std": 0.0, "grad_norm": 1.7109375, "kl": 0.020309182349592447, "learning_rate": 7.638e-06, "loss": -0.0108, "num_tokens": 42123544.0, "reward": 2.7922677993774414, "reward_std": 0.33847782015800476, "rewards/reward_fn/mean": 2.7922677993774414, "rewards/reward_fn/std": 0.33847787976264954, "step": 906 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1131.0, "completions/max_terminated_length": 1131.0, "completions/mean_length": 415.65625, "completions/mean_terminated_length": 415.65625, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "epoch": 0.096213005197836, "frac_reward_zero_std": 0.0, "grad_norm": 1.4609375, "kl": 0.02549328119494021, "learning_rate": 7.6376e-06, "loss": 0.0944, "num_tokens": 42171149.0, "reward": 3.443866729736328, "reward_std": 0.6673449277877808, "rewards/reward_fn/mean": 3.443866729736328, "rewards/reward_fn/std": 0.6673449277877808, "step": 907 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 456.0, "completions/max_terminated_length": 456.0, "completions/mean_length": 140.0, "completions/mean_terminated_length": 140.0, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "epoch": 0.0963190834836109, "frac_reward_zero_std": 1.0, "grad_norm": 0.134765625, "kl": 0.027116876328364015, "learning_rate": 7.6372e-06, "loss": 0.0011, "num_tokens": 42206669.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 908 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 790.0, "completions/max_terminated_length": 790.0, "completions/mean_length": 265.84375, "completions/mean_terminated_length": 265.84375, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.0964251617693858, "frac_reward_zero_std": 0.0, "grad_norm": 1.3984375, "kl": 0.01707066618837416, "learning_rate": 7.6368e-06, "loss": 0.0234, "num_tokens": 42253128.0, "reward": 2.9526233673095703, "reward_std": 0.3429475724697113, "rewards/reward_fn/mean": 2.9526233673095703, "rewards/reward_fn/std": 0.3429475724697113, "step": 909 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 179.0, "completions/max_terminated_length": 179.0, "completions/mean_length": 121.71875, "completions/mean_terminated_length": 121.71875, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "epoch": 0.09653124005516071, "frac_reward_zero_std": 1.0, "grad_norm": 0.19140625, "kl": 0.029393920907750726, "learning_rate": 7.6364e-06, "loss": 0.0012, "num_tokens": 42296863.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 910 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1530.0, "completions/max_terminated_length": 1530.0, "completions/mean_length": 290.84375, "completions/mean_terminated_length": 290.84375, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.09663731834093561, "frac_reward_zero_std": 0.0, "grad_norm": 1.6484375, "kl": 0.02383925556205213, "learning_rate": 7.636e-06, "loss": 0.0789, "num_tokens": 42336154.0, "reward": 2.788799524307251, "reward_std": 0.29157719016075134, "rewards/reward_fn/mean": 2.788799524307251, "rewards/reward_fn/std": 0.29157716035842896, "step": 911 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 634.0, "completions/max_terminated_length": 634.0, "completions/mean_length": 230.71875, "completions/mean_terminated_length": 230.71875, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 0.09674339662671051, "frac_reward_zero_std": 0.0, "grad_norm": 1.7265625, "kl": 0.027955673867836595, "learning_rate": 7.6356e-06, "loss": -0.0334, "num_tokens": 42378865.0, "reward": 2.8833703994750977, "reward_std": 0.21842359006404877, "rewards/reward_fn/mean": 2.8833703994750977, "rewards/reward_fn/std": 0.21842356026172638, "step": 912 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 789.0, "completions/max_terminated_length": 789.0, "completions/mean_length": 169.0, "completions/mean_terminated_length": 169.0, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.09684947491248541, "frac_reward_zero_std": 1.0, "grad_norm": 0.11962890625, "kl": 0.030652977991849184, "learning_rate": 7.6352e-06, "loss": 0.0012, "num_tokens": 42412497.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 913 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 965.0, "completions/max_terminated_length": 965.0, "completions/mean_length": 320.3125, "completions/mean_terminated_length": 320.3125, "completions/min_length": 232.0, "completions/min_terminated_length": 232.0, "epoch": 0.09695555319826031, "frac_reward_zero_std": 0.0, "grad_norm": 1.2421875, "kl": 0.027334638172760606, "learning_rate": 7.6348e-06, "loss": -0.0232, "num_tokens": 42459707.0, "reward": 3.9664275646209717, "reward_std": 0.18991468846797943, "rewards/reward_fn/mean": 3.9664275646209717, "rewards/reward_fn/std": 0.18991467356681824, "step": 914 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1007.0, "completions/max_terminated_length": 1007.0, "completions/mean_length": 287.5, "completions/mean_terminated_length": 287.5, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 0.09706163148403522, "frac_reward_zero_std": 0.0, "grad_norm": 2.046875, "kl": 0.026974455220624804, "learning_rate": 7.6344e-06, "loss": 0.0048, "num_tokens": 42512107.0, "reward": 2.9059715270996094, "reward_std": 0.3558712303638458, "rewards/reward_fn/mean": 2.9059715270996094, "rewards/reward_fn/std": 0.3558712303638458, "step": 915 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 733.0, "completions/max_terminated_length": 733.0, "completions/mean_length": 261.1875, "completions/mean_terminated_length": 261.1875, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 0.09716770976981012, "frac_reward_zero_std": 0.0, "grad_norm": 1.7265625, "kl": 0.024873450631275773, "learning_rate": 7.634e-06, "loss": 0.0466, "num_tokens": 42561489.0, "reward": 3.0658812522888184, "reward_std": 0.45738157629966736, "rewards/reward_fn/mean": 3.0658812522888184, "rewards/reward_fn/std": 0.45738163590431213, "step": 916 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 764.0, "completions/max_terminated_length": 764.0, "completions/mean_length": 134.875, "completions/mean_terminated_length": 134.875, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 0.09727378805558502, "frac_reward_zero_std": 1.0, "grad_norm": 0.17578125, "kl": 0.02832574676722288, "learning_rate": 7.6336e-06, "loss": 0.0011, "num_tokens": 42610765.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 917 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 448.0, "completions/max_terminated_length": 448.0, "completions/mean_length": 276.375, "completions/mean_terminated_length": 276.375, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "epoch": 0.09737986634135992, "frac_reward_zero_std": 0.0, "grad_norm": 1.4765625, "kl": 0.022886406630277634, "learning_rate": 7.6332e-06, "loss": -0.0003, "num_tokens": 42658553.0, "reward": 3.8918814659118652, "reward_std": 0.4476149380207062, "rewards/reward_fn/mean": 3.8918814659118652, "rewards/reward_fn/std": 0.4476148784160614, "step": 918 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 786.0, "completions/max_terminated_length": 786.0, "completions/mean_length": 377.375, "completions/mean_terminated_length": 377.375, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "epoch": 0.09748594462713482, "frac_reward_zero_std": 0.0, "grad_norm": 1.3046875, "kl": 0.014997152611613274, "learning_rate": 7.6328e-06, "loss": 0.0899, "num_tokens": 42718917.0, "reward": 3.833319664001465, "reward_std": 0.44810566306114197, "rewards/reward_fn/mean": 3.833319664001465, "rewards/reward_fn/std": 0.4481056332588196, "step": 919 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1032.0, "completions/max_terminated_length": 1032.0, "completions/mean_length": 214.65625, "completions/mean_terminated_length": 214.65625, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 0.09759202291290972, "frac_reward_zero_std": 1.0, "grad_norm": 0.126953125, "kl": 0.026748921489343047, "learning_rate": 7.6324e-06, "loss": 0.0011, "num_tokens": 42766810.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 920 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 433.0, "completions/max_terminated_length": 433.0, "completions/mean_length": 277.9375, "completions/mean_terminated_length": 277.9375, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "epoch": 0.09769810119868463, "frac_reward_zero_std": 0.0, "grad_norm": 1.671875, "kl": 0.029225841630250216, "learning_rate": 7.631999999999999e-06, "loss": -0.0142, "num_tokens": 42812408.0, "reward": 3.3031575679779053, "reward_std": 0.5849432945251465, "rewards/reward_fn/mean": 3.3031575679779053, "rewards/reward_fn/std": 0.5849433541297913, "step": 921 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 80.0, "completions/max_terminated_length": 80.0, "completions/mean_length": 72.1875, "completions/mean_terminated_length": 72.1875, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "epoch": 0.09780417948445953, "frac_reward_zero_std": 1.0, "grad_norm": 0.228515625, "kl": 0.01937575329793617, "learning_rate": 7.6316e-06, "loss": 0.0008, "num_tokens": 42856382.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 922 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 460.0, "completions/max_terminated_length": 460.0, "completions/mean_length": 300.40625, "completions/mean_terminated_length": 300.40625, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "epoch": 0.09791025777023443, "frac_reward_zero_std": 0.0, "grad_norm": 2.09375, "kl": 0.04207867290824652, "learning_rate": 7.631199999999999e-06, "loss": 0.0742, "num_tokens": 42903371.0, "reward": 3.767624855041504, "reward_std": 0.5340158939361572, "rewards/reward_fn/mean": 3.767624855041504, "rewards/reward_fn/std": 0.5340158343315125, "step": 923 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1156.0, "completions/max_terminated_length": 1156.0, "completions/mean_length": 324.9375, "completions/mean_terminated_length": 324.9375, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.09801633605600933, "frac_reward_zero_std": 0.0, "grad_norm": 1.6953125, "kl": 0.027455645380541682, "learning_rate": 7.6308e-06, "loss": -0.0995, "num_tokens": 42947977.0, "reward": 3.0452606678009033, "reward_std": 0.368459552526474, "rewards/reward_fn/mean": 3.0452606678009033, "rewards/reward_fn/std": 0.368459552526474, "step": 924 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 896.0, "completions/mean_length": 472.5, "completions/mean_terminated_length": 421.6773986816406, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 0.09812241434178423, "frac_reward_zero_std": 0.0, "grad_norm": 1.3828125, "kl": 0.01985622337087989, "learning_rate": 7.630399999999999e-06, "loss": 0.2311, "num_tokens": 42978681.0, "reward": 2.82580828666687, "reward_std": 0.7253391146659851, "rewards/reward_fn/mean": 2.82580828666687, "rewards/reward_fn/std": 0.7253391742706299, "step": 925 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 686.0, "completions/max_terminated_length": 686.0, "completions/mean_length": 411.09375, "completions/mean_terminated_length": 411.09375, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.09822849262755914, "frac_reward_zero_std": 0.0, "grad_norm": 1.5, "kl": 0.03032882115803659, "learning_rate": 7.63e-06, "loss": 0.0001, "num_tokens": 43022940.0, "reward": 3.6942696571350098, "reward_std": 0.6968668699264526, "rewards/reward_fn/mean": 3.6942696571350098, "rewards/reward_fn/std": 0.6968669295310974, "step": 926 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 888.0, "completions/max_terminated_length": 888.0, "completions/mean_length": 238.1875, "completions/mean_terminated_length": 238.1875, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.09833457091333404, "frac_reward_zero_std": 0.0, "grad_norm": 2.109375, "kl": 0.035402664449065924, "learning_rate": 7.629599999999999e-06, "loss": 0.0078, "num_tokens": 43059234.0, "reward": 3.816627025604248, "reward_std": 0.39149269461631775, "rewards/reward_fn/mean": 3.816627025604248, "rewards/reward_fn/std": 0.39149269461631775, "step": 927 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 538.0, "completions/max_terminated_length": 538.0, "completions/mean_length": 207.5625, "completions/mean_terminated_length": 207.5625, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "epoch": 0.09844064919910894, "frac_reward_zero_std": 0.0, "grad_norm": 1.4375, "kl": 0.025182449258863926, "learning_rate": 7.6292e-06, "loss": 0.013, "num_tokens": 43098900.0, "reward": 3.970850944519043, "reward_std": 0.16489259898662567, "rewards/reward_fn/mean": 3.970850944519043, "rewards/reward_fn/std": 0.16489258408546448, "step": 928 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1068.0, "completions/max_terminated_length": 1068.0, "completions/mean_length": 260.90625, "completions/mean_terminated_length": 260.90625, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 0.09854672748488384, "frac_reward_zero_std": 0.0, "grad_norm": 2.03125, "kl": 0.0351427448913455, "learning_rate": 7.6288e-06, "loss": -0.0097, "num_tokens": 43140209.0, "reward": 3.0380711555480957, "reward_std": 0.37293097376823425, "rewards/reward_fn/mean": 3.0380711555480957, "rewards/reward_fn/std": 0.37293094396591187, "step": 929 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1932.0, "completions/mean_length": 742.28125, "completions/mean_terminated_length": 655.2333374023438, "completions/min_length": 231.0, "completions/min_terminated_length": 231.0, "epoch": 0.09865280577065874, "frac_reward_zero_std": 0.0, "grad_norm": 1.5, "kl": 0.019054226577281952, "learning_rate": 7.6284e-06, "loss": 0.31, "num_tokens": 43201562.0, "reward": 2.4119246006011963, "reward_std": 0.6986344456672668, "rewards/reward_fn/mean": 2.4119246006011963, "rewards/reward_fn/std": 0.6986343860626221, "step": 930 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 248.0, "completions/max_terminated_length": 248.0, "completions/mean_length": 144.28125, "completions/mean_terminated_length": 144.28125, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "epoch": 0.09875888405643365, "frac_reward_zero_std": 1.0, "grad_norm": 0.09814453125, "kl": 0.02223593066446483, "learning_rate": 7.628e-06, "loss": 0.0009, "num_tokens": 43242531.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 931 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1766.0, "completions/max_terminated_length": 1766.0, "completions/mean_length": 392.96875, "completions/mean_terminated_length": 392.96875, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.09886496234220855, "frac_reward_zero_std": 0.0, "grad_norm": 1.4140625, "kl": 0.019913258031010628, "learning_rate": 7.6276e-06, "loss": 0.0549, "num_tokens": 43291458.0, "reward": 3.416562080383301, "reward_std": 0.694709300994873, "rewards/reward_fn/mean": 3.416562080383301, "rewards/reward_fn/std": 0.694709300994873, "step": 932 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1833.0, "completions/mean_length": 476.09375, "completions/mean_terminated_length": 425.3870849609375, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "epoch": 0.09897104062798345, "frac_reward_zero_std": 0.0, "grad_norm": 1.7109375, "kl": 0.022749242838472128, "learning_rate": 7.6272e-06, "loss": 0.2832, "num_tokens": 43340805.0, "reward": 3.8043880462646484, "reward_std": 0.7476766109466553, "rewards/reward_fn/mean": 3.8043880462646484, "rewards/reward_fn/std": 0.7476766109466553, "step": 933 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1864.0, "completions/mean_length": 560.71875, "completions/mean_terminated_length": 461.5666809082031, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "epoch": 0.09907711891375835, "frac_reward_zero_std": 0.0, "grad_norm": 1.3671875, "kl": 0.02255812706425786, "learning_rate": 7.6267999999999996e-06, "loss": 0.1704, "num_tokens": 43409628.0, "reward": 3.572598934173584, "reward_std": 1.1030330657958984, "rewards/reward_fn/mean": 3.572598934173584, "rewards/reward_fn/std": 1.1030330657958984, "step": 934 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 688.0, "completions/max_terminated_length": 688.0, "completions/mean_length": 477.0, "completions/mean_terminated_length": 477.0, "completions/min_length": 223.0, "completions/min_terminated_length": 223.0, "epoch": 0.09918319719953325, "frac_reward_zero_std": 0.0, "grad_norm": 1.2890625, "kl": 0.017629666137509048, "learning_rate": 7.6263999999999995e-06, "loss": -0.0552, "num_tokens": 43443996.0, "reward": 2.8076069355010986, "reward_std": 0.29366716742515564, "rewards/reward_fn/mean": 2.8076069355010986, "rewards/reward_fn/std": 0.29366716742515564, "step": 935 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1123.0, "completions/max_terminated_length": 1123.0, "completions/mean_length": 305.6875, "completions/mean_terminated_length": 305.6875, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 0.09928927548530815, "frac_reward_zero_std": 0.0, "grad_norm": 1.7734375, "kl": 0.02135719486977905, "learning_rate": 7.626e-06, "loss": 0.0887, "num_tokens": 43491442.0, "reward": 3.76924991607666, "reward_std": 0.48810887336730957, "rewards/reward_fn/mean": 3.76924991607666, "rewards/reward_fn/std": 0.4881088435649872, "step": 936 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 170.0, "completions/max_terminated_length": 170.0, "completions/mean_length": 144.9375, "completions/mean_terminated_length": 144.9375, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.09939535377108306, "frac_reward_zero_std": 1.0, "grad_norm": 0.12890625, "kl": 0.024941423209384084, "learning_rate": 7.6256e-06, "loss": 0.001, "num_tokens": 43530480.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 937 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 978.0, "completions/max_terminated_length": 978.0, "completions/mean_length": 505.1875, "completions/mean_terminated_length": 505.1875, "completions/min_length": 299.0, "completions/min_terminated_length": 299.0, "epoch": 0.09950143205685796, "frac_reward_zero_std": 0.0, "grad_norm": 1.5546875, "kl": 0.0277643243316561, "learning_rate": 7.6252e-06, "loss": 0.0105, "num_tokens": 43579542.0, "reward": 2.689387321472168, "reward_std": 0.519531786441803, "rewards/reward_fn/mean": 2.689387321472168, "rewards/reward_fn/std": 0.519531786441803, "step": 938 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 251.0, "completions/max_terminated_length": 251.0, "completions/mean_length": 156.46875, "completions/mean_terminated_length": 156.46875, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "epoch": 0.09960751034263286, "frac_reward_zero_std": 0.0, "grad_norm": 2.4375, "kl": 0.024052355904132128, "learning_rate": 7.624799999999999e-06, "loss": 0.0352, "num_tokens": 43613061.0, "reward": 3.1143696308135986, "reward_std": 0.43374985456466675, "rewards/reward_fn/mean": 3.1143696308135986, "rewards/reward_fn/std": 0.43374985456466675, "step": 939 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1454.0, "completions/max_terminated_length": 1454.0, "completions/mean_length": 422.25, "completions/mean_terminated_length": 422.25, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.09971358862840776, "frac_reward_zero_std": 0.0, "grad_norm": 1.984375, "kl": 0.027861488750204444, "learning_rate": 7.624399999999999e-06, "loss": 0.269, "num_tokens": 43657805.0, "reward": 2.7488107681274414, "reward_std": 0.2731390595436096, "rewards/reward_fn/mean": 2.7488107681274414, "rewards/reward_fn/std": 0.27313902974128723, "step": 940 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 455.0, "completions/max_terminated_length": 455.0, "completions/mean_length": 310.46875, "completions/mean_terminated_length": 310.46875, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "epoch": 0.09981966691418266, "frac_reward_zero_std": 0.0, "grad_norm": 1.4921875, "kl": 0.01709406217560172, "learning_rate": 7.623999999999999e-06, "loss": -0.0491, "num_tokens": 43689948.0, "reward": 3.244318723678589, "reward_std": 0.6297351121902466, "rewards/reward_fn/mean": 3.244318723678589, "rewards/reward_fn/std": 0.6297351121902466, "step": 941 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 776.0, "completions/max_terminated_length": 776.0, "completions/mean_length": 236.59375, "completions/mean_terminated_length": 236.59375, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.09992574519995757, "frac_reward_zero_std": 0.0, "grad_norm": 2.265625, "kl": 0.029757092706859112, "learning_rate": 7.623599999999999e-06, "loss": -0.0306, "num_tokens": 43729743.0, "reward": 3.699985980987549, "reward_std": 0.48831066489219666, "rewards/reward_fn/mean": 3.699985980987549, "rewards/reward_fn/std": 0.48831063508987427, "step": 942 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1085.0, "completions/max_terminated_length": 1085.0, "completions/mean_length": 359.71875, "completions/mean_terminated_length": 359.71875, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "epoch": 0.10003182348573247, "frac_reward_zero_std": 0.0, "grad_norm": 1.234375, "kl": 0.0185692491941154, "learning_rate": 7.623199999999999e-06, "loss": 0.0006, "num_tokens": 43781254.0, "reward": 3.932116985321045, "reward_std": 0.38400447368621826, "rewards/reward_fn/mean": 3.932116985321045, "rewards/reward_fn/std": 0.38400450348854065, "step": 943 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 706.0, "completions/max_terminated_length": 706.0, "completions/mean_length": 215.125, "completions/mean_terminated_length": 215.125, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 0.10013790177150737, "frac_reward_zero_std": 1.0, "grad_norm": 0.08349609375, "kl": 0.022619884461164474, "learning_rate": 7.622799999999999e-06, "loss": 0.0009, "num_tokens": 43821258.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 944 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 242.0, "completions/max_terminated_length": 242.0, "completions/mean_length": 129.46875, "completions/mean_terminated_length": 129.46875, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "epoch": 0.10024398005728227, "frac_reward_zero_std": 0.0, "grad_norm": 2.5, "kl": 0.030272313859313726, "learning_rate": 7.622399999999999e-06, "loss": -0.0063, "num_tokens": 43859129.0, "reward": 3.9288439750671387, "reward_std": 0.28039026260375977, "rewards/reward_fn/mean": 3.9288439750671387, "rewards/reward_fn/std": 0.2803902328014374, "step": 945 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1575.0, "completions/max_terminated_length": 1575.0, "completions/mean_length": 345.75, "completions/mean_terminated_length": 345.75, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.10035005834305717, "frac_reward_zero_std": 0.0, "grad_norm": 1.4765625, "kl": 0.022123834351077676, "learning_rate": 7.621999999999999e-06, "loss": -0.064, "num_tokens": 43899505.0, "reward": 2.691455364227295, "reward_std": 0.3160874843597412, "rewards/reward_fn/mean": 2.691455364227295, "rewards/reward_fn/std": 0.3160874545574188, "step": 946 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1028.0, "completions/max_terminated_length": 1028.0, "completions/mean_length": 164.59375, "completions/mean_terminated_length": 164.59375, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "epoch": 0.10045613662883207, "frac_reward_zero_std": 1.0, "grad_norm": 0.125, "kl": 0.027731532929465175, "learning_rate": 7.6216e-06, "loss": 0.0011, "num_tokens": 43945348.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 947 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 605.0, "completions/max_terminated_length": 605.0, "completions/mean_length": 211.8125, "completions/mean_terminated_length": 211.8125, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 0.10056221491460698, "frac_reward_zero_std": 0.0, "grad_norm": 1.390625, "kl": 0.0301664131693542, "learning_rate": 7.6212e-06, "loss": -0.0287, "num_tokens": 43992798.0, "reward": 3.930818796157837, "reward_std": 0.3913477957248688, "rewards/reward_fn/mean": 3.930818796157837, "rewards/reward_fn/std": 0.391347736120224, "step": 948 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1828.0, "completions/max_terminated_length": 1828.0, "completions/mean_length": 496.125, "completions/mean_terminated_length": 496.125, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "epoch": 0.10066829320038188, "frac_reward_zero_std": 0.0, "grad_norm": 1.53125, "kl": 0.019181253854185343, "learning_rate": 7.6208e-06, "loss": 0.0394, "num_tokens": 44044322.0, "reward": 3.3273301124572754, "reward_std": 0.7396343946456909, "rewards/reward_fn/mean": 3.3273301124572754, "rewards/reward_fn/std": 0.7396343946456909, "step": 949 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1993.0, "completions/max_terminated_length": 1993.0, "completions/mean_length": 602.75, "completions/mean_terminated_length": 602.75, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "epoch": 0.10077437148615678, "frac_reward_zero_std": 0.0, "grad_norm": 1.2734375, "kl": 0.028830960392951965, "learning_rate": 7.6204e-06, "loss": -0.0344, "num_tokens": 44096154.0, "reward": 1.9313682317733765, "reward_std": 0.4274498224258423, "rewards/reward_fn/mean": 1.9313682317733765, "rewards/reward_fn/std": 0.4274497926235199, "step": 950 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1048.0, "completions/max_terminated_length": 1048.0, "completions/mean_length": 310.3125, "completions/mean_terminated_length": 310.3125, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "epoch": 0.10088044977193168, "frac_reward_zero_std": 0.0, "grad_norm": 1.8125, "kl": 0.025494011351838708, "learning_rate": 7.62e-06, "loss": 0.1634, "num_tokens": 44178660.0, "reward": 3.572533369064331, "reward_std": 0.6003016233444214, "rewards/reward_fn/mean": 3.572533369064331, "rewards/reward_fn/std": 0.6003016829490662, "step": 951 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 870.0, "completions/max_terminated_length": 870.0, "completions/mean_length": 385.1875, "completions/mean_terminated_length": 385.1875, "completions/min_length": 233.0, "completions/min_terminated_length": 233.0, "epoch": 0.10098652805770658, "frac_reward_zero_std": 0.0, "grad_norm": 1.53125, "kl": 0.021688956068828702, "learning_rate": 7.6196e-06, "loss": 0.0484, "num_tokens": 44213994.0, "reward": 3.7324137687683105, "reward_std": 0.669641375541687, "rewards/reward_fn/mean": 3.7324137687683105, "rewards/reward_fn/std": 0.669641375541687, "step": 952 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 743.0, "completions/max_terminated_length": 743.0, "completions/mean_length": 322.71875, "completions/mean_terminated_length": 322.71875, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "epoch": 0.1010926063434815, "frac_reward_zero_std": 0.0, "grad_norm": 1.3828125, "kl": 0.020461396779865026, "learning_rate": 7.6192e-06, "loss": -0.0531, "num_tokens": 44267777.0, "reward": 3.1110587120056152, "reward_std": 0.43503865599632263, "rewards/reward_fn/mean": 3.1110587120056152, "rewards/reward_fn/std": 0.43503862619400024, "step": 953 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 438.0, "completions/max_terminated_length": 438.0, "completions/mean_length": 244.84375, "completions/mean_terminated_length": 244.84375, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 0.1011986846292564, "frac_reward_zero_std": 0.0, "grad_norm": 1.65625, "kl": 0.021231455844826996, "learning_rate": 7.6188e-06, "loss": 0.0719, "num_tokens": 44324988.0, "reward": 3.789945602416992, "reward_std": 0.5255440473556519, "rewards/reward_fn/mean": 3.789945602416992, "rewards/reward_fn/std": 0.5255439877510071, "step": 954 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1725.0, "completions/mean_length": 539.4375, "completions/mean_terminated_length": 490.774169921875, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "epoch": 0.1013047629150313, "frac_reward_zero_std": 0.0, "grad_norm": 1.46875, "kl": 0.019856604980304837, "learning_rate": 7.6184e-06, "loss": 0.0295, "num_tokens": 44379306.0, "reward": 1.9409844875335693, "reward_std": 0.5565598011016846, "rewards/reward_fn/mean": 1.9409844875335693, "rewards/reward_fn/std": 0.5565597414970398, "step": 955 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 782.0, "completions/mean_length": 499.71875, "completions/mean_terminated_length": 449.774169921875, "completions/min_length": 267.0, "completions/min_terminated_length": 267.0, "epoch": 0.10141084120080619, "frac_reward_zero_std": 0.0, "grad_norm": 1.578125, "kl": 0.025161937111988664, "learning_rate": 7.618e-06, "loss": 0.1219, "num_tokens": 44439393.0, "reward": 2.661538600921631, "reward_std": 0.39979878067970276, "rewards/reward_fn/mean": 2.661538600921631, "rewards/reward_fn/std": 0.399798721075058, "step": 956 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1571.0, "completions/max_terminated_length": 1571.0, "completions/mean_length": 397.9375, "completions/mean_terminated_length": 397.9375, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.10151691948658109, "frac_reward_zero_std": 0.0, "grad_norm": 2.546875, "kl": 0.027824259363114834, "learning_rate": 7.6176e-06, "loss": -0.0297, "num_tokens": 44486335.0, "reward": 2.802313804626465, "reward_std": 0.21157807111740112, "rewards/reward_fn/mean": 2.802313804626465, "rewards/reward_fn/std": 0.21157805621623993, "step": 957 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 925.0, "completions/max_terminated_length": 925.0, "completions/mean_length": 458.6875, "completions/mean_terminated_length": 458.6875, "completions/min_length": 280.0, "completions/min_terminated_length": 280.0, "epoch": 0.101622997772356, "frac_reward_zero_std": 0.0, "grad_norm": 1.1796875, "kl": 0.01801618025638163, "learning_rate": 7.6172e-06, "loss": 0.0008, "num_tokens": 44541941.0, "reward": 2.776561737060547, "reward_std": 0.20328201353549957, "rewards/reward_fn/mean": 2.776561737060547, "rewards/reward_fn/std": 0.20328204333782196, "step": 958 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1326.0, "completions/max_terminated_length": 1326.0, "completions/mean_length": 260.125, "completions/mean_terminated_length": 260.125, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 0.1017290760581309, "frac_reward_zero_std": 0.0, "grad_norm": 1.8359375, "kl": 0.03001307207159698, "learning_rate": 7.6168e-06, "loss": -0.0721, "num_tokens": 44599129.0, "reward": 3.648618221282959, "reward_std": 0.493778258562088, "rewards/reward_fn/mean": 3.648618221282959, "rewards/reward_fn/std": 0.4937782883644104, "step": 959 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 214.0, "completions/max_terminated_length": 214.0, "completions/mean_length": 149.3125, "completions/mean_terminated_length": 149.3125, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.1018351543439058, "frac_reward_zero_std": 0.0, "grad_norm": 2.265625, "kl": 0.03182677808217704, "learning_rate": 7.6164e-06, "loss": -0.0079, "num_tokens": 44627427.0, "reward": 3.9289231300354004, "reward_std": 0.4020720422267914, "rewards/reward_fn/mean": 3.9289231300354004, "rewards/reward_fn/std": 0.4020719826221466, "step": 960 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 791.0, "completions/max_terminated_length": 791.0, "completions/mean_length": 195.65625, "completions/mean_terminated_length": 195.65625, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 0.1019412326296807, "frac_reward_zero_std": 1.0, "grad_norm": 0.08935546875, "kl": 0.025319629814475775, "learning_rate": 7.616e-06, "loss": 0.001, "num_tokens": 44652216.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 961 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 155.125, "completions/mean_terminated_length": 155.125, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "epoch": 0.1020473109154556, "frac_reward_zero_std": 0.0, "grad_norm": 2.625, "kl": 0.03323507239110768, "learning_rate": 7.6155999999999996e-06, "loss": 0.1249, "num_tokens": 44690076.0, "reward": 2.9434142112731934, "reward_std": 0.04483083263039589, "rewards/reward_fn/mean": 2.9434142112731934, "rewards/reward_fn/std": 0.044830840080976486, "step": 962 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 948.0, "completions/max_terminated_length": 948.0, "completions/mean_length": 269.8125, "completions/mean_terminated_length": 269.8125, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "epoch": 0.1021533892012305, "frac_reward_zero_std": 0.0, "grad_norm": 1.671875, "kl": 0.04052834562025964, "learning_rate": 7.6151999999999995e-06, "loss": 0.0596, "num_tokens": 44742006.0, "reward": 3.814697265625, "reward_std": 0.4385845363140106, "rewards/reward_fn/mean": 3.814697265625, "rewards/reward_fn/std": 0.43858450651168823, "step": 963 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 567.0, "completions/max_terminated_length": 567.0, "completions/mean_length": 337.90625, "completions/mean_terminated_length": 337.90625, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.10225946748700541, "frac_reward_zero_std": 0.0, "grad_norm": 1.7578125, "kl": 0.027457768563181162, "learning_rate": 7.6147999999999995e-06, "loss": 0.0372, "num_tokens": 44779315.0, "reward": 3.6313183307647705, "reward_std": 0.5176029205322266, "rewards/reward_fn/mean": 3.6313183307647705, "rewards/reward_fn/std": 0.5176029801368713, "step": 964 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1231.0, "completions/max_terminated_length": 1231.0, "completions/mean_length": 328.5, "completions/mean_terminated_length": 328.5, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "epoch": 0.10236554577278031, "frac_reward_zero_std": 0.0, "grad_norm": 1.53125, "kl": 0.023238977417349815, "learning_rate": 7.6143999999999995e-06, "loss": 0.0578, "num_tokens": 44820835.0, "reward": 2.8203535079956055, "reward_std": 0.028799260035157204, "rewards/reward_fn/mean": 2.8203535079956055, "rewards/reward_fn/std": 0.028799280524253845, "step": 965 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1401.0, "completions/mean_length": 591.9375, "completions/mean_terminated_length": 544.9677124023438, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "epoch": 0.10247162405855521, "frac_reward_zero_std": 0.0, "grad_norm": 1.328125, "kl": 0.018361143651418388, "learning_rate": 7.6139999999999994e-06, "loss": 0.212, "num_tokens": 44876033.0, "reward": 2.5604774951934814, "reward_std": 0.6667385101318359, "rewards/reward_fn/mean": 2.5604774951934814, "rewards/reward_fn/std": 0.6667385101318359, "step": 966 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 232.0, "completions/max_terminated_length": 232.0, "completions/mean_length": 167.6875, "completions/mean_terminated_length": 167.6875, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.10257770234433011, "frac_reward_zero_std": 1.0, "grad_norm": 0.1796875, "kl": 0.028179120272397995, "learning_rate": 7.613599999999999e-06, "loss": 0.0011, "num_tokens": 44924119.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 967 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 727.0, "completions/max_terminated_length": 727.0, "completions/mean_length": 346.5, "completions/mean_terminated_length": 346.5, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 0.10268378063010501, "frac_reward_zero_std": 0.0, "grad_norm": 1.6015625, "kl": 0.027506219688802958, "learning_rate": 7.613199999999999e-06, "loss": 0.0885, "num_tokens": 44979431.0, "reward": 2.5675511360168457, "reward_std": 0.48671212792396545, "rewards/reward_fn/mean": 2.5675511360168457, "rewards/reward_fn/std": 0.48671212792396545, "step": 968 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1400.0, "completions/max_terminated_length": 1400.0, "completions/mean_length": 320.90625, "completions/mean_terminated_length": 320.90625, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "epoch": 0.10278985891587993, "frac_reward_zero_std": 0.0, "grad_norm": 1.7734375, "kl": 0.02348879328928888, "learning_rate": 7.612799999999999e-06, "loss": 0.0561, "num_tokens": 45025924.0, "reward": 3.9348433017730713, "reward_std": 0.25716766715049744, "rewards/reward_fn/mean": 3.9348433017730713, "rewards/reward_fn/std": 0.25716766715049744, "step": 969 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 485.0, "completions/mean_length": 348.03125, "completions/mean_terminated_length": 293.19354248046875, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 0.10289593720165482, "frac_reward_zero_std": 0.0, "grad_norm": 1.859375, "kl": 0.02728769346140325, "learning_rate": 7.612399999999999e-06, "loss": 0.299, "num_tokens": 45080165.0, "reward": 3.480499744415283, "reward_std": 0.8933743834495544, "rewards/reward_fn/mean": 3.480499744415283, "rewards/reward_fn/std": 0.8933743834495544, "step": 970 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 825.0, "completions/max_terminated_length": 825.0, "completions/mean_length": 227.75, "completions/mean_terminated_length": 227.75, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.10300201548742972, "frac_reward_zero_std": 0.0, "grad_norm": 1.6171875, "kl": 0.027561142342165112, "learning_rate": 7.612e-06, "loss": -0.0759, "num_tokens": 45130973.0, "reward": 3.857459783554077, "reward_std": 0.33770760893821716, "rewards/reward_fn/mean": 3.857459783554077, "rewards/reward_fn/std": 0.3377075791358948, "step": 971 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 638.0, "completions/max_terminated_length": 638.0, "completions/mean_length": 299.78125, "completions/mean_terminated_length": 299.78125, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 0.10310809377320462, "frac_reward_zero_std": 0.0, "grad_norm": 1.46875, "kl": 0.02726740762591362, "learning_rate": 7.6116e-06, "loss": -0.0207, "num_tokens": 45169302.0, "reward": 2.7473816871643066, "reward_std": 0.1762368530035019, "rewards/reward_fn/mean": 2.7473816871643066, "rewards/reward_fn/std": 0.1762368530035019, "step": 972 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 157.0, "completions/max_terminated_length": 157.0, "completions/mean_length": 101.09375, "completions/mean_terminated_length": 101.09375, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "epoch": 0.10321417205897952, "frac_reward_zero_std": 1.0, "grad_norm": 0.140625, "kl": 0.033008648082613945, "learning_rate": 7.6112e-06, "loss": 0.0013, "num_tokens": 45197241.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 973 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 469.0, "completions/max_terminated_length": 469.0, "completions/mean_length": 285.46875, "completions/mean_terminated_length": 285.46875, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 0.10332025034475444, "frac_reward_zero_std": 0.0, "grad_norm": 1.703125, "kl": 0.02396201086230576, "learning_rate": 7.6108e-06, "loss": 0.0925, "num_tokens": 45240744.0, "reward": 1.7027685642242432, "reward_std": 0.03398967534303665, "rewards/reward_fn/mean": 1.7027685642242432, "rewards/reward_fn/std": 0.03398967534303665, "step": 974 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 350.0, "completions/max_terminated_length": 350.0, "completions/mean_length": 194.375, "completions/mean_terminated_length": 194.375, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.10342632863052934, "frac_reward_zero_std": 1.0, "grad_norm": 0.1044921875, "kl": 0.031388872768729925, "learning_rate": 7.6104e-06, "loss": 0.0013, "num_tokens": 45280020.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1202.0, "completions/max_terminated_length": 1202.0, "completions/mean_length": 226.59375, "completions/mean_terminated_length": 226.59375, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 0.10353240691630423, "frac_reward_zero_std": 0.0, "grad_norm": 2.03125, "kl": 0.03019576147198677, "learning_rate": 7.61e-06, "loss": -0.0612, "num_tokens": 45328007.0, "reward": 3.906989336013794, "reward_std": 0.30038613080978394, "rewards/reward_fn/mean": 3.906989336013794, "rewards/reward_fn/std": 0.3003861606121063, "step": 976 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 919.0, "completions/max_terminated_length": 919.0, "completions/mean_length": 299.59375, "completions/mean_terminated_length": 299.59375, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 0.10363848520207913, "frac_reward_zero_std": 0.0, "grad_norm": 1.703125, "kl": 0.024021990364417434, "learning_rate": 7.6096e-06, "loss": -0.0009, "num_tokens": 45371802.0, "reward": 2.7480766773223877, "reward_std": 0.044364336878061295, "rewards/reward_fn/mean": 2.7480766773223877, "rewards/reward_fn/std": 0.0443643182516098, "step": 977 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 610.0, "completions/mean_length": 388.46875, "completions/mean_terminated_length": 334.93548583984375, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 0.10374456348785403, "frac_reward_zero_std": 0.0, "grad_norm": 1.734375, "kl": 0.021239728201180696, "learning_rate": 7.6092e-06, "loss": 0.2854, "num_tokens": 45415017.0, "reward": 3.8120007514953613, "reward_std": 0.7408618330955505, "rewards/reward_fn/mean": 3.8120007514953613, "rewards/reward_fn/std": 0.7408618330955505, "step": 978 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 794.0, "completions/max_terminated_length": 794.0, "completions/mean_length": 197.78125, "completions/mean_terminated_length": 197.78125, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "epoch": 0.10385064177362893, "frac_reward_zero_std": 1.0, "grad_norm": 0.09033203125, "kl": 0.02474969206377864, "learning_rate": 7.608799999999999e-06, "loss": 0.001, "num_tokens": 45455330.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 979 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1123.0, "completions/max_terminated_length": 1123.0, "completions/mean_length": 413.9375, "completions/mean_terminated_length": 413.9375, "completions/min_length": 252.0, "completions/min_terminated_length": 252.0, "epoch": 0.10395672005940385, "frac_reward_zero_std": 0.0, "grad_norm": 1.265625, "kl": 0.01774134172592312, "learning_rate": 7.608399999999999e-06, "loss": 0.0342, "num_tokens": 45504768.0, "reward": 2.7947754859924316, "reward_std": 0.048552006483078, "rewards/reward_fn/mean": 2.7947754859924316, "rewards/reward_fn/std": 0.048552028834819794, "step": 980 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 308.0, "completions/max_terminated_length": 308.0, "completions/mean_length": 203.03125, "completions/mean_terminated_length": 203.03125, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 0.10406279834517874, "frac_reward_zero_std": 1.0, "grad_norm": 0.0927734375, "kl": 0.025733540067449212, "learning_rate": 7.607999999999999e-06, "loss": 0.001, "num_tokens": 45550593.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 981 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 477.0, "completions/max_terminated_length": 477.0, "completions/mean_length": 222.5625, "completions/mean_terminated_length": 222.5625, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 0.10416887663095364, "frac_reward_zero_std": 0.0, "grad_norm": 2.046875, "kl": 0.023094360250979662, "learning_rate": 7.607599999999999e-06, "loss": 0.0127, "num_tokens": 45598131.0, "reward": 2.8349218368530273, "reward_std": 0.058446187525987625, "rewards/reward_fn/mean": 2.8349218368530273, "rewards/reward_fn/std": 0.05844619497656822, "step": 982 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1743.0, "completions/max_terminated_length": 1743.0, "completions/mean_length": 378.90625, "completions/mean_terminated_length": 378.90625, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 0.10427495491672854, "frac_reward_zero_std": 0.0, "grad_norm": 1.4609375, "kl": 0.028874794021248817, "learning_rate": 7.6072e-06, "loss": -0.044, "num_tokens": 45648208.0, "reward": 3.8350107669830322, "reward_std": 0.39003050327301025, "rewards/reward_fn/mean": 3.8350107669830322, "rewards/reward_fn/std": 0.39003047347068787, "step": 983 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 558.0, "completions/max_terminated_length": 558.0, "completions/mean_length": 232.03125, "completions/mean_terminated_length": 232.03125, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.10438103320250344, "frac_reward_zero_std": 0.0, "grad_norm": 1.96875, "kl": 0.0243125488050282, "learning_rate": 7.6068e-06, "loss": 0.0304, "num_tokens": 45700241.0, "reward": 3.8992538452148438, "reward_std": 0.3183940649032593, "rewards/reward_fn/mean": 3.8992538452148438, "rewards/reward_fn/std": 0.3183940649032593, "step": 984 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 663.0, "completions/max_terminated_length": 663.0, "completions/mean_length": 215.53125, "completions/mean_terminated_length": 215.53125, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 0.10448711148827836, "frac_reward_zero_std": 0.0, "grad_norm": 2.328125, "kl": 0.05102335708215833, "learning_rate": 7.6064e-06, "loss": -0.0175, "num_tokens": 45759362.0, "reward": 3.928581476211548, "reward_std": 0.40400430560112, "rewards/reward_fn/mean": 3.928581476211548, "rewards/reward_fn/std": 0.40400430560112, "step": 985 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 861.0, "completions/max_terminated_length": 861.0, "completions/mean_length": 509.96875, "completions/mean_terminated_length": 509.96875, "completions/min_length": 288.0, "completions/min_terminated_length": 288.0, "epoch": 0.10459318977405326, "frac_reward_zero_std": 0.0, "grad_norm": 1.1328125, "kl": 0.018654248444363475, "learning_rate": 7.606e-06, "loss": -0.0156, "num_tokens": 45806657.0, "reward": 3.7861359119415283, "reward_std": 0.6757104992866516, "rewards/reward_fn/mean": 3.7861359119415283, "rewards/reward_fn/std": 0.6757104396820068, "step": 986 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 385.0, "completions/max_terminated_length": 385.0, "completions/mean_length": 225.625, "completions/mean_terminated_length": 225.625, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.10469926805982815, "frac_reward_zero_std": 0.0, "grad_norm": 2.390625, "kl": 0.019475288689136505, "learning_rate": 7.6056e-06, "loss": -0.0837, "num_tokens": 45853589.0, "reward": 2.9933929443359375, "reward_std": 0.440521240234375, "rewards/reward_fn/mean": 2.9933929443359375, "rewards/reward_fn/std": 0.440521240234375, "step": 987 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 312.0, "completions/max_terminated_length": 312.0, "completions/mean_length": 220.96875, "completions/mean_terminated_length": 220.96875, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 0.10480534634560305, "frac_reward_zero_std": 0.0, "grad_norm": 2.171875, "kl": 0.02420557360164821, "learning_rate": 7.6052e-06, "loss": 0.0416, "num_tokens": 45912820.0, "reward": 3.9028656482696533, "reward_std": 0.3068977892398834, "rewards/reward_fn/mean": 3.9028656482696533, "rewards/reward_fn/std": 0.3068977892398834, "step": 988 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 446.0, "completions/max_terminated_length": 446.0, "completions/mean_length": 293.875, "completions/mean_terminated_length": 293.875, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "epoch": 0.10491142463137795, "frac_reward_zero_std": 0.0, "grad_norm": 1.78125, "kl": 0.0265792990103364, "learning_rate": 7.6048e-06, "loss": 0.0531, "num_tokens": 45961456.0, "reward": 3.8468708992004395, "reward_std": 0.41194701194763184, "rewards/reward_fn/mean": 3.8468708992004395, "rewards/reward_fn/std": 0.41194698214530945, "step": 989 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 392.0, "completions/max_terminated_length": 392.0, "completions/mean_length": 284.53125, "completions/mean_terminated_length": 284.53125, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "epoch": 0.10501750291715285, "frac_reward_zero_std": 0.0, "grad_norm": 1.4609375, "kl": 0.014853294123895466, "learning_rate": 7.6043999999999996e-06, "loss": -0.0016, "num_tokens": 46004897.0, "reward": 3.556659460067749, "reward_std": 0.4804559051990509, "rewards/reward_fn/mean": 3.556659460067749, "rewards/reward_fn/std": 0.4804559350013733, "step": 990 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 477.0, "completions/max_terminated_length": 477.0, "completions/mean_length": 148.71875, "completions/mean_terminated_length": 148.71875, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.10512358120292777, "frac_reward_zero_std": 0.0, "grad_norm": 2.890625, "kl": 0.01824998517986387, "learning_rate": 7.6039999999999995e-06, "loss": 0.1187, "num_tokens": 46042200.0, "reward": 3.964564561843872, "reward_std": 0.2004532665014267, "rewards/reward_fn/mean": 3.964564561843872, "rewards/reward_fn/std": 0.2004532814025879, "step": 991 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 318.0, "completions/max_terminated_length": 318.0, "completions/mean_length": 220.625, "completions/mean_terminated_length": 220.625, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "epoch": 0.10522965948870266, "frac_reward_zero_std": 0.0, "grad_norm": 1.7578125, "kl": 0.020736161852255464, "learning_rate": 7.6035999999999995e-06, "loss": -0.0034, "num_tokens": 46085484.0, "reward": 3.8878173828125, "reward_std": 0.3545609414577484, "rewards/reward_fn/mean": 3.8878173828125, "rewards/reward_fn/std": 0.3545609414577484, "step": 992 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 987.0, "completions/max_terminated_length": 987.0, "completions/mean_length": 276.5625, "completions/mean_terminated_length": 276.5625, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "epoch": 0.10533573777447756, "frac_reward_zero_std": 1.0, "grad_norm": 0.07373046875, "kl": 0.019082832615822554, "learning_rate": 7.6031999999999995e-06, "loss": 0.0008, "num_tokens": 46130718.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 993 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 118.0, "completions/max_terminated_length": 118.0, "completions/mean_length": 79.4375, "completions/mean_terminated_length": 79.4375, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 0.10544181606025246, "frac_reward_zero_std": 0.0, "grad_norm": 3.03125, "kl": 0.02028225746471435, "learning_rate": 7.6028e-06, "loss": 0.0617, "num_tokens": 46156524.0, "reward": 3.9314725399017334, "reward_std": 0.3876495361328125, "rewards/reward_fn/mean": 3.9314725399017334, "rewards/reward_fn/std": 0.3876495659351349, "step": 994 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 986.0, "completions/max_terminated_length": 986.0, "completions/mean_length": 261.5625, "completions/mean_terminated_length": 261.5625, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 0.10554789434602736, "frac_reward_zero_std": 1.0, "grad_norm": 0.09326171875, "kl": 0.02337950048968196, "learning_rate": 7.6024e-06, "loss": 0.0009, "num_tokens": 46201086.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 995 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 423.0, "completions/max_terminated_length": 423.0, "completions/mean_length": 177.34375, "completions/mean_terminated_length": 177.34375, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "epoch": 0.10565397263180228, "frac_reward_zero_std": 1.0, "grad_norm": 0.0869140625, "kl": 0.023212826810777187, "learning_rate": 7.602e-06, "loss": 0.0009, "num_tokens": 46232425.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 996 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 604.0, "completions/max_terminated_length": 604.0, "completions/mean_length": 310.28125, "completions/mean_terminated_length": 310.28125, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "epoch": 0.10576005091757718, "frac_reward_zero_std": 0.0, "grad_norm": 1.7421875, "kl": 0.028905097395181656, "learning_rate": 7.6016e-06, "loss": 0.0946, "num_tokens": 46271346.0, "reward": 2.8112916946411133, "reward_std": 1.1012606620788574, "rewards/reward_fn/mean": 2.8112916946411133, "rewards/reward_fn/std": 1.1012605428695679, "step": 997 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 235.0, "completions/max_terminated_length": 235.0, "completions/mean_length": 164.28125, "completions/mean_terminated_length": 164.28125, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 0.10586612920335207, "frac_reward_zero_std": 0.0, "grad_norm": 2.0625, "kl": 0.023424757411703467, "learning_rate": 7.6012e-06, "loss": 0.047, "num_tokens": 46305499.0, "reward": 3.071727991104126, "reward_std": 0.04227209836244583, "rewards/reward_fn/mean": 3.071727991104126, "rewards/reward_fn/std": 0.042272068560123444, "step": 998 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 310.0, "completions/max_terminated_length": 310.0, "completions/mean_length": 215.46875, "completions/mean_terminated_length": 215.46875, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 0.10597220748912697, "frac_reward_zero_std": 0.0, "grad_norm": 1.8125, "kl": 0.020644933450967073, "learning_rate": 7.600799999999999e-06, "loss": 0.0149, "num_tokens": 46356042.0, "reward": 3.8897316455841064, "reward_std": 0.4581899642944336, "rewards/reward_fn/mean": 3.8897316455841064, "rewards/reward_fn/std": 0.458189994096756, "step": 999 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 159.0, "completions/max_terminated_length": 159.0, "completions/mean_length": 102.78125, "completions/mean_terminated_length": 102.78125, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "epoch": 0.10607828577490187, "frac_reward_zero_std": 1.0, "grad_norm": 0.08740234375, "kl": 0.016965405957307667, "learning_rate": 7.600399999999999e-06, "loss": 0.0007, "num_tokens": 46400163.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1000 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 249.0, "completions/max_terminated_length": 249.0, "completions/mean_length": 167.6875, "completions/mean_terminated_length": 167.6875, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "epoch": 0.10618436406067679, "frac_reward_zero_std": 1.0, "grad_norm": 0.11474609375, "kl": 0.02575285453349352, "learning_rate": 7.599999999999999e-06, "loss": 0.001, "num_tokens": 46440729.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1001 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 630.0, "completions/max_terminated_length": 630.0, "completions/mean_length": 316.84375, "completions/mean_terminated_length": 316.84375, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "epoch": 0.10629044234645169, "frac_reward_zero_std": 0.0, "grad_norm": 2.015625, "kl": 0.0235441483091563, "learning_rate": 7.599599999999999e-06, "loss": 0.0518, "num_tokens": 46485972.0, "reward": 2.7349283695220947, "reward_std": 0.3006006181240082, "rewards/reward_fn/mean": 2.7349283695220947, "rewards/reward_fn/std": 0.3006005883216858, "step": 1002 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 653.0, "completions/max_terminated_length": 653.0, "completions/mean_length": 311.46875, "completions/mean_terminated_length": 311.46875, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 0.10639652063222658, "frac_reward_zero_std": 0.0, "grad_norm": 1.2578125, "kl": 0.019770273473113775, "learning_rate": 7.599199999999999e-06, "loss": 0.0655, "num_tokens": 46531491.0, "reward": 3.046574831008911, "reward_std": 0.034899428486824036, "rewards/reward_fn/mean": 3.046574831008911, "rewards/reward_fn/std": 0.034899454563856125, "step": 1003 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1233.0, "completions/max_terminated_length": 1233.0, "completions/mean_length": 219.84375, "completions/mean_terminated_length": 219.84375, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 0.10650259891800148, "frac_reward_zero_std": 0.0, "grad_norm": 2.03125, "kl": 0.022727035451680422, "learning_rate": 7.598799999999999e-06, "loss": 0.26, "num_tokens": 46583710.0, "reward": 2.8298349380493164, "reward_std": 0.035851918160915375, "rewards/reward_fn/mean": 2.8298349380493164, "rewards/reward_fn/std": 0.03585192188620567, "step": 1004 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1732.0, "completions/max_terminated_length": 1732.0, "completions/mean_length": 469.0625, "completions/mean_terminated_length": 469.0625, "completions/min_length": 279.0, "completions/min_terminated_length": 279.0, "epoch": 0.10660867720377638, "frac_reward_zero_std": 0.0, "grad_norm": 1.1328125, "kl": 0.017710814368911088, "learning_rate": 7.598399999999999e-06, "loss": -0.0522, "num_tokens": 46648288.0, "reward": 3.8750860691070557, "reward_std": 0.3370124101638794, "rewards/reward_fn/mean": 3.8750860691070557, "rewards/reward_fn/std": 0.337012380361557, "step": 1005 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2024.0, "completions/max_terminated_length": 2024.0, "completions/mean_length": 510.9375, "completions/mean_terminated_length": 510.9375, "completions/min_length": 283.0, "completions/min_terminated_length": 283.0, "epoch": 0.10671475548955128, "frac_reward_zero_std": 0.0, "grad_norm": 1.4609375, "kl": 0.018817924661561847, "learning_rate": 7.598e-06, "loss": 0.0941, "num_tokens": 46701182.0, "reward": 3.7116811275482178, "reward_std": 0.6552125811576843, "rewards/reward_fn/mean": 3.7116811275482178, "rewards/reward_fn/std": 0.6552125215530396, "step": 1006 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1690.0, "completions/max_terminated_length": 1690.0, "completions/mean_length": 590.78125, "completions/mean_terminated_length": 590.78125, "completions/min_length": 321.0, "completions/min_terminated_length": 321.0, "epoch": 0.1068208337753262, "frac_reward_zero_std": 0.0, "grad_norm": 1.2109375, "kl": 0.020602114964276552, "learning_rate": 7.5976e-06, "loss": -0.0235, "num_tokens": 46758519.0, "reward": 2.503760814666748, "reward_std": 0.6346949338912964, "rewards/reward_fn/mean": 2.503760814666748, "rewards/reward_fn/std": 0.6346949338912964, "step": 1007 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 422.0, "completions/max_terminated_length": 422.0, "completions/mean_length": 277.1875, "completions/mean_terminated_length": 277.1875, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 0.1069269120611011, "frac_reward_zero_std": 0.0, "grad_norm": 1.6484375, "kl": 0.018794673145748675, "learning_rate": 7.5972e-06, "loss": 0.0631, "num_tokens": 46811837.0, "reward": 2.8126931190490723, "reward_std": 0.046026017516851425, "rewards/reward_fn/mean": 2.8126931190490723, "rewards/reward_fn/std": 0.04602604731917381, "step": 1008 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 556.0, "completions/mean_length": 370.4375, "completions/mean_terminated_length": 316.32257080078125, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 0.107032990346876, "frac_reward_zero_std": 0.0, "grad_norm": 1.9140625, "kl": 0.02782504353672266, "learning_rate": 7.5968e-06, "loss": 0.3013, "num_tokens": 46857707.0, "reward": 3.1419944763183594, "reward_std": 0.6233690977096558, "rewards/reward_fn/mean": 3.1419944763183594, "rewards/reward_fn/std": 0.6233690977096558, "step": 1009 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 613.0, "completions/max_terminated_length": 613.0, "completions/mean_length": 101.96875, "completions/mean_terminated_length": 101.96875, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "epoch": 0.1071390686326509, "frac_reward_zero_std": 1.0, "grad_norm": 0.201171875, "kl": 0.022366830613464117, "learning_rate": 7.5964e-06, "loss": 0.0009, "num_tokens": 46896874.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1010 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 834.0, "completions/max_terminated_length": 834.0, "completions/mean_length": 425.25, "completions/mean_terminated_length": 425.25, "completions/min_length": 269.0, "completions/min_terminated_length": 269.0, "epoch": 0.1072451469184258, "frac_reward_zero_std": 0.0, "grad_norm": 1.53125, "kl": 0.025819960748776793, "learning_rate": 7.596e-06, "loss": 0.0277, "num_tokens": 46952274.0, "reward": 2.5166473388671875, "reward_std": 0.42934849858283997, "rewards/reward_fn/mean": 2.5166473388671875, "rewards/reward_fn/std": 0.42934852838516235, "step": 1011 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1864.0, "completions/mean_length": 967.0, "completions/mean_terminated_length": 894.933349609375, "completions/min_length": 384.0, "completions/min_terminated_length": 384.0, "epoch": 0.1073512252042007, "frac_reward_zero_std": 0.0, "grad_norm": 0.9453125, "kl": 0.01642954268027097, "learning_rate": 7.5956e-06, "loss": 0.1213, "num_tokens": 47010386.0, "reward": 2.331674337387085, "reward_std": 0.7580024003982544, "rewards/reward_fn/mean": 2.331674337387085, "rewards/reward_fn/std": 0.7580023407936096, "step": 1012 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 245.0, "completions/max_terminated_length": 245.0, "completions/mean_length": 177.8125, "completions/mean_terminated_length": 177.8125, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 0.1074573034899756, "frac_reward_zero_std": 1.0, "grad_norm": 0.09423828125, "kl": 0.021480249939486384, "learning_rate": 7.5952e-06, "loss": 0.0009, "num_tokens": 47050796.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1013 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 839.0, "completions/max_terminated_length": 839.0, "completions/mean_length": 278.1875, "completions/mean_terminated_length": 278.1875, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 0.1075633817757505, "frac_reward_zero_std": 0.0, "grad_norm": 1.734375, "kl": 0.028301968472078443, "learning_rate": 7.5948e-06, "loss": 0.0219, "num_tokens": 47100498.0, "reward": 3.1440305709838867, "reward_std": 0.5901278853416443, "rewards/reward_fn/mean": 3.1440305709838867, "rewards/reward_fn/std": 0.5901278257369995, "step": 1014 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 365.0, "completions/max_terminated_length": 365.0, "completions/mean_length": 220.96875, "completions/mean_terminated_length": 220.96875, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 0.1076694600615254, "frac_reward_zero_std": 1.0, "grad_norm": 0.10595703125, "kl": 0.025707697961479425, "learning_rate": 7.5944e-06, "loss": 0.001, "num_tokens": 47161393.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1015 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 955.0, "completions/mean_length": 632.0625, "completions/mean_terminated_length": 586.3870849609375, "completions/min_length": 318.0, "completions/min_terminated_length": 318.0, "epoch": 0.1077755383473003, "frac_reward_zero_std": 0.0, "grad_norm": 1.2890625, "kl": 0.02113516186363995, "learning_rate": 7.594e-06, "loss": 0.2314, "num_tokens": 47223315.0, "reward": 2.526656150817871, "reward_std": 0.6006430983543396, "rewards/reward_fn/mean": 2.526656150817871, "rewards/reward_fn/std": 0.6006431579589844, "step": 1016 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1488.0, "completions/max_terminated_length": 1488.0, "completions/mean_length": 676.65625, "completions/mean_terminated_length": 676.65625, "completions/min_length": 400.0, "completions/min_terminated_length": 400.0, "epoch": 0.1078816166330752, "frac_reward_zero_std": 0.0, "grad_norm": 1.0703125, "kl": 0.01965883933007717, "learning_rate": 7.5936e-06, "loss": -0.0026, "num_tokens": 47279528.0, "reward": 2.664598226547241, "reward_std": 0.19238144159317017, "rewards/reward_fn/mean": 2.664598226547241, "rewards/reward_fn/std": 0.19238145649433136, "step": 1017 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 687.0, "completions/max_terminated_length": 687.0, "completions/mean_length": 166.8125, "completions/mean_terminated_length": 166.8125, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.10798769491885012, "frac_reward_zero_std": 1.0, "grad_norm": 0.10498046875, "kl": 0.02208179165609181, "learning_rate": 7.5932e-06, "loss": 0.0009, "num_tokens": 47334466.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1018 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1037.0, "completions/max_terminated_length": 1037.0, "completions/mean_length": 287.625, "completions/mean_terminated_length": 287.625, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 0.10809377320462502, "frac_reward_zero_std": 0.0, "grad_norm": 2.390625, "kl": 0.030248204711824656, "learning_rate": 7.5928e-06, "loss": 0.1147, "num_tokens": 47382646.0, "reward": 3.9577441215515137, "reward_std": 0.23903484642505646, "rewards/reward_fn/mean": 3.9577441215515137, "rewards/reward_fn/std": 0.23903487622737885, "step": 1019 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1610.0, "completions/max_terminated_length": 1610.0, "completions/mean_length": 223.875, "completions/mean_terminated_length": 223.875, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 0.10819985149039991, "frac_reward_zero_std": 0.0, "grad_norm": 1.875, "kl": 0.02753225597552955, "learning_rate": 7.5923999999999995e-06, "loss": -0.1361, "num_tokens": 47424050.0, "reward": 3.9119248390197754, "reward_std": 0.27822986245155334, "rewards/reward_fn/mean": 3.9119248390197754, "rewards/reward_fn/std": 0.27822983264923096, "step": 1020 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1636.0, "completions/mean_length": 445.1875, "completions/mean_terminated_length": 393.4838562011719, "completions/min_length": 248.0, "completions/min_terminated_length": 248.0, "epoch": 0.10830592977617481, "frac_reward_zero_std": 0.0, "grad_norm": 1.75, "kl": 0.025169007247313857, "learning_rate": 7.5919999999999995e-06, "loss": 0.2699, "num_tokens": 47475288.0, "reward": 3.875, "reward_std": 0.7071067690849304, "rewards/reward_fn/mean": 3.875, "rewards/reward_fn/std": 0.7071067690849304, "step": 1021 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 665.0, "completions/max_terminated_length": 665.0, "completions/mean_length": 432.5, "completions/mean_terminated_length": 432.5, "completions/min_length": 275.0, "completions/min_terminated_length": 275.0, "epoch": 0.10841200806194971, "frac_reward_zero_std": 0.0, "grad_norm": 1.4453125, "kl": 0.020216177916154265, "learning_rate": 7.5915999999999994e-06, "loss": 0.0523, "num_tokens": 47528808.0, "reward": 2.7419400215148926, "reward_std": 0.19625967741012573, "rewards/reward_fn/mean": 2.7419400215148926, "rewards/reward_fn/std": 0.19625964760780334, "step": 1022 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 995.0, "completions/max_terminated_length": 995.0, "completions/mean_length": 570.21875, "completions/mean_terminated_length": 570.21875, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "epoch": 0.10851808634772463, "frac_reward_zero_std": 0.0, "grad_norm": 1.4296875, "kl": 0.024952333886176348, "learning_rate": 7.591199999999999e-06, "loss": 0.0449, "num_tokens": 47584559.0, "reward": 2.8316707611083984, "reward_std": 0.03383928909897804, "rewards/reward_fn/mean": 2.8316707611083984, "rewards/reward_fn/std": 0.033839285373687744, "step": 1023 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 404.0, "completions/max_terminated_length": 404.0, "completions/mean_length": 245.625, "completions/mean_terminated_length": 245.625, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 0.10862416463349953, "frac_reward_zero_std": 0.0, "grad_norm": 1.984375, "kl": 0.03429146436974406, "learning_rate": 7.590799999999999e-06, "loss": 0.0576, "num_tokens": 47635491.0, "reward": 3.8389077186584473, "reward_std": 0.38074901700019836, "rewards/reward_fn/mean": 3.8389077186584473, "rewards/reward_fn/std": 0.38074901700019836, "step": 1024 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 777.0, "completions/max_terminated_length": 777.0, "completions/mean_length": 473.84375, "completions/mean_terminated_length": 473.84375, "completions/min_length": 272.0, "completions/min_terminated_length": 272.0, "epoch": 0.10873024291927443, "frac_reward_zero_std": 0.0, "grad_norm": 1.46875, "kl": 0.022358035668730736, "learning_rate": 7.590399999999999e-06, "loss": 0.04, "num_tokens": 47704702.0, "reward": 2.8870303630828857, "reward_std": 0.09213743358850479, "rewards/reward_fn/mean": 2.8870303630828857, "rewards/reward_fn/std": 0.09213750809431076, "step": 1025 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 803.0, "completions/max_terminated_length": 803.0, "completions/mean_length": 356.9375, "completions/mean_terminated_length": 356.9375, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "epoch": 0.10883632120504932, "frac_reward_zero_std": 0.0, "grad_norm": 1.609375, "kl": 0.02047534054145217, "learning_rate": 7.589999999999999e-06, "loss": 0.1106, "num_tokens": 47758972.0, "reward": 3.618471145629883, "reward_std": 0.714113712310791, "rewards/reward_fn/mean": 3.618471145629883, "rewards/reward_fn/std": 0.714113712310791, "step": 1026 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 376.0, "completions/max_terminated_length": 376.0, "completions/mean_length": 190.65625, "completions/mean_terminated_length": 190.65625, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "epoch": 0.10894239949082422, "frac_reward_zero_std": 1.0, "grad_norm": 0.15234375, "kl": 0.05442382441833615, "learning_rate": 7.589599999999999e-06, "loss": 0.0022, "num_tokens": 47801393.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1027 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 824.0, "completions/max_terminated_length": 824.0, "completions/mean_length": 286.5, "completions/mean_terminated_length": 286.5, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "epoch": 0.10904847777659914, "frac_reward_zero_std": 0.0, "grad_norm": 1.8828125, "kl": 0.022991040954366326, "learning_rate": 7.589199999999999e-06, "loss": 0.0042, "num_tokens": 47844385.0, "reward": 2.956698179244995, "reward_std": 0.3429322838783264, "rewards/reward_fn/mean": 2.956698179244995, "rewards/reward_fn/std": 0.34293225407600403, "step": 1028 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 315.0, "completions/max_terminated_length": 315.0, "completions/mean_length": 121.875, "completions/mean_terminated_length": 121.875, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "epoch": 0.10915455606237404, "frac_reward_zero_std": 0.0, "grad_norm": 3.171875, "kl": 0.030354263726621866, "learning_rate": 7.588799999999999e-06, "loss": 0.1608, "num_tokens": 47876573.0, "reward": 3.9108946323394775, "reward_std": 0.2824559509754181, "rewards/reward_fn/mean": 3.9108946323394775, "rewards/reward_fn/std": 0.2824559211730957, "step": 1029 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 283.0, "completions/max_terminated_length": 283.0, "completions/mean_length": 178.28125, "completions/mean_terminated_length": 178.28125, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 0.10926063434814894, "frac_reward_zero_std": 0.0, "grad_norm": 2.265625, "kl": 0.02428922988474369, "learning_rate": 7.5884e-06, "loss": 0.0219, "num_tokens": 47910662.0, "reward": 3.896930456161499, "reward_std": 0.3276534676551819, "rewards/reward_fn/mean": 3.896930456161499, "rewards/reward_fn/std": 0.3276534676551819, "step": 1030 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2019.0, "completions/max_terminated_length": 2019.0, "completions/mean_length": 508.96875, "completions/mean_terminated_length": 508.96875, "completions/min_length": 223.0, "completions/min_terminated_length": 223.0, "epoch": 0.10936671263392383, "frac_reward_zero_std": 0.0, "grad_norm": 1.2421875, "kl": 0.02369822352193296, "learning_rate": 7.588e-06, "loss": 0.0164, "num_tokens": 47948101.0, "reward": 3.1501495838165283, "reward_std": 0.7063373923301697, "rewards/reward_fn/mean": 3.1501495838165283, "rewards/reward_fn/std": 0.7063372731208801, "step": 1031 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1060.0, "completions/mean_length": 404.75, "completions/mean_terminated_length": 351.7419128417969, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "epoch": 0.10947279091969873, "frac_reward_zero_std": 0.0, "grad_norm": 1.78125, "kl": 0.025314223021268845, "learning_rate": 7.5876e-06, "loss": 0.2811, "num_tokens": 47982717.0, "reward": 3.2338669300079346, "reward_std": 0.8930133581161499, "rewards/reward_fn/mean": 3.2338669300079346, "rewards/reward_fn/std": 0.8930133581161499, "step": 1032 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 579.0, "completions/max_terminated_length": 579.0, "completions/mean_length": 240.5625, "completions/mean_terminated_length": 240.5625, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 0.10957886920547363, "frac_reward_zero_std": 0.0, "grad_norm": 2.0, "kl": 0.023625839967280626, "learning_rate": 7.5872e-06, "loss": -0.0313, "num_tokens": 48036559.0, "reward": 3.2713961601257324, "reward_std": 0.5741091370582581, "rewards/reward_fn/mean": 3.2713961601257324, "rewards/reward_fn/std": 0.5741091966629028, "step": 1033 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 842.0, "completions/max_terminated_length": 842.0, "completions/mean_length": 434.53125, "completions/mean_terminated_length": 434.53125, "completions/min_length": 280.0, "completions/min_terminated_length": 280.0, "epoch": 0.10968494749124855, "frac_reward_zero_std": 0.0, "grad_norm": 1.296875, "kl": 0.021874976810067892, "learning_rate": 7.5868e-06, "loss": -0.033, "num_tokens": 48087968.0, "reward": 2.664581298828125, "reward_std": 0.03544781729578972, "rewards/reward_fn/mean": 2.664581298828125, "rewards/reward_fn/std": 0.03544781729578972, "step": 1034 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 298.0, "completions/max_terminated_length": 298.0, "completions/mean_length": 165.59375, "completions/mean_terminated_length": 165.59375, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "epoch": 0.10979102577702345, "frac_reward_zero_std": 0.0, "grad_norm": 2.53125, "kl": 0.024668074445798993, "learning_rate": 7.5864e-06, "loss": -0.0904, "num_tokens": 48128627.0, "reward": 2.9213130474090576, "reward_std": 0.2021249383687973, "rewards/reward_fn/mean": 2.9213130474090576, "rewards/reward_fn/std": 0.2021249383687973, "step": 1035 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1630.0, "completions/max_terminated_length": 1630.0, "completions/mean_length": 583.875, "completions/mean_terminated_length": 583.875, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "epoch": 0.10989710406279835, "frac_reward_zero_std": 0.0, "grad_norm": 1.0625, "kl": 0.017253244761377573, "learning_rate": 7.586e-06, "loss": -0.0018, "num_tokens": 48182063.0, "reward": 3.2175590991973877, "reward_std": 0.9283421635627747, "rewards/reward_fn/mean": 3.2175590991973877, "rewards/reward_fn/std": 0.9283421635627747, "step": 1036 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 764.0, "completions/max_terminated_length": 764.0, "completions/mean_length": 272.5, "completions/mean_terminated_length": 272.5, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.11000318234857324, "frac_reward_zero_std": 1.0, "grad_norm": 0.0908203125, "kl": 0.022420917404815555, "learning_rate": 7.5856e-06, "loss": 0.0009, "num_tokens": 48225567.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1037 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 625.0, "completions/max_terminated_length": 625.0, "completions/mean_length": 394.40625, "completions/mean_terminated_length": 394.40625, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "epoch": 0.11010926063434814, "frac_reward_zero_std": 0.0, "grad_norm": 1.8515625, "kl": 0.01980700553394854, "learning_rate": 7.5852e-06, "loss": 0.0774, "num_tokens": 48293036.0, "reward": 2.6475255489349365, "reward_std": 0.2643485963344574, "rewards/reward_fn/mean": 2.6475255489349365, "rewards/reward_fn/std": 0.2643485963344574, "step": 1038 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 615.0, "completions/max_terminated_length": 615.0, "completions/mean_length": 287.40625, "completions/mean_terminated_length": 287.40625, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.11021533892012306, "frac_reward_zero_std": 0.0, "grad_norm": 2.046875, "kl": 0.026065111625939608, "learning_rate": 7.5848e-06, "loss": 0.0832, "num_tokens": 48346649.0, "reward": 3.965364456176758, "reward_std": 0.19592823088169098, "rewards/reward_fn/mean": 3.965364456176758, "rewards/reward_fn/std": 0.19592821598052979, "step": 1039 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 216.0, "completions/max_terminated_length": 216.0, "completions/mean_length": 151.875, "completions/mean_terminated_length": 151.875, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "epoch": 0.11032141720589796, "frac_reward_zero_std": 1.0, "grad_norm": 0.12109375, "kl": 0.021192287211306393, "learning_rate": 7.584399999999999e-06, "loss": 0.0008, "num_tokens": 48383221.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1040 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 257.0, "completions/max_terminated_length": 257.0, "completions/mean_length": 174.34375, "completions/mean_terminated_length": 174.34375, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.11042749549167286, "frac_reward_zero_std": 0.0, "grad_norm": 2.453125, "kl": 0.021020429907366633, "learning_rate": 7.583999999999999e-06, "loss": 0.0779, "num_tokens": 48431456.0, "reward": 3.859943151473999, "reward_std": 0.3045395612716675, "rewards/reward_fn/mean": 3.859943151473999, "rewards/reward_fn/std": 0.30453959107398987, "step": 1041 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 655.0, "completions/max_terminated_length": 655.0, "completions/mean_length": 286.65625, "completions/mean_terminated_length": 286.65625, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "epoch": 0.11053357377744776, "frac_reward_zero_std": 0.0, "grad_norm": 1.953125, "kl": 0.029660561122000217, "learning_rate": 7.5836e-06, "loss": 0.1015, "num_tokens": 48497877.0, "reward": 3.7863168716430664, "reward_std": 0.6171298027038574, "rewards/reward_fn/mean": 3.7863168716430664, "rewards/reward_fn/std": 0.6171298027038574, "step": 1042 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 294.0, "completions/max_terminated_length": 294.0, "completions/mean_length": 205.34375, "completions/mean_terminated_length": 205.34375, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.11063965206322265, "frac_reward_zero_std": 0.0, "grad_norm": 2.09375, "kl": 0.033476054668426514, "learning_rate": 7.5832e-06, "loss": -0.0209, "num_tokens": 48536768.0, "reward": 3.891145706176758, "reward_std": 0.3448222577571869, "rewards/reward_fn/mean": 3.891145706176758, "rewards/reward_fn/std": 0.3448222279548645, "step": 1043 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1634.0, "completions/max_terminated_length": 1634.0, "completions/mean_length": 546.5625, "completions/mean_terminated_length": 546.5625, "completions/min_length": 331.0, "completions/min_terminated_length": 331.0, "epoch": 0.11074573034899755, "frac_reward_zero_std": 0.0, "grad_norm": 1.109375, "kl": 0.019996803253889084, "learning_rate": 7.5828e-06, "loss": -0.0001, "num_tokens": 48590354.0, "reward": 2.7087881565093994, "reward_std": 0.3276941478252411, "rewards/reward_fn/mean": 2.7087881565093994, "rewards/reward_fn/std": 0.3276940882205963, "step": 1044 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1202.0, "completions/max_terminated_length": 1202.0, "completions/mean_length": 487.53125, "completions/mean_terminated_length": 487.53125, "completions/min_length": 259.0, "completions/min_terminated_length": 259.0, "epoch": 0.11085180863477247, "frac_reward_zero_std": 1.0, "grad_norm": 0.057373046875, "kl": 0.02068508369848132, "learning_rate": 7.5824e-06, "loss": 0.0008, "num_tokens": 48642499.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1045 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 877.0, "completions/max_terminated_length": 877.0, "completions/mean_length": 200.09375, "completions/mean_terminated_length": 200.09375, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.11095788692054737, "frac_reward_zero_std": 0.0, "grad_norm": 2.40625, "kl": 0.043467214331030846, "learning_rate": 7.5819999999999996e-06, "loss": -0.021, "num_tokens": 48691526.0, "reward": 3.014760971069336, "reward_std": 0.32387152314186096, "rewards/reward_fn/mean": 3.014760971069336, "rewards/reward_fn/std": 0.32387155294418335, "step": 1046 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 173.0, "completions/max_terminated_length": 173.0, "completions/mean_length": 147.09375, "completions/mean_terminated_length": 147.09375, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 0.11106396520632227, "frac_reward_zero_std": 0.0, "grad_norm": 2.125, "kl": 0.015362325706519186, "learning_rate": 7.5815999999999995e-06, "loss": 0.0156, "num_tokens": 48732105.0, "reward": 3.898536205291748, "reward_std": 0.4298121929168701, "rewards/reward_fn/mean": 3.898536205291748, "rewards/reward_fn/std": 0.42981216311454773, "step": 1047 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 394.0, "completions/max_terminated_length": 394.0, "completions/mean_length": 256.1875, "completions/mean_terminated_length": 256.1875, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "epoch": 0.11117004349209716, "frac_reward_zero_std": 0.0, "grad_norm": 1.4609375, "kl": 0.02533377055078745, "learning_rate": 7.5811999999999995e-06, "loss": 0.0145, "num_tokens": 48787151.0, "reward": 1.7609915733337402, "reward_std": 0.20412206649780273, "rewards/reward_fn/mean": 1.7609915733337402, "rewards/reward_fn/std": 0.20412209630012512, "step": 1048 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 508.0, "completions/max_terminated_length": 508.0, "completions/mean_length": 237.09375, "completions/mean_terminated_length": 237.09375, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 0.11127612177787206, "frac_reward_zero_std": 0.0, "grad_norm": 1.8671875, "kl": 0.02054914110340178, "learning_rate": 7.5807999999999995e-06, "loss": 0.0838, "num_tokens": 48841618.0, "reward": 3.9238462448120117, "reward_std": 0.2997784912586212, "rewards/reward_fn/mean": 3.9238462448120117, "rewards/reward_fn/std": 0.29977843165397644, "step": 1049 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1064.0, "completions/max_terminated_length": 1064.0, "completions/mean_length": 245.75, "completions/mean_terminated_length": 245.75, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 0.11138220006364698, "frac_reward_zero_std": 0.0, "grad_norm": 2.0, "kl": 0.021705527789890766, "learning_rate": 7.5803999999999995e-06, "loss": 0.0682, "num_tokens": 48871338.0, "reward": 3.700695037841797, "reward_std": 0.5276238918304443, "rewards/reward_fn/mean": 3.700695037841797, "rewards/reward_fn/std": 0.5276238918304443, "step": 1050 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1289.0, "completions/max_terminated_length": 1289.0, "completions/mean_length": 485.65625, "completions/mean_terminated_length": 485.65625, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "epoch": 0.11148827834942188, "frac_reward_zero_std": 0.0, "grad_norm": 1.21875, "kl": 0.018874148838222027, "learning_rate": 7.5799999999999994e-06, "loss": 0.0752, "num_tokens": 48906847.0, "reward": 3.036792278289795, "reward_std": 0.6571252346038818, "rewards/reward_fn/mean": 3.036792278289795, "rewards/reward_fn/std": 0.6571252942085266, "step": 1051 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 855.0, "completions/mean_length": 537.78125, "completions/mean_terminated_length": 489.06451416015625, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "epoch": 0.11159435663519678, "frac_reward_zero_std": 0.0, "grad_norm": 1.2578125, "kl": 0.02108449419029057, "learning_rate": 7.579599999999999e-06, "loss": 0.2119, "num_tokens": 48944184.0, "reward": 2.643126964569092, "reward_std": 0.8047422170639038, "rewards/reward_fn/mean": 2.643126964569092, "rewards/reward_fn/std": 0.8047422170639038, "step": 1052 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 157.0, "completions/max_terminated_length": 157.0, "completions/mean_length": 112.3125, "completions/mean_terminated_length": 112.3125, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "epoch": 0.11170043492097168, "frac_reward_zero_std": 1.0, "grad_norm": 0.15234375, "kl": 0.02645600028336048, "learning_rate": 7.5792e-06, "loss": 0.0011, "num_tokens": 48984898.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1053 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 730.0, "completions/max_terminated_length": 730.0, "completions/mean_length": 215.65625, "completions/mean_terminated_length": 215.65625, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 0.11180651320674657, "frac_reward_zero_std": 1.0, "grad_norm": 0.10009765625, "kl": 0.020799231133423746, "learning_rate": 7.5788e-06, "loss": 0.0008, "num_tokens": 49017527.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1054 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 434.0, "completions/max_terminated_length": 434.0, "completions/mean_length": 148.71875, "completions/mean_terminated_length": 148.71875, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "epoch": 0.11191259149252149, "frac_reward_zero_std": 0.0, "grad_norm": 2.59375, "kl": 0.02697490993887186, "learning_rate": 7.5784e-06, "loss": 0.0291, "num_tokens": 49061166.0, "reward": 3.6497604846954346, "reward_std": 0.5690338611602783, "rewards/reward_fn/mean": 3.6497604846954346, "rewards/reward_fn/std": 0.5690338611602783, "step": 1055 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1170.0, "completions/max_terminated_length": 1170.0, "completions/mean_length": 248.40625, "completions/mean_terminated_length": 248.40625, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "epoch": 0.11201866977829639, "frac_reward_zero_std": 0.0, "grad_norm": 2.25, "kl": 0.028018145821988583, "learning_rate": 7.578e-06, "loss": 0.187, "num_tokens": 49101179.0, "reward": 3.8402042388916016, "reward_std": 0.4300036132335663, "rewards/reward_fn/mean": 3.8402042388916016, "rewards/reward_fn/std": 0.4300036132335663, "step": 1056 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 231.0, "completions/max_terminated_length": 231.0, "completions/mean_length": 121.96875, "completions/mean_terminated_length": 121.96875, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "epoch": 0.11212474806407129, "frac_reward_zero_std": 0.0, "grad_norm": 3.0, "kl": 0.02238781377673149, "learning_rate": 7.5776e-06, "loss": 0.1305, "num_tokens": 49142554.0, "reward": 3.9166479110717773, "reward_std": 0.32798945903778076, "rewards/reward_fn/mean": 3.9166479110717773, "rewards/reward_fn/std": 0.32798945903778076, "step": 1057 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 466.0, "completions/max_terminated_length": 466.0, "completions/mean_length": 107.09375, "completions/mean_terminated_length": 107.09375, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "epoch": 0.11223082634984619, "frac_reward_zero_std": 1.0, "grad_norm": 0.2294921875, "kl": 0.029718552948907018, "learning_rate": 7.5772e-06, "loss": 0.0012, "num_tokens": 49190333.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1058 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 600.0, "completions/max_terminated_length": 600.0, "completions/mean_length": 378.3125, "completions/mean_terminated_length": 378.3125, "completions/min_length": 246.0, "completions/min_terminated_length": 246.0, "epoch": 0.11233690463562108, "frac_reward_zero_std": 0.0, "grad_norm": 1.34375, "kl": 0.020035079680383205, "learning_rate": 7.5768e-06, "loss": -0.0189, "num_tokens": 49247655.0, "reward": 3.7000904083251953, "reward_std": 0.5648115277290344, "rewards/reward_fn/mean": 3.7000904083251953, "rewards/reward_fn/std": 0.5648115277290344, "step": 1059 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 112.0, "completions/max_terminated_length": 112.0, "completions/mean_length": 84.5, "completions/mean_terminated_length": 84.5, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "epoch": 0.11244298292139598, "frac_reward_zero_std": 1.0, "grad_norm": 0.1669921875, "kl": 0.01883522653952241, "learning_rate": 7.576399999999999e-06, "loss": 0.0008, "num_tokens": 49292759.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1060 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 926.0, "completions/max_terminated_length": 926.0, "completions/mean_length": 416.625, "completions/mean_terminated_length": 416.625, "completions/min_length": 242.0, "completions/min_terminated_length": 242.0, "epoch": 0.1125490612071709, "frac_reward_zero_std": 0.0, "grad_norm": 1.4609375, "kl": 0.0226012347266078, "learning_rate": 7.575999999999999e-06, "loss": 0.0211, "num_tokens": 49321707.0, "reward": 3.0304574966430664, "reward_std": 0.6895338892936707, "rewards/reward_fn/mean": 3.0304574966430664, "rewards/reward_fn/std": 0.6895338296890259, "step": 1061 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 137.0, "completions/max_terminated_length": 137.0, "completions/mean_length": 97.875, "completions/mean_terminated_length": 97.875, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "epoch": 0.1126551394929458, "frac_reward_zero_std": 1.0, "grad_norm": 0.171875, "kl": 0.02689792774617672, "learning_rate": 7.575599999999999e-06, "loss": 0.0011, "num_tokens": 49369447.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1062 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1449.0, "completions/max_terminated_length": 1449.0, "completions/mean_length": 288.6875, "completions/mean_terminated_length": 288.6875, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 0.1127612177787207, "frac_reward_zero_std": 1.0, "grad_norm": 0.10107421875, "kl": 0.026896960800513625, "learning_rate": 7.575199999999999e-06, "loss": 0.0011, "num_tokens": 49407837.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1063 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 333.0, "completions/max_terminated_length": 333.0, "completions/mean_length": 242.8125, "completions/mean_terminated_length": 242.8125, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 0.1128672960644956, "frac_reward_zero_std": 0.0, "grad_norm": 1.75, "kl": 0.03066409262828529, "learning_rate": 7.574799999999999e-06, "loss": 0.0108, "num_tokens": 49437655.0, "reward": 3.775486469268799, "reward_std": 0.6500386595726013, "rewards/reward_fn/mean": 3.775486469268799, "rewards/reward_fn/std": 0.6500386595726013, "step": 1064 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 721.0, "completions/max_terminated_length": 721.0, "completions/mean_length": 416.5, "completions/mean_terminated_length": 416.5, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "epoch": 0.1129733743502705, "frac_reward_zero_std": 0.0, "grad_norm": 1.3046875, "kl": 0.021771566243842244, "learning_rate": 7.5744e-06, "loss": -0.0615, "num_tokens": 49483047.0, "reward": 3.53684139251709, "reward_std": 0.9053143858909607, "rewards/reward_fn/mean": 3.53684139251709, "rewards/reward_fn/std": 0.9053143858909607, "step": 1065 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 211.0, "completions/max_terminated_length": 211.0, "completions/mean_length": 160.03125, "completions/mean_terminated_length": 160.03125, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 0.11307945263604541, "frac_reward_zero_std": 1.0, "grad_norm": 0.109375, "kl": 0.020757366786710918, "learning_rate": 7.574e-06, "loss": 0.0008, "num_tokens": 49539272.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1066 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 308.0, "completions/max_terminated_length": 308.0, "completions/mean_length": 171.625, "completions/mean_terminated_length": 171.625, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.11318553092182031, "frac_reward_zero_std": 1.0, "grad_norm": 0.111328125, "kl": 0.02480534859932959, "learning_rate": 7.5736e-06, "loss": 0.001, "num_tokens": 49575612.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1067 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 635.0, "completions/max_terminated_length": 635.0, "completions/mean_length": 343.0, "completions/mean_terminated_length": 343.0, "completions/min_length": 251.0, "completions/min_terminated_length": 251.0, "epoch": 0.1132916092075952, "frac_reward_zero_std": 0.0, "grad_norm": 1.3203125, "kl": 0.019847781863063574, "learning_rate": 7.5732e-06, "loss": 0.0209, "num_tokens": 49620540.0, "reward": 2.8053698539733887, "reward_std": 0.044293008744716644, "rewards/reward_fn/mean": 2.8053698539733887, "rewards/reward_fn/std": 0.04429301992058754, "step": 1068 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 846.0, "completions/max_terminated_length": 846.0, "completions/mean_length": 408.25, "completions/mean_terminated_length": 408.25, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 0.1133976874933701, "frac_reward_zero_std": 0.0, "grad_norm": 1.2109375, "kl": 0.022560104727745056, "learning_rate": 7.5728e-06, "loss": -0.0794, "num_tokens": 49671780.0, "reward": 2.5189239978790283, "reward_std": 0.7050349712371826, "rewards/reward_fn/mean": 2.5189239978790283, "rewards/reward_fn/std": 0.7050350308418274, "step": 1069 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 962.0, "completions/max_terminated_length": 962.0, "completions/mean_length": 395.28125, "completions/mean_terminated_length": 395.28125, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "epoch": 0.113503765779145, "frac_reward_zero_std": 0.0, "grad_norm": 1.6171875, "kl": 0.024412208003923297, "learning_rate": 7.5724e-06, "loss": 0.1008, "num_tokens": 49722989.0, "reward": 3.4259486198425293, "reward_std": 0.41865074634552, "rewards/reward_fn/mean": 3.4259486198425293, "rewards/reward_fn/std": 0.41865074634552, "step": 1070 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 358.0, "completions/max_terminated_length": 358.0, "completions/mean_length": 241.15625, "completions/mean_terminated_length": 241.15625, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "epoch": 0.1136098440649199, "frac_reward_zero_std": 0.0, "grad_norm": 2.140625, "kl": 0.019247191143222153, "learning_rate": 7.572e-06, "loss": 0.0774, "num_tokens": 49777074.0, "reward": 3.9278671741485596, "reward_std": 0.4080452620983124, "rewards/reward_fn/mean": 3.9278671741485596, "rewards/reward_fn/std": 0.4080452620983124, "step": 1071 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 804.0, "completions/max_terminated_length": 804.0, "completions/mean_length": 486.34375, "completions/mean_terminated_length": 486.34375, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "epoch": 0.11371592235069482, "frac_reward_zero_std": 0.0, "grad_norm": 1.6953125, "kl": 0.026156109059229493, "learning_rate": 7.5716e-06, "loss": -0.0044, "num_tokens": 49830685.0, "reward": 2.5489959716796875, "reward_std": 0.32066285610198975, "rewards/reward_fn/mean": 2.5489959716796875, "rewards/reward_fn/std": 0.32066285610198975, "step": 1072 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1117.0, "completions/max_terminated_length": 1117.0, "completions/mean_length": 329.53125, "completions/mean_terminated_length": 329.53125, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "epoch": 0.11382200063646972, "frac_reward_zero_std": 0.0, "grad_norm": 1.6875, "kl": 0.023007875541225076, "learning_rate": 7.5712e-06, "loss": 0.0351, "num_tokens": 49877518.0, "reward": 3.3481032848358154, "reward_std": 0.6226815581321716, "rewards/reward_fn/mean": 3.3481032848358154, "rewards/reward_fn/std": 0.6226814985275269, "step": 1073 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 396.0, "completions/max_terminated_length": 396.0, "completions/mean_length": 274.34375, "completions/mean_terminated_length": 274.34375, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 0.11392807892224462, "frac_reward_zero_std": 1.0, "grad_norm": 0.0771484375, "kl": 0.020330962724983692, "learning_rate": 7.5708e-06, "loss": 0.0008, "num_tokens": 49918969.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1074 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 536.0, "completions/max_terminated_length": 536.0, "completions/mean_length": 217.8125, "completions/mean_terminated_length": 217.8125, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 0.11403415720801952, "frac_reward_zero_std": 1.0, "grad_norm": 0.06982421875, "kl": 0.01607495010830462, "learning_rate": 7.5703999999999995e-06, "loss": 0.0006, "num_tokens": 49970675.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1075 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 285.0, "completions/max_terminated_length": 285.0, "completions/mean_length": 83.125, "completions/mean_terminated_length": 83.125, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "epoch": 0.11414023549379441, "frac_reward_zero_std": 1.0, "grad_norm": 0.1328125, "kl": 0.018819114891812205, "learning_rate": 7.5699999999999995e-06, "loss": 0.0008, "num_tokens": 50019447.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1076 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 360.0, "completions/max_terminated_length": 360.0, "completions/mean_length": 213.09375, "completions/mean_terminated_length": 213.09375, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.11424631377956933, "frac_reward_zero_std": 0.0, "grad_norm": 2.03125, "kl": 0.024625253630802035, "learning_rate": 7.5696e-06, "loss": -0.1125, "num_tokens": 50067546.0, "reward": 3.350964307785034, "reward_std": 1.0544812679290771, "rewards/reward_fn/mean": 3.350964307785034, "rewards/reward_fn/std": 1.0544813871383667, "step": 1077 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1743.0, "completions/max_terminated_length": 1743.0, "completions/mean_length": 319.34375, "completions/mean_terminated_length": 319.34375, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 0.11435239206534423, "frac_reward_zero_std": 0.0, "grad_norm": 1.765625, "kl": 0.020671964855864644, "learning_rate": 7.5692e-06, "loss": 0.2282, "num_tokens": 50111141.0, "reward": 3.702298402786255, "reward_std": 0.8400141596794128, "rewards/reward_fn/mean": 3.702298402786255, "rewards/reward_fn/std": 0.8400141596794128, "step": 1078 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 668.0, "completions/max_terminated_length": 668.0, "completions/mean_length": 318.1875, "completions/mean_terminated_length": 318.1875, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "epoch": 0.11445847035111913, "frac_reward_zero_std": 0.0, "grad_norm": 1.515625, "kl": 0.01740014727693051, "learning_rate": 7.5688e-06, "loss": 0.0787, "num_tokens": 50157419.0, "reward": 3.0187315940856934, "reward_std": 0.6965243816375732, "rewards/reward_fn/mean": 3.0187315940856934, "rewards/reward_fn/std": 0.6965243816375732, "step": 1079 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 791.0, "completions/max_terminated_length": 791.0, "completions/mean_length": 284.3125, "completions/mean_terminated_length": 284.3125, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.11456454863689403, "frac_reward_zero_std": 0.0, "grad_norm": 1.859375, "kl": 0.024546197149902582, "learning_rate": 7.568399999999999e-06, "loss": -0.0078, "num_tokens": 50187157.0, "reward": 3.7409169673919678, "reward_std": 0.4564998149871826, "rewards/reward_fn/mean": 3.7409169673919678, "rewards/reward_fn/std": 0.45649975538253784, "step": 1080 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 717.0, "completions/max_terminated_length": 717.0, "completions/mean_length": 212.3125, "completions/mean_terminated_length": 212.3125, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.11467062692266893, "frac_reward_zero_std": 0.0, "grad_norm": 2.359375, "kl": 0.02775608957745135, "learning_rate": 7.567999999999999e-06, "loss": 0.0725, "num_tokens": 50225087.0, "reward": 3.6997857093811035, "reward_std": 0.4594448208808899, "rewards/reward_fn/mean": 3.6997857093811035, "rewards/reward_fn/std": 0.4594447910785675, "step": 1081 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1101.0, "completions/max_terminated_length": 1101.0, "completions/mean_length": 295.6875, "completions/mean_terminated_length": 295.6875, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 0.11477670520844384, "frac_reward_zero_std": 0.0, "grad_norm": 2.171875, "kl": 0.026229602517560124, "learning_rate": 7.567599999999999e-06, "loss": 0.0526, "num_tokens": 50275957.0, "reward": 3.969325542449951, "reward_std": 0.17352108657360077, "rewards/reward_fn/mean": 3.969325542449951, "rewards/reward_fn/std": 0.17352110147476196, "step": 1082 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 362.0, "completions/max_terminated_length": 362.0, "completions/mean_length": 244.90625, "completions/mean_terminated_length": 244.90625, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "epoch": 0.11488278349421874, "frac_reward_zero_std": 0.0, "grad_norm": 1.7265625, "kl": 0.020248351618647575, "learning_rate": 7.567199999999999e-06, "loss": 0.0566, "num_tokens": 50320498.0, "reward": 2.9547595977783203, "reward_std": 0.5952463746070862, "rewards/reward_fn/mean": 2.9547595977783203, "rewards/reward_fn/std": 0.5952463746070862, "step": 1083 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 344.0, "completions/max_terminated_length": 344.0, "completions/mean_length": 227.5, "completions/mean_terminated_length": 227.5, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 0.11498886177999364, "frac_reward_zero_std": 0.0, "grad_norm": 1.8984375, "kl": 0.022126831114292145, "learning_rate": 7.566799999999999e-06, "loss": 0.0513, "num_tokens": 50361634.0, "reward": 3.7756552696228027, "reward_std": 0.6386120915412903, "rewards/reward_fn/mean": 3.7756552696228027, "rewards/reward_fn/std": 0.6386121511459351, "step": 1084 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 222.34375, "completions/mean_terminated_length": 222.34375, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "epoch": 0.11509494006576854, "frac_reward_zero_std": 1.0, "grad_norm": 0.080078125, "kl": 0.019203853677026927, "learning_rate": 7.566399999999999e-06, "loss": 0.0008, "num_tokens": 50396141.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1085 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 99.0, "completions/max_terminated_length": 99.0, "completions/mean_length": 71.0, "completions/mean_terminated_length": 71.0, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "epoch": 0.11520101835154344, "frac_reward_zero_std": 0.0, "grad_norm": 3.140625, "kl": 0.023645572364330292, "learning_rate": 7.565999999999999e-06, "loss": 0.005, "num_tokens": 50420653.0, "reward": 3.9301629066467285, "reward_std": 0.3950580954551697, "rewards/reward_fn/mean": 3.9301629066467285, "rewards/reward_fn/std": 0.3950580954551697, "step": 1086 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 125.0, "completions/max_terminated_length": 125.0, "completions/mean_length": 92.21875, "completions/mean_terminated_length": 92.21875, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "epoch": 0.11530709663731833, "frac_reward_zero_std": 1.0, "grad_norm": 0.2177734375, "kl": 0.021084198029711843, "learning_rate": 7.565599999999999e-06, "loss": 0.0008, "num_tokens": 50465332.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1087 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1061.0, "completions/max_terminated_length": 1061.0, "completions/mean_length": 313.78125, "completions/mean_terminated_length": 313.78125, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "epoch": 0.11541317492309325, "frac_reward_zero_std": 1.0, "grad_norm": 0.0849609375, "kl": 0.02206204435788095, "learning_rate": 7.565199999999999e-06, "loss": 0.0009, "num_tokens": 50512781.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1088 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 551.0, "completions/max_terminated_length": 551.0, "completions/mean_length": 179.5, "completions/mean_terminated_length": 179.5, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 0.11551925320886815, "frac_reward_zero_std": 1.0, "grad_norm": 0.083984375, "kl": 0.01822281803470105, "learning_rate": 7.5648e-06, "loss": 0.0007, "num_tokens": 50551677.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1089 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 120.0, "completions/max_terminated_length": 120.0, "completions/mean_length": 83.59375, "completions/mean_terminated_length": 83.59375, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "epoch": 0.11562533149464305, "frac_reward_zero_std": 0.0, "grad_norm": 2.8125, "kl": 0.027714208932593465, "learning_rate": 7.5644e-06, "loss": -0.0259, "num_tokens": 50628176.0, "reward": 3.9305734634399414, "reward_std": 0.39273524284362793, "rewards/reward_fn/mean": 3.9305734634399414, "rewards/reward_fn/std": 0.39273524284362793, "step": 1090 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 686.0, "completions/max_terminated_length": 686.0, "completions/mean_length": 315.15625, "completions/mean_terminated_length": 315.15625, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "epoch": 0.11573140978041795, "frac_reward_zero_std": 0.0, "grad_norm": 1.78125, "kl": 0.02233049925416708, "learning_rate": 7.564e-06, "loss": 0.0933, "num_tokens": 50683957.0, "reward": 3.8804166316986084, "reward_std": 0.3777785003185272, "rewards/reward_fn/mean": 3.8804166316986084, "rewards/reward_fn/std": 0.3777785003185272, "step": 1091 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 714.0, "completions/max_terminated_length": 714.0, "completions/mean_length": 371.125, "completions/mean_terminated_length": 371.125, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "epoch": 0.11583748806619285, "frac_reward_zero_std": 0.0, "grad_norm": 1.6015625, "kl": 0.029797785449773073, "learning_rate": 7.5636e-06, "loss": 0.0461, "num_tokens": 50751865.0, "reward": 2.858231544494629, "reward_std": 1.1363381147384644, "rewards/reward_fn/mean": 2.858231544494629, "rewards/reward_fn/std": 1.1363381147384644, "step": 1092 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 184.0, "completions/max_terminated_length": 184.0, "completions/mean_length": 131.09375, "completions/mean_terminated_length": 131.09375, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 0.11594356635196776, "frac_reward_zero_std": 1.0, "grad_norm": 0.10888671875, "kl": 0.021672799484804273, "learning_rate": 7.5632e-06, "loss": 0.0009, "num_tokens": 50799932.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1093 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 853.0, "completions/max_terminated_length": 853.0, "completions/mean_length": 330.0625, "completions/mean_terminated_length": 330.0625, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "epoch": 0.11604964463774266, "frac_reward_zero_std": 1.0, "grad_norm": 0.06591796875, "kl": 0.017305174726061523, "learning_rate": 7.5628e-06, "loss": 0.0007, "num_tokens": 50857982.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1094 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1091.0, "completions/max_terminated_length": 1091.0, "completions/mean_length": 243.03125, "completions/mean_terminated_length": 243.03125, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 0.11615572292351756, "frac_reward_zero_std": 0.0, "grad_norm": 2.0625, "kl": 0.02653159131295979, "learning_rate": 7.5624e-06, "loss": -0.0145, "num_tokens": 50914719.0, "reward": 3.928218126296997, "reward_std": 0.40605998039245605, "rewards/reward_fn/mean": 3.928218126296997, "rewards/reward_fn/std": 0.40606001019477844, "step": 1095 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 278.0, "completions/max_terminated_length": 278.0, "completions/mean_length": 158.625, "completions/mean_terminated_length": 158.625, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.11626180120929246, "frac_reward_zero_std": 1.0, "grad_norm": 0.10595703125, "kl": 0.02485039341263473, "learning_rate": 7.562e-06, "loss": 0.001, "num_tokens": 50947411.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1096 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1744.0, "completions/max_terminated_length": 1744.0, "completions/mean_length": 693.3125, "completions/mean_terminated_length": 693.3125, "completions/min_length": 380.0, "completions/min_terminated_length": 380.0, "epoch": 0.11636787949506736, "frac_reward_zero_std": 0.0, "grad_norm": 1.2109375, "kl": 0.02324887551367283, "learning_rate": 7.5616e-06, "loss": 0.0876, "num_tokens": 51001437.0, "reward": 2.568819522857666, "reward_std": 0.38974249362945557, "rewards/reward_fn/mean": 2.568819522857666, "rewards/reward_fn/std": 0.3897424340248108, "step": 1097 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1142.0, "completions/max_terminated_length": 1142.0, "completions/mean_length": 305.78125, "completions/mean_terminated_length": 305.78125, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.11647395778084225, "frac_reward_zero_std": 0.0, "grad_norm": 1.7734375, "kl": 0.018533728667534888, "learning_rate": 7.5612e-06, "loss": 0.0793, "num_tokens": 51048822.0, "reward": 3.68546724319458, "reward_std": 0.5669353008270264, "rewards/reward_fn/mean": 3.68546724319458, "rewards/reward_fn/std": 0.5669353008270264, "step": 1098 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 430.0, "completions/max_terminated_length": 430.0, "completions/mean_length": 78.71875, "completions/mean_terminated_length": 78.71875, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 0.11658003606661717, "frac_reward_zero_std": 1.0, "grad_norm": 0.1455078125, "kl": 0.02107345312833786, "learning_rate": 7.5608e-06, "loss": 0.0008, "num_tokens": 51085869.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1099 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 404.0, "completions/max_terminated_length": 404.0, "completions/mean_length": 238.3125, "completions/mean_terminated_length": 238.3125, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.11668611435239207, "frac_reward_zero_std": 0.0, "grad_norm": 2.0, "kl": 0.021306362934410572, "learning_rate": 7.5604e-06, "loss": 0.0548, "num_tokens": 51137207.0, "reward": 2.826180934906006, "reward_std": 0.03045596368610859, "rewards/reward_fn/mean": 2.826180934906006, "rewards/reward_fn/std": 0.03045591339468956, "step": 1100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 412.0, "completions/max_terminated_length": 412.0, "completions/mean_length": 244.25, "completions/mean_terminated_length": 244.25, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 0.11679219263816697, "frac_reward_zero_std": 0.0, "grad_norm": 1.703125, "kl": 0.022923755925148726, "learning_rate": 7.56e-06, "loss": 0.0168, "num_tokens": 51193855.0, "reward": 3.835073947906494, "reward_std": 0.48954635858535767, "rewards/reward_fn/mean": 3.835073947906494, "rewards/reward_fn/std": 0.48954638838768005, "step": 1101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 276.0, "completions/max_terminated_length": 276.0, "completions/mean_length": 208.34375, "completions/mean_terminated_length": 208.34375, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 0.11689827092394187, "frac_reward_zero_std": 1.0, "grad_norm": 0.08154296875, "kl": 0.016427406924776733, "learning_rate": 7.5596e-06, "loss": 0.0007, "num_tokens": 51233450.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 522.0, "completions/max_terminated_length": 522.0, "completions/mean_length": 275.15625, "completions/mean_terminated_length": 275.15625, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 0.11700434920971677, "frac_reward_zero_std": 0.0, "grad_norm": 1.5390625, "kl": 0.02642908156849444, "learning_rate": 7.5591999999999996e-06, "loss": -0.0509, "num_tokens": 51272687.0, "reward": 2.866469144821167, "reward_std": 0.3955315351486206, "rewards/reward_fn/mean": 2.866469144821167, "rewards/reward_fn/std": 0.395531564950943, "step": 1103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1498.0, "completions/mean_length": 786.0, "completions/mean_terminated_length": 745.290283203125, "completions/min_length": 339.0, "completions/min_terminated_length": 339.0, "epoch": 0.11711042749549168, "frac_reward_zero_std": 0.0, "grad_norm": 1.40625, "kl": 0.02294124150648713, "learning_rate": 7.5587999999999995e-06, "loss": 0.1011, "num_tokens": 51341903.0, "reward": 2.7701563835144043, "reward_std": 0.355268269777298, "rewards/reward_fn/mean": 2.7701563835144043, "rewards/reward_fn/std": 0.3552682399749756, "step": 1104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 377.0, "completions/max_terminated_length": 377.0, "completions/mean_length": 213.4375, "completions/mean_terminated_length": 213.4375, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 0.11721650578126658, "frac_reward_zero_std": 1.0, "grad_norm": 0.06787109375, "kl": 0.017608007532544434, "learning_rate": 7.5583999999999995e-06, "loss": 0.0007, "num_tokens": 51382781.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 600.0, "completions/max_terminated_length": 600.0, "completions/mean_length": 365.03125, "completions/mean_terminated_length": 365.03125, "completions/min_length": 253.0, "completions/min_terminated_length": 253.0, "epoch": 0.11732258406704148, "frac_reward_zero_std": 0.0, "grad_norm": 1.9375, "kl": 0.021670927526429296, "learning_rate": 7.5579999999999995e-06, "loss": 0.0421, "num_tokens": 51425758.0, "reward": 2.7978286743164062, "reward_std": 0.512295663356781, "rewards/reward_fn/mean": 2.7978286743164062, "rewards/reward_fn/std": 0.512295663356781, "step": 1106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1947.0, "completions/max_terminated_length": 1947.0, "completions/mean_length": 535.5625, "completions/mean_terminated_length": 535.5625, "completions/min_length": 288.0, "completions/min_terminated_length": 288.0, "epoch": 0.11742866235281638, "frac_reward_zero_std": 0.0, "grad_norm": 1.28125, "kl": 0.01689821679610759, "learning_rate": 7.5575999999999994e-06, "loss": -0.0038, "num_tokens": 51493776.0, "reward": 2.7099485397338867, "reward_std": 0.6777162551879883, "rewards/reward_fn/mean": 2.7099485397338867, "rewards/reward_fn/std": 0.6777163147926331, "step": 1107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 946.0, "completions/max_terminated_length": 946.0, "completions/mean_length": 197.0, "completions/mean_terminated_length": 197.0, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "epoch": 0.11753474063859128, "frac_reward_zero_std": 0.0, "grad_norm": 2.546875, "kl": 0.02550308988429606, "learning_rate": 7.557199999999999e-06, "loss": 0.1913, "num_tokens": 51556496.0, "reward": 3.723670482635498, "reward_std": 0.4863794147968292, "rewards/reward_fn/mean": 3.723670482635498, "rewards/reward_fn/std": 0.4863794147968292, "step": 1108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1831.0, "completions/max_terminated_length": 1831.0, "completions/mean_length": 429.0625, "completions/mean_terminated_length": 429.0625, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "epoch": 0.11764081892436619, "frac_reward_zero_std": 0.0, "grad_norm": 1.4453125, "kl": 0.01923588034696877, "learning_rate": 7.556799999999999e-06, "loss": 0.0173, "num_tokens": 51599986.0, "reward": 2.7359695434570312, "reward_std": 0.044656820595264435, "rewards/reward_fn/mean": 2.7359695434570312, "rewards/reward_fn/std": 0.044656842947006226, "step": 1109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 545.0, "completions/max_terminated_length": 545.0, "completions/mean_length": 275.46875, "completions/mean_terminated_length": 275.46875, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "epoch": 0.11774689721014109, "frac_reward_zero_std": 0.0, "grad_norm": 1.9609375, "kl": 0.018184081302024424, "learning_rate": 7.556399999999999e-06, "loss": 0.0582, "num_tokens": 51651681.0, "reward": 3.522010564804077, "reward_std": 0.5560933351516724, "rewards/reward_fn/mean": 3.522010564804077, "rewards/reward_fn/std": 0.5560933351516724, "step": 1110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 959.0, "completions/max_terminated_length": 959.0, "completions/mean_length": 256.96875, "completions/mean_terminated_length": 256.96875, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 0.11785297549591599, "frac_reward_zero_std": 0.0, "grad_norm": 2.140625, "kl": 0.02644204581156373, "learning_rate": 7.555999999999999e-06, "loss": 0.0319, "num_tokens": 51691008.0, "reward": 3.9324989318847656, "reward_std": 0.26562514901161194, "rewards/reward_fn/mean": 3.9324989318847656, "rewards/reward_fn/std": 0.26562511920928955, "step": 1111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 606.0, "completions/max_terminated_length": 606.0, "completions/mean_length": 358.5, "completions/mean_terminated_length": 358.5, "completions/min_length": 246.0, "completions/min_terminated_length": 246.0, "epoch": 0.11795905378169089, "frac_reward_zero_std": 0.0, "grad_norm": 1.4765625, "kl": 0.020751751959323883, "learning_rate": 7.5556e-06, "loss": -0.0171, "num_tokens": 51747120.0, "reward": 2.6532459259033203, "reward_std": 0.33404526114463806, "rewards/reward_fn/mean": 2.6532459259033203, "rewards/reward_fn/std": 0.33404526114463806, "step": 1112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1739.0, "completions/max_terminated_length": 1739.0, "completions/mean_length": 447.53125, "completions/mean_terminated_length": 447.53125, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.11806513206746579, "frac_reward_zero_std": 0.0, "grad_norm": 1.6015625, "kl": 0.025134951109066606, "learning_rate": 7.5552e-06, "loss": 0.1821, "num_tokens": 51800769.0, "reward": 3.224299192428589, "reward_std": 0.5572682619094849, "rewards/reward_fn/mean": 3.224299192428589, "rewards/reward_fn/std": 0.5572682619094849, "step": 1113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1545.0, "completions/max_terminated_length": 1545.0, "completions/mean_length": 395.875, "completions/mean_terminated_length": 395.875, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 0.11817121035324069, "frac_reward_zero_std": 0.0, "grad_norm": 1.6484375, "kl": 0.02163501945324242, "learning_rate": 7.5548e-06, "loss": 0.0656, "num_tokens": 51843965.0, "reward": 3.863697052001953, "reward_std": 0.36755073070526123, "rewards/reward_fn/mean": 3.863697052001953, "rewards/reward_fn/std": 0.36755073070526123, "step": 1114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 339.0, "completions/max_terminated_length": 339.0, "completions/mean_length": 136.5625, "completions/mean_terminated_length": 136.5625, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "epoch": 0.1182772886390156, "frac_reward_zero_std": 1.0, "grad_norm": 0.123046875, "kl": 0.030038248049095273, "learning_rate": 7.5544e-06, "loss": 0.0012, "num_tokens": 51894063.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1081.0, "completions/max_terminated_length": 1081.0, "completions/mean_length": 412.90625, "completions/mean_terminated_length": 412.90625, "completions/min_length": 231.0, "completions/min_terminated_length": 231.0, "epoch": 0.1183833669247905, "frac_reward_zero_std": 0.0, "grad_norm": 1.0546875, "kl": 0.018813129048794508, "learning_rate": 7.554e-06, "loss": -0.0249, "num_tokens": 51940748.0, "reward": 3.8100392818450928, "reward_std": 0.5366294384002686, "rewards/reward_fn/mean": 3.8100392818450928, "rewards/reward_fn/std": 0.5366293787956238, "step": 1116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 328.0, "completions/mean_length": 297.09375, "completions/mean_terminated_length": 240.61289978027344, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 0.1184894452105654, "frac_reward_zero_std": 0.0, "grad_norm": 2.046875, "kl": 0.01792726945132017, "learning_rate": 7.5536e-06, "loss": 0.281, "num_tokens": 51983695.0, "reward": 3.659058094024658, "reward_std": 0.8081573843955994, "rewards/reward_fn/mean": 3.659058094024658, "rewards/reward_fn/std": 0.8081573843955994, "step": 1117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 372.0, "completions/max_terminated_length": 372.0, "completions/mean_length": 244.9375, "completions/mean_terminated_length": 244.9375, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.1185955234963403, "frac_reward_zero_std": 0.0, "grad_norm": 1.859375, "kl": 0.02788339671678841, "learning_rate": 7.5532e-06, "loss": 0.0633, "num_tokens": 52025869.0, "reward": 2.9600257873535156, "reward_std": 0.5432107448577881, "rewards/reward_fn/mean": 2.9600257873535156, "rewards/reward_fn/std": 0.5432106852531433, "step": 1118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 184.0, "completions/max_terminated_length": 184.0, "completions/mean_length": 109.15625, "completions/mean_terminated_length": 109.15625, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "epoch": 0.1187016017821152, "frac_reward_zero_std": 0.0, "grad_norm": 2.9375, "kl": 0.03897233330644667, "learning_rate": 7.5528e-06, "loss": 0.043, "num_tokens": 52061970.0, "reward": 3.0810916423797607, "reward_std": 0.24325308203697205, "rewards/reward_fn/mean": 3.0810916423797607, "rewards/reward_fn/std": 0.24325308203697205, "step": 1119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 760.0, "completions/max_terminated_length": 760.0, "completions/mean_length": 374.53125, "completions/mean_terminated_length": 374.53125, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "epoch": 0.11880768006789011, "frac_reward_zero_std": 0.0, "grad_norm": 1.6015625, "kl": 0.017800742178224027, "learning_rate": 7.552399999999999e-06, "loss": 0.0929, "num_tokens": 52104931.0, "reward": 3.1744065284729004, "reward_std": 0.5665863156318665, "rewards/reward_fn/mean": 3.1744065284729004, "rewards/reward_fn/std": 0.5665862560272217, "step": 1120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 168.0, "completions/max_terminated_length": 168.0, "completions/mean_length": 117.1875, "completions/mean_terminated_length": 117.1875, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "epoch": 0.11891375835366501, "frac_reward_zero_std": 0.0, "grad_norm": 2.640625, "kl": 0.031384273897856474, "learning_rate": 7.551999999999999e-06, "loss": 0.0092, "num_tokens": 52142857.0, "reward": 3.7206034660339355, "reward_std": 0.6296123266220093, "rewards/reward_fn/mean": 3.7206034660339355, "rewards/reward_fn/std": 0.6296123266220093, "step": 1121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1076.0, "completions/max_terminated_length": 1076.0, "completions/mean_length": 336.40625, "completions/mean_terminated_length": 336.40625, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.11901983663943991, "frac_reward_zero_std": 0.0, "grad_norm": 1.4453125, "kl": 0.020996463019400835, "learning_rate": 7.551599999999999e-06, "loss": -0.1134, "num_tokens": 52188662.0, "reward": 3.5195388793945312, "reward_std": 0.6306620836257935, "rewards/reward_fn/mean": 3.5195388793945312, "rewards/reward_fn/std": 0.6306621432304382, "step": 1122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 573.0, "completions/max_terminated_length": 573.0, "completions/mean_length": 321.84375, "completions/mean_terminated_length": 321.84375, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.11912591492521481, "frac_reward_zero_std": 0.0, "grad_norm": 1.6328125, "kl": 0.01977117615751922, "learning_rate": 7.551199999999999e-06, "loss": 0.0567, "num_tokens": 52233617.0, "reward": 3.0178513526916504, "reward_std": 0.6891320943832397, "rewards/reward_fn/mean": 3.0178513526916504, "rewards/reward_fn/std": 0.689132034778595, "step": 1123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1633.0, "completions/max_terminated_length": 1633.0, "completions/mean_length": 547.84375, "completions/mean_terminated_length": 547.84375, "completions/min_length": 242.0, "completions/min_terminated_length": 242.0, "epoch": 0.1192319932109897, "frac_reward_zero_std": 0.0, "grad_norm": 1.390625, "kl": 0.020315645029768348, "learning_rate": 7.5508e-06, "loss": -0.0442, "num_tokens": 52295468.0, "reward": 2.052112340927124, "reward_std": 0.5385510921478271, "rewards/reward_fn/mean": 2.052112340927124, "rewards/reward_fn/std": 0.5385510921478271, "step": 1124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1653.0, "completions/max_terminated_length": 1653.0, "completions/mean_length": 408.5625, "completions/mean_terminated_length": 408.5625, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 0.1193380714967646, "frac_reward_zero_std": 0.0, "grad_norm": 1.6953125, "kl": 0.0202141257468611, "learning_rate": 7.5504e-06, "loss": 0.0394, "num_tokens": 52348126.0, "reward": 2.4590096473693848, "reward_std": 0.5020030736923218, "rewards/reward_fn/mean": 2.4590096473693848, "rewards/reward_fn/std": 0.5020030736923218, "step": 1125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 811.0, "completions/max_terminated_length": 811.0, "completions/mean_length": 145.6875, "completions/mean_terminated_length": 145.6875, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "epoch": 0.11944414978253952, "frac_reward_zero_std": 1.0, "grad_norm": 0.140625, "kl": 0.03049472742713988, "learning_rate": 7.55e-06, "loss": 0.0012, "num_tokens": 52397780.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 555.0, "completions/max_terminated_length": 555.0, "completions/mean_length": 250.96875, "completions/mean_terminated_length": 250.96875, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "epoch": 0.11955022806831442, "frac_reward_zero_std": 0.0, "grad_norm": 1.8203125, "kl": 0.037999976659193635, "learning_rate": 7.5496e-06, "loss": -0.0338, "num_tokens": 52438483.0, "reward": 2.8391571044921875, "reward_std": 0.20920641720294952, "rewards/reward_fn/mean": 2.8391571044921875, "rewards/reward_fn/std": 0.20920643210411072, "step": 1127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 412.0, "completions/max_terminated_length": 412.0, "completions/mean_length": 242.125, "completions/mean_terminated_length": 242.125, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 0.11965630635408932, "frac_reward_zero_std": 0.0, "grad_norm": 1.796875, "kl": 0.02898483257740736, "learning_rate": 7.5492e-06, "loss": 0.0341, "num_tokens": 52476727.0, "reward": 3.4120028018951416, "reward_std": 0.562468409538269, "rewards/reward_fn/mean": 3.4120028018951416, "rewards/reward_fn/std": 0.5624683499336243, "step": 1128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1930.0, "completions/mean_length": 1293.34375, "completions/mean_terminated_length": 1153.5926513671875, "completions/min_length": 746.0, "completions/min_terminated_length": 746.0, "epoch": 0.11976238463986422, "frac_reward_zero_std": 0.0, "grad_norm": 0.98828125, "kl": 0.018078222521580756, "learning_rate": 7.5488e-06, "loss": 0.1773, "num_tokens": 52560418.0, "reward": 1.8458271026611328, "reward_std": 0.9291160106658936, "rewards/reward_fn/mean": 1.8458271026611328, "rewards/reward_fn/std": 0.9291160702705383, "step": 1129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 510.0, "completions/max_terminated_length": 510.0, "completions/mean_length": 196.375, "completions/mean_terminated_length": 196.375, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 0.11986846292563912, "frac_reward_zero_std": 1.0, "grad_norm": 0.087890625, "kl": 0.021710637491196394, "learning_rate": 7.5484e-06, "loss": 0.0009, "num_tokens": 52611342.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 169.0, "completions/max_terminated_length": 169.0, "completions/mean_length": 125.28125, "completions/mean_terminated_length": 125.28125, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "epoch": 0.11997454121141403, "frac_reward_zero_std": 1.0, "grad_norm": 0.0771484375, "kl": 0.012505219259765, "learning_rate": 7.5479999999999996e-06, "loss": 0.0005, "num_tokens": 52658647.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 262.0, "completions/max_terminated_length": 262.0, "completions/mean_length": 140.71875, "completions/mean_terminated_length": 140.71875, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.12008061949718893, "frac_reward_zero_std": 1.0, "grad_norm": 0.080078125, "kl": 0.018215686664916575, "learning_rate": 7.5475999999999995e-06, "loss": 0.0007, "num_tokens": 52714286.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 764.0, "completions/max_terminated_length": 764.0, "completions/mean_length": 187.4375, "completions/mean_terminated_length": 187.4375, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 0.12018669778296383, "frac_reward_zero_std": 1.0, "grad_norm": 0.1103515625, "kl": 0.02393046487122774, "learning_rate": 7.5471999999999995e-06, "loss": 0.001, "num_tokens": 52756636.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1577.0, "completions/max_terminated_length": 1577.0, "completions/mean_length": 313.84375, "completions/mean_terminated_length": 313.84375, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 0.12029277606873873, "frac_reward_zero_std": 0.0, "grad_norm": 1.90625, "kl": 0.026081232354044914, "learning_rate": 7.5467999999999995e-06, "loss": 0.0927, "num_tokens": 52795991.0, "reward": 3.3418869972229004, "reward_std": 0.6929866671562195, "rewards/reward_fn/mean": 3.3418869972229004, "rewards/reward_fn/std": 0.6929866671562195, "step": 1134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 557.0, "completions/max_terminated_length": 557.0, "completions/mean_length": 190.78125, "completions/mean_terminated_length": 190.78125, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 0.12039885435451363, "frac_reward_zero_std": 1.0, "grad_norm": 0.091796875, "kl": 0.021782017778605223, "learning_rate": 7.5464e-06, "loss": 0.0009, "num_tokens": 52848784.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 355.0, "completions/max_terminated_length": 355.0, "completions/mean_length": 210.65625, "completions/mean_terminated_length": 210.65625, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 0.12050493264028854, "frac_reward_zero_std": 0.0, "grad_norm": 2.359375, "kl": 0.022625808138400316, "learning_rate": 7.546e-06, "loss": 0.0627, "num_tokens": 52891269.0, "reward": 3.7008187770843506, "reward_std": 0.7357956767082214, "rewards/reward_fn/mean": 3.7008187770843506, "rewards/reward_fn/std": 0.7357956767082214, "step": 1136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 362.0, "completions/max_terminated_length": 362.0, "completions/mean_length": 94.84375, "completions/mean_terminated_length": 94.84375, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.12061101092606344, "frac_reward_zero_std": 1.0, "grad_norm": 0.11181640625, "kl": 0.024228210328146815, "learning_rate": 7.5456e-06, "loss": 0.001, "num_tokens": 52945920.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 168.0, "completions/max_terminated_length": 168.0, "completions/mean_length": 116.78125, "completions/mean_terminated_length": 116.78125, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "epoch": 0.12071708921183834, "frac_reward_zero_std": 1.0, "grad_norm": 0.1455078125, "kl": 0.027215805603191257, "learning_rate": 7.5452e-06, "loss": 0.0011, "num_tokens": 52977177.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1196.0, "completions/max_terminated_length": 1196.0, "completions/mean_length": 333.34375, "completions/mean_terminated_length": 333.34375, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "epoch": 0.12082316749761324, "frac_reward_zero_std": 0.0, "grad_norm": 1.8359375, "kl": 0.021321170032024384, "learning_rate": 7.5448e-06, "loss": 0.0052, "num_tokens": 53023300.0, "reward": 3.9064579010009766, "reward_std": 0.2969244122505188, "rewards/reward_fn/mean": 3.9064579010009766, "rewards/reward_fn/std": 0.2969244122505188, "step": 1139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 109.0, "completions/max_terminated_length": 109.0, "completions/mean_length": 79.90625, "completions/mean_terminated_length": 79.90625, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "epoch": 0.12092924578338814, "frac_reward_zero_std": 0.0, "grad_norm": 3.953125, "kl": 0.018354161293245852, "learning_rate": 7.5444e-06, "loss": -0.0223, "num_tokens": 53060129.0, "reward": 3.9402408599853516, "reward_std": 0.23523320257663727, "rewards/reward_fn/mean": 3.9402408599853516, "rewards/reward_fn/std": 0.23523321747779846, "step": 1140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 899.0, "completions/max_terminated_length": 899.0, "completions/mean_length": 429.6875, "completions/mean_terminated_length": 429.6875, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "epoch": 0.12103532406916304, "frac_reward_zero_std": 0.0, "grad_norm": 1.5, "kl": 0.02179452800191939, "learning_rate": 7.543999999999999e-06, "loss": 0.01, "num_tokens": 53106231.0, "reward": 2.8876233100891113, "reward_std": 0.218718022108078, "rewards/reward_fn/mean": 2.8876233100891113, "rewards/reward_fn/std": 0.21871797740459442, "step": 1141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1471.0, "completions/max_terminated_length": 1471.0, "completions/mean_length": 166.875, "completions/mean_terminated_length": 166.875, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "epoch": 0.12114140235493795, "frac_reward_zero_std": 1.0, "grad_norm": 0.142578125, "kl": 0.03363496996462345, "learning_rate": 7.543599999999999e-06, "loss": 0.0013, "num_tokens": 53162131.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 568.0, "completions/max_terminated_length": 568.0, "completions/mean_length": 206.1875, "completions/mean_terminated_length": 206.1875, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 0.12124748064071285, "frac_reward_zero_std": 0.0, "grad_norm": 1.8828125, "kl": 0.017744799610227346, "learning_rate": 7.543199999999999e-06, "loss": 0.0345, "num_tokens": 53198905.0, "reward": 2.7245864868164062, "reward_std": 0.032482411712408066, "rewards/reward_fn/mean": 2.7245864868164062, "rewards/reward_fn/std": 0.032482437789440155, "step": 1143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 142.875, "completions/mean_terminated_length": 142.875, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.12135355892648775, "frac_reward_zero_std": 1.0, "grad_norm": 0.30859375, "kl": 0.036823015194386244, "learning_rate": 7.542799999999999e-06, "loss": 0.0015, "num_tokens": 53243669.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 714.0, "completions/max_terminated_length": 714.0, "completions/mean_length": 206.75, "completions/mean_terminated_length": 206.75, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 0.12145963721226265, "frac_reward_zero_std": 0.0, "grad_norm": 2.125, "kl": 0.023123034741729498, "learning_rate": 7.542399999999999e-06, "loss": -0.0627, "num_tokens": 53288429.0, "reward": 2.7684268951416016, "reward_std": 0.05548159033060074, "rewards/reward_fn/mean": 2.7684268951416016, "rewards/reward_fn/std": 0.055481597781181335, "step": 1145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 957.0, "completions/max_terminated_length": 957.0, "completions/mean_length": 479.15625, "completions/mean_terminated_length": 479.15625, "completions/min_length": 285.0, "completions/min_terminated_length": 285.0, "epoch": 0.12156571549803755, "frac_reward_zero_std": 0.0, "grad_norm": 1.2109375, "kl": 0.017751626786775887, "learning_rate": 7.541999999999999e-06, "loss": -0.0229, "num_tokens": 53358578.0, "reward": 2.996565580368042, "reward_std": 0.6444182991981506, "rewards/reward_fn/mean": 2.996565580368042, "rewards/reward_fn/std": 0.6444182395935059, "step": 1146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 343.0, "completions/max_terminated_length": 343.0, "completions/mean_length": 200.71875, "completions/mean_terminated_length": 200.71875, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.12167179378381246, "frac_reward_zero_std": 0.0, "grad_norm": 2.140625, "kl": 0.029425512766465545, "learning_rate": 7.5416e-06, "loss": 0.1157, "num_tokens": 53400937.0, "reward": 3.5051753520965576, "reward_std": 0.6135579943656921, "rewards/reward_fn/mean": 3.5051753520965576, "rewards/reward_fn/std": 0.6135579347610474, "step": 1147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 555.0, "completions/max_terminated_length": 555.0, "completions/mean_length": 305.125, "completions/mean_terminated_length": 305.125, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 0.12177787206958736, "frac_reward_zero_std": 0.0, "grad_norm": 2.0625, "kl": 0.020469692768529058, "learning_rate": 7.5412e-06, "loss": 0.0535, "num_tokens": 53450349.0, "reward": 2.800945997238159, "reward_std": 0.287587434053421, "rewards/reward_fn/mean": 2.800945997238159, "rewards/reward_fn/std": 0.28758740425109863, "step": 1148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 302.0, "completions/max_terminated_length": 302.0, "completions/mean_length": 209.21875, "completions/mean_terminated_length": 209.21875, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 0.12188395035536226, "frac_reward_zero_std": 0.0, "grad_norm": 1.9921875, "kl": 0.022445352748036385, "learning_rate": 7.5408e-06, "loss": -0.0242, "num_tokens": 53515540.0, "reward": 3.882132053375244, "reward_std": 0.372415155172348, "rewards/reward_fn/mean": 3.882132053375244, "rewards/reward_fn/std": 0.372415155172348, "step": 1149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 926.0, "completions/max_terminated_length": 926.0, "completions/mean_length": 227.34375, "completions/mean_terminated_length": 227.34375, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.12199002864113716, "frac_reward_zero_std": 0.0, "grad_norm": 1.9609375, "kl": 0.020551395835354924, "learning_rate": 7.5404e-06, "loss": 0.0163, "num_tokens": 53560575.0, "reward": 2.9072699546813965, "reward_std": 0.2896516025066376, "rewards/reward_fn/mean": 2.9072699546813965, "rewards/reward_fn/std": 0.2896515727043152, "step": 1150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 657.0, "completions/max_terminated_length": 657.0, "completions/mean_length": 262.375, "completions/mean_terminated_length": 262.375, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 0.12209610692691206, "frac_reward_zero_std": 0.0, "grad_norm": 1.921875, "kl": 0.024549536872655153, "learning_rate": 7.54e-06, "loss": 0.0296, "num_tokens": 53605867.0, "reward": 2.8168137073516846, "reward_std": 0.31276002526283264, "rewards/reward_fn/mean": 2.8168137073516846, "rewards/reward_fn/std": 0.31275999546051025, "step": 1151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1899.0, "completions/max_terminated_length": 1899.0, "completions/mean_length": 856.65625, "completions/mean_terminated_length": 856.65625, "completions/min_length": 469.0, "completions/min_terminated_length": 469.0, "epoch": 0.12220218521268696, "frac_reward_zero_std": 0.0, "grad_norm": 0.98046875, "kl": 0.01613958622328937, "learning_rate": 7.5396e-06, "loss": 0.1343, "num_tokens": 53658240.0, "reward": 2.469219207763672, "reward_std": 0.45067059993743896, "rewards/reward_fn/mean": 2.469219207763672, "rewards/reward_fn/std": 0.4506705403327942, "step": 1152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 519.0, "completions/max_terminated_length": 519.0, "completions/mean_length": 313.40625, "completions/mean_terminated_length": 313.40625, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "epoch": 0.12230826349846187, "frac_reward_zero_std": 0.0, "grad_norm": 1.796875, "kl": 0.026833078358322382, "learning_rate": 7.5392e-06, "loss": 0.0304, "num_tokens": 53720845.0, "reward": 2.7918012142181396, "reward_std": 0.5359998941421509, "rewards/reward_fn/mean": 2.7918012142181396, "rewards/reward_fn/std": 0.5359998941421509, "step": 1153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 577.0, "completions/max_terminated_length": 577.0, "completions/mean_length": 152.15625, "completions/mean_terminated_length": 152.15625, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "epoch": 0.12241434178423677, "frac_reward_zero_std": 0.0, "grad_norm": 2.421875, "kl": 0.0193988133687526, "learning_rate": 7.5388e-06, "loss": 0.0837, "num_tokens": 53770290.0, "reward": 2.6622109413146973, "reward_std": 0.07657734304666519, "rewards/reward_fn/mean": 2.6622109413146973, "rewards/reward_fn/std": 0.0765773355960846, "step": 1154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 910.0, "completions/max_terminated_length": 910.0, "completions/mean_length": 345.46875, "completions/mean_terminated_length": 345.46875, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.12252042007001167, "frac_reward_zero_std": 0.0, "grad_norm": 1.4765625, "kl": 0.01845894439611584, "learning_rate": 7.5384e-06, "loss": 0.0443, "num_tokens": 53810017.0, "reward": 2.8654346466064453, "reward_std": 0.06292664259672165, "rewards/reward_fn/mean": 2.8654346466064453, "rewards/reward_fn/std": 0.06292665004730225, "step": 1155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 456.0, "completions/max_terminated_length": 456.0, "completions/mean_length": 109.0625, "completions/mean_terminated_length": 109.0625, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "epoch": 0.12262649835578657, "frac_reward_zero_std": 1.0, "grad_norm": 0.1357421875, "kl": 0.02120021218433976, "learning_rate": 7.538e-06, "loss": 0.0008, "num_tokens": 53857475.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 271.0, "completions/max_terminated_length": 271.0, "completions/mean_length": 180.1875, "completions/mean_terminated_length": 180.1875, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 0.12273257664156147, "frac_reward_zero_std": 0.0, "grad_norm": 2.046875, "kl": 0.021761987125501037, "learning_rate": 7.5376e-06, "loss": 0.0012, "num_tokens": 53895881.0, "reward": 3.3576347827911377, "reward_std": 0.5774080157279968, "rewards/reward_fn/mean": 3.3576347827911377, "rewards/reward_fn/std": 0.5774080157279968, "step": 1157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1430.0, "completions/max_terminated_length": 1430.0, "completions/mean_length": 371.28125, "completions/mean_terminated_length": 371.28125, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "epoch": 0.12283865492733638, "frac_reward_zero_std": 0.0, "grad_norm": 1.875, "kl": 0.02653820696286857, "learning_rate": 7.5372e-06, "loss": 0.0207, "num_tokens": 53918706.0, "reward": 3.2473506927490234, "reward_std": 0.6305427551269531, "rewards/reward_fn/mean": 3.2473506927490234, "rewards/reward_fn/std": 0.6305428147315979, "step": 1158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 207.0, "completions/max_terminated_length": 207.0, "completions/mean_length": 132.34375, "completions/mean_terminated_length": 132.34375, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.12294473321311128, "frac_reward_zero_std": 0.0, "grad_norm": 2.515625, "kl": 0.02793702925555408, "learning_rate": 7.5368e-06, "loss": 0.0161, "num_tokens": 53953661.0, "reward": 3.8980746269226074, "reward_std": 0.3230654299259186, "rewards/reward_fn/mean": 3.8980746269226074, "rewards/reward_fn/std": 0.3230654001235962, "step": 1159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1010.0, "completions/max_terminated_length": 1010.0, "completions/mean_length": 207.53125, "completions/mean_terminated_length": 207.53125, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.12305081149888618, "frac_reward_zero_std": 0.0, "grad_norm": 2.296875, "kl": 0.029348223470151424, "learning_rate": 7.5364e-06, "loss": 0.0201, "num_tokens": 53978894.0, "reward": 3.4990692138671875, "reward_std": 0.6988479495048523, "rewards/reward_fn/mean": 3.4990692138671875, "rewards/reward_fn/std": 0.6988478899002075, "step": 1160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 660.0, "completions/max_terminated_length": 660.0, "completions/mean_length": 216.6875, "completions/mean_terminated_length": 216.6875, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "epoch": 0.12315688978466108, "frac_reward_zero_std": 0.0, "grad_norm": 1.75, "kl": 0.03555730218067765, "learning_rate": 7.5359999999999995e-06, "loss": 0.0271, "num_tokens": 54005508.0, "reward": 3.8622612953186035, "reward_std": 0.37032949924468994, "rewards/reward_fn/mean": 3.8622612953186035, "rewards/reward_fn/std": 0.37032952904701233, "step": 1161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1203.0, "completions/max_terminated_length": 1203.0, "completions/mean_length": 353.6875, "completions/mean_terminated_length": 353.6875, "completions/min_length": 272.0, "completions/min_terminated_length": 272.0, "epoch": 0.12326296807043598, "frac_reward_zero_std": 0.0, "grad_norm": 1.6640625, "kl": 0.018009544699452817, "learning_rate": 7.5355999999999995e-06, "loss": -0.0023, "num_tokens": 54058490.0, "reward": 3.888385772705078, "reward_std": 0.3552592396736145, "rewards/reward_fn/mean": 3.888385772705078, "rewards/reward_fn/std": 0.3552592098712921, "step": 1162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 614.0, "completions/max_terminated_length": 614.0, "completions/mean_length": 222.125, "completions/mean_terminated_length": 222.125, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 0.12336904635621089, "frac_reward_zero_std": 1.0, "grad_norm": 0.09716796875, "kl": 0.02037247340194881, "learning_rate": 7.5351999999999994e-06, "loss": 0.0008, "num_tokens": 54102334.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 890.0, "completions/max_terminated_length": 890.0, "completions/mean_length": 307.5625, "completions/mean_terminated_length": 307.5625, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 0.12347512464198579, "frac_reward_zero_std": 0.0, "grad_norm": 1.7890625, "kl": 0.026630163891240954, "learning_rate": 7.534799999999999e-06, "loss": 0.0552, "num_tokens": 54132944.0, "reward": 3.7399346828460693, "reward_std": 0.5008683800697327, "rewards/reward_fn/mean": 3.7399346828460693, "rewards/reward_fn/std": 0.5008684396743774, "step": 1164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1234.0, "completions/max_terminated_length": 1234.0, "completions/mean_length": 327.96875, "completions/mean_terminated_length": 327.96875, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 0.12358120292776069, "frac_reward_zero_std": 0.0, "grad_norm": 1.890625, "kl": 0.024272862123325467, "learning_rate": 7.534399999999999e-06, "loss": 0.2175, "num_tokens": 54182607.0, "reward": 3.42256236076355, "reward_std": 1.0162280797958374, "rewards/reward_fn/mean": 3.42256236076355, "rewards/reward_fn/std": 1.0162280797958374, "step": 1165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 594.0, "completions/max_terminated_length": 594.0, "completions/mean_length": 255.90625, "completions/mean_terminated_length": 255.90625, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.12368728121353559, "frac_reward_zero_std": 0.0, "grad_norm": 1.8359375, "kl": 0.02412836579605937, "learning_rate": 7.533999999999999e-06, "loss": 0.1033, "num_tokens": 54237196.0, "reward": 3.846345901489258, "reward_std": 0.4130762219429016, "rewards/reward_fn/mean": 3.846345901489258, "rewards/reward_fn/std": 0.4130762219429016, "step": 1166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1370.0, "completions/max_terminated_length": 1370.0, "completions/mean_length": 366.46875, "completions/mean_terminated_length": 366.46875, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "epoch": 0.12379335949931049, "frac_reward_zero_std": 0.0, "grad_norm": 1.765625, "kl": 0.024113824125379324, "learning_rate": 7.533599999999999e-06, "loss": 0.003, "num_tokens": 54280251.0, "reward": 3.4150004386901855, "reward_std": 0.6626617312431335, "rewards/reward_fn/mean": 3.4150004386901855, "rewards/reward_fn/std": 0.6626616716384888, "step": 1167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 380.0, "completions/max_terminated_length": 380.0, "completions/mean_length": 200.25, "completions/mean_terminated_length": 200.25, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "epoch": 0.12389943778508539, "frac_reward_zero_std": 0.0, "grad_norm": 2.421875, "kl": 0.025690929731354117, "learning_rate": 7.533199999999999e-06, "loss": 0.0769, "num_tokens": 54321059.0, "reward": 2.788593053817749, "reward_std": 0.048831358551979065, "rewards/reward_fn/mean": 2.788593053817749, "rewards/reward_fn/std": 0.048831358551979065, "step": 1168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1791.0, "completions/max_terminated_length": 1791.0, "completions/mean_length": 496.5625, "completions/mean_terminated_length": 496.5625, "completions/min_length": 274.0, "completions/min_terminated_length": 274.0, "epoch": 0.1240055160708603, "frac_reward_zero_std": 0.0, "grad_norm": 1.21875, "kl": 0.017352971248328686, "learning_rate": 7.532799999999999e-06, "loss": -0.0641, "num_tokens": 54375925.0, "reward": 3.188598871231079, "reward_std": 0.6397762894630432, "rewards/reward_fn/mean": 3.188598871231079, "rewards/reward_fn/std": 0.6397762298583984, "step": 1169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 244.0, "completions/max_terminated_length": 244.0, "completions/mean_length": 142.9375, "completions/mean_terminated_length": 142.9375, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "epoch": 0.1241115943566352, "frac_reward_zero_std": 1.0, "grad_norm": 0.2451171875, "kl": 0.06317757535725832, "learning_rate": 7.532399999999999e-06, "loss": 0.0025, "num_tokens": 54416211.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 186.0, "completions/max_terminated_length": 186.0, "completions/mean_length": 146.0625, "completions/mean_terminated_length": 146.0625, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.1242176726424101, "frac_reward_zero_std": 1.0, "grad_norm": 0.138671875, "kl": 0.02488727425225079, "learning_rate": 7.532e-06, "loss": 0.001, "num_tokens": 54455445.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 634.0, "completions/max_terminated_length": 634.0, "completions/mean_length": 187.5625, "completions/mean_terminated_length": 187.5625, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.124323750928185, "frac_reward_zero_std": 0.0, "grad_norm": 2.0625, "kl": 0.026105482364073396, "learning_rate": 7.5316e-06, "loss": 0.0265, "num_tokens": 54492679.0, "reward": 3.9259839057922363, "reward_std": 0.291323721408844, "rewards/reward_fn/mean": 3.9259839057922363, "rewards/reward_fn/std": 0.291323721408844, "step": 1172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 306.0, "completions/max_terminated_length": 306.0, "completions/mean_length": 231.875, "completions/mean_terminated_length": 231.875, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 0.1244298292139599, "frac_reward_zero_std": 0.0, "grad_norm": 1.78125, "kl": 0.015331451082602143, "learning_rate": 7.5312e-06, "loss": 0.0222, "num_tokens": 54553795.0, "reward": 3.9769577980041504, "reward_std": 0.13034707307815552, "rewards/reward_fn/mean": 3.9769577980041504, "rewards/reward_fn/std": 0.1303471028804779, "step": 1173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 263.0, "completions/max_terminated_length": 263.0, "completions/mean_length": 100.8125, "completions/mean_terminated_length": 100.8125, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "epoch": 0.12453590749973481, "frac_reward_zero_std": 1.0, "grad_norm": 0.193359375, "kl": 0.03184149111621082, "learning_rate": 7.5308e-06, "loss": 0.0013, "num_tokens": 54591645.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1491.0, "completions/max_terminated_length": 1491.0, "completions/mean_length": 347.1875, "completions/mean_terminated_length": 347.1875, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 0.12464198578550971, "frac_reward_zero_std": 0.0, "grad_norm": 1.9296875, "kl": 0.027322919107973576, "learning_rate": 7.5304e-06, "loss": -0.2403, "num_tokens": 54634275.0, "reward": 2.2521934509277344, "reward_std": 0.5425410866737366, "rewards/reward_fn/mean": 2.2521934509277344, "rewards/reward_fn/std": 0.5425410866737366, "step": 1175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 905.0, "completions/max_terminated_length": 905.0, "completions/mean_length": 264.90625, "completions/mean_terminated_length": 264.90625, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "epoch": 0.12474806407128461, "frac_reward_zero_std": 0.0, "grad_norm": 1.640625, "kl": 0.01803676167037338, "learning_rate": 7.53e-06, "loss": -0.0299, "num_tokens": 54664704.0, "reward": 3.8208837509155273, "reward_std": 0.4239185154438019, "rewards/reward_fn/mean": 3.8208837509155273, "rewards/reward_fn/std": 0.4239185154438019, "step": 1176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1419.0, "completions/max_terminated_length": 1419.0, "completions/mean_length": 380.90625, "completions/mean_terminated_length": 380.90625, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 0.12485414235705951, "frac_reward_zero_std": 0.0, "grad_norm": 1.765625, "kl": 0.02894961880519986, "learning_rate": 7.5296e-06, "loss": -0.0024, "num_tokens": 54708189.0, "reward": 3.098309278488159, "reward_std": 0.35013359785079956, "rewards/reward_fn/mean": 3.098309278488159, "rewards/reward_fn/std": 0.35013359785079956, "step": 1177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 196.8125, "completions/mean_terminated_length": 196.8125, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.12496022064283441, "frac_reward_zero_std": 1.0, "grad_norm": 0.08251953125, "kl": 0.020095350686460733, "learning_rate": 7.5292e-06, "loss": 0.0008, "num_tokens": 54760727.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 410.0, "completions/max_terminated_length": 410.0, "completions/mean_length": 180.09375, "completions/mean_terminated_length": 180.09375, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 0.12506629892860932, "frac_reward_zero_std": 0.0, "grad_norm": 2.890625, "kl": 0.028536976547911763, "learning_rate": 7.5288e-06, "loss": 0.1951, "num_tokens": 54812314.0, "reward": 3.8505232334136963, "reward_std": 0.40203312039375305, "rewards/reward_fn/mean": 3.8505232334136963, "rewards/reward_fn/std": 0.40203315019607544, "step": 1179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 572.0, "completions/max_terminated_length": 572.0, "completions/mean_length": 140.3125, "completions/mean_terminated_length": 140.3125, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 0.1251723772143842, "frac_reward_zero_std": 1.0, "grad_norm": 0.140625, "kl": 0.028160194400697947, "learning_rate": 7.5284e-06, "loss": 0.0011, "num_tokens": 54844484.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 530.0, "completions/max_terminated_length": 530.0, "completions/mean_length": 165.625, "completions/mean_terminated_length": 165.625, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "epoch": 0.12527845550015912, "frac_reward_zero_std": 1.0, "grad_norm": 0.10009765625, "kl": 0.018880587071180344, "learning_rate": 7.527999999999999e-06, "loss": 0.0008, "num_tokens": 54892440.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 569.0, "completions/max_terminated_length": 569.0, "completions/mean_length": 108.75, "completions/mean_terminated_length": 108.75, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "epoch": 0.12538453378593403, "frac_reward_zero_std": 1.0, "grad_norm": 0.1259765625, "kl": 0.030261845095083117, "learning_rate": 7.527599999999999e-06, "loss": 0.0012, "num_tokens": 54919440.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 647.0, "completions/max_terminated_length": 647.0, "completions/mean_length": 353.9375, "completions/mean_terminated_length": 353.9375, "completions/min_length": 224.0, "completions/min_terminated_length": 224.0, "epoch": 0.12549061207170892, "frac_reward_zero_std": 0.0, "grad_norm": 1.3359375, "kl": 0.019826365518383682, "learning_rate": 7.5272e-06, "loss": 0.0081, "num_tokens": 54949006.0, "reward": 3.937495231628418, "reward_std": 0.24625274538993835, "rewards/reward_fn/mean": 3.937495231628418, "rewards/reward_fn/std": 0.24625271558761597, "step": 1183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 213.0, "completions/max_terminated_length": 213.0, "completions/mean_length": 138.625, "completions/mean_terminated_length": 138.625, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.12559669035748383, "frac_reward_zero_std": 0.0, "grad_norm": 1.953125, "kl": 0.02072442672215402, "learning_rate": 7.5268e-06, "loss": 0.0357, "num_tokens": 54983426.0, "reward": 2.824800968170166, "reward_std": 0.023115260526537895, "rewards/reward_fn/mean": 2.824800968170166, "rewards/reward_fn/std": 0.023115256801247597, "step": 1184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1097.0, "completions/max_terminated_length": 1097.0, "completions/mean_length": 375.9375, "completions/mean_terminated_length": 375.9375, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "epoch": 0.12570276864325872, "frac_reward_zero_std": 0.0, "grad_norm": 1.2265625, "kl": 0.01736769184935838, "learning_rate": 7.5264e-06, "loss": -0.0127, "num_tokens": 55026240.0, "reward": 3.8968665599823, "reward_std": 0.33643701672554016, "rewards/reward_fn/mean": 3.8968665599823, "rewards/reward_fn/std": 0.3364369869232178, "step": 1185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1198.0, "completions/max_terminated_length": 1198.0, "completions/mean_length": 286.40625, "completions/mean_terminated_length": 286.40625, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.12580884692903363, "frac_reward_zero_std": 0.0, "grad_norm": 2.109375, "kl": 0.021791806211695075, "learning_rate": 7.526e-06, "loss": -0.0103, "num_tokens": 55069293.0, "reward": 3.151297092437744, "reward_std": 0.42361894249916077, "rewards/reward_fn/mean": 3.151297092437744, "rewards/reward_fn/std": 0.4236189126968384, "step": 1186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 427.0, "completions/max_terminated_length": 427.0, "completions/mean_length": 231.46875, "completions/mean_terminated_length": 231.46875, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 0.12591492521480852, "frac_reward_zero_std": 0.0, "grad_norm": 1.9765625, "kl": 0.022306752856820822, "learning_rate": 7.5255999999999996e-06, "loss": 0.0999, "num_tokens": 55109244.0, "reward": 2.9199135303497314, "reward_std": 0.038811203092336655, "rewards/reward_fn/mean": 2.9199135303497314, "rewards/reward_fn/std": 0.03881121799349785, "step": 1187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1251.0, "completions/max_terminated_length": 1251.0, "completions/mean_length": 444.8125, "completions/mean_terminated_length": 444.8125, "completions/min_length": 244.0, "completions/min_terminated_length": 244.0, "epoch": 0.12602100350058343, "frac_reward_zero_std": 0.0, "grad_norm": 1.1484375, "kl": 0.016548768035136163, "learning_rate": 7.5251999999999995e-06, "loss": -0.0012, "num_tokens": 55156630.0, "reward": 2.838229179382324, "reward_std": 0.38113701343536377, "rewards/reward_fn/mean": 2.838229179382324, "rewards/reward_fn/std": 0.381136953830719, "step": 1188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1006.0, "completions/max_terminated_length": 1006.0, "completions/mean_length": 224.09375, "completions/mean_terminated_length": 224.09375, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "epoch": 0.12612708178635834, "frac_reward_zero_std": 1.0, "grad_norm": 0.09326171875, "kl": 0.022972247563302517, "learning_rate": 7.5247999999999995e-06, "loss": 0.0009, "num_tokens": 55201497.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 312.0, "completions/max_terminated_length": 312.0, "completions/mean_length": 227.125, "completions/mean_terminated_length": 227.125, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "epoch": 0.12623316007213323, "frac_reward_zero_std": 1.0, "grad_norm": 0.0712890625, "kl": 0.016570686595514417, "learning_rate": 7.5243999999999995e-06, "loss": 0.0007, "num_tokens": 55253885.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 611.0, "completions/max_terminated_length": 611.0, "completions/mean_length": 120.625, "completions/mean_terminated_length": 120.625, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "epoch": 0.12633923835790814, "frac_reward_zero_std": 1.0, "grad_norm": 0.0830078125, "kl": 0.01669433683855459, "learning_rate": 7.5239999999999995e-06, "loss": 0.0007, "num_tokens": 55299857.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 961.0, "completions/max_terminated_length": 961.0, "completions/mean_length": 317.1875, "completions/mean_terminated_length": 317.1875, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "epoch": 0.12644531664368303, "frac_reward_zero_std": 0.0, "grad_norm": 1.359375, "kl": 0.017451874213293195, "learning_rate": 7.523599999999999e-06, "loss": 0.042, "num_tokens": 55343415.0, "reward": 3.317841053009033, "reward_std": 0.36798393726348877, "rewards/reward_fn/mean": 3.317841053009033, "rewards/reward_fn/std": 0.3679839074611664, "step": 1192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 822.0, "completions/max_terminated_length": 822.0, "completions/mean_length": 209.03125, "completions/mean_terminated_length": 209.03125, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "epoch": 0.12655139492945794, "frac_reward_zero_std": 1.0, "grad_norm": 0.10546875, "kl": 0.02464919281192124, "learning_rate": 7.523199999999999e-06, "loss": 0.001, "num_tokens": 55384440.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 958.0, "completions/max_terminated_length": 958.0, "completions/mean_length": 233.65625, "completions/mean_terminated_length": 233.65625, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 0.12665747321523285, "frac_reward_zero_std": 0.0, "grad_norm": 1.78125, "kl": 0.018277527298778296, "learning_rate": 7.5228e-06, "loss": 0.1391, "num_tokens": 55425453.0, "reward": 2.884368419647217, "reward_std": 0.03824806585907936, "rewards/reward_fn/mean": 2.884368419647217, "rewards/reward_fn/std": 0.038248054683208466, "step": 1194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1284.0, "completions/max_terminated_length": 1284.0, "completions/mean_length": 469.59375, "completions/mean_terminated_length": 469.59375, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "epoch": 0.12676355150100774, "frac_reward_zero_std": 0.0, "grad_norm": 1.3984375, "kl": 0.022253156173974276, "learning_rate": 7.5224e-06, "loss": 0.1067, "num_tokens": 55476704.0, "reward": 2.19340443611145, "reward_std": 0.5337251424789429, "rewards/reward_fn/mean": 2.19340443611145, "rewards/reward_fn/std": 0.5337251424789429, "step": 1195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 932.0, "completions/max_terminated_length": 932.0, "completions/mean_length": 283.71875, "completions/mean_terminated_length": 283.71875, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.12686962978678265, "frac_reward_zero_std": 0.0, "grad_norm": 1.7421875, "kl": 0.02405042969621718, "learning_rate": 7.522e-06, "loss": 0.0576, "num_tokens": 55530359.0, "reward": 3.064527988433838, "reward_std": 0.7311015725135803, "rewards/reward_fn/mean": 3.064527988433838, "rewards/reward_fn/std": 0.7311015725135803, "step": 1196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1483.0, "completions/max_terminated_length": 1483.0, "completions/mean_length": 494.71875, "completions/mean_terminated_length": 494.71875, "completions/min_length": 292.0, "completions/min_terminated_length": 292.0, "epoch": 0.12697570807255754, "frac_reward_zero_std": 0.0, "grad_norm": 1.3671875, "kl": 0.03502373583614826, "learning_rate": 7.5216e-06, "loss": -0.0091, "num_tokens": 55589870.0, "reward": 2.738553524017334, "reward_std": 0.32435670495033264, "rewards/reward_fn/mean": 2.738553524017334, "rewards/reward_fn/std": 0.32435664534568787, "step": 1197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 977.0, "completions/max_terminated_length": 977.0, "completions/mean_length": 295.40625, "completions/mean_terminated_length": 295.40625, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.12708178635833245, "frac_reward_zero_std": 1.0, "grad_norm": 0.0791015625, "kl": 0.021748598664999008, "learning_rate": 7.5212e-06, "loss": 0.0009, "num_tokens": 55637723.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 668.0, "completions/max_terminated_length": 668.0, "completions/mean_length": 188.15625, "completions/mean_terminated_length": 188.15625, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.12718786464410736, "frac_reward_zero_std": 0.0, "grad_norm": 2.421875, "kl": 0.032654849579557776, "learning_rate": 7.5208e-06, "loss": 0.0252, "num_tokens": 55684416.0, "reward": 3.786581039428711, "reward_std": 0.40999314188957214, "rewards/reward_fn/mean": 3.786581039428711, "rewards/reward_fn/std": 0.40999317169189453, "step": 1199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1946.0, "completions/mean_length": 433.5, "completions/mean_terminated_length": 381.4193420410156, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "epoch": 0.12729394292988225, "frac_reward_zero_std": 0.0, "grad_norm": 1.8671875, "kl": 0.02767302538268268, "learning_rate": 7.5204e-06, "loss": 0.1638, "num_tokens": 55738768.0, "reward": 2.4253549575805664, "reward_std": 0.6796978712081909, "rewards/reward_fn/mean": 2.4253549575805664, "rewards/reward_fn/std": 0.6796978712081909, "step": 1200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 259.0, "completions/max_terminated_length": 259.0, "completions/mean_length": 178.625, "completions/mean_terminated_length": 178.625, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "epoch": 0.12740002121565716, "frac_reward_zero_std": 0.0, "grad_norm": 2.0625, "kl": 0.023716250201687217, "learning_rate": 7.519999999999999e-06, "loss": 0.03, "num_tokens": 55773892.0, "reward": 3.820127487182617, "reward_std": 0.5937113165855408, "rewards/reward_fn/mean": 3.820127487182617, "rewards/reward_fn/std": 0.5937113761901855, "step": 1201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 264.0, "completions/max_terminated_length": 264.0, "completions/mean_length": 196.0, "completions/mean_terminated_length": 196.0, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.12750609950143205, "frac_reward_zero_std": 0.0, "grad_norm": 2.59375, "kl": 0.026854999363422394, "learning_rate": 7.519599999999999e-06, "loss": 0.0397, "num_tokens": 55812132.0, "reward": 2.892002582550049, "reward_std": 0.21918649971485138, "rewards/reward_fn/mean": 2.892002582550049, "rewards/reward_fn/std": 0.21918649971485138, "step": 1202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 210.0, "completions/max_terminated_length": 210.0, "completions/mean_length": 115.59375, "completions/mean_terminated_length": 115.59375, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "epoch": 0.12761217778720696, "frac_reward_zero_std": 0.0, "grad_norm": 3.34375, "kl": 0.028977030655369163, "learning_rate": 7.519199999999999e-06, "loss": 0.0267, "num_tokens": 55845655.0, "reward": 3.7437262535095215, "reward_std": 0.24018844962120056, "rewards/reward_fn/mean": 3.7437262535095215, "rewards/reward_fn/std": 0.24018843472003937, "step": 1203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1056.0, "completions/max_terminated_length": 1056.0, "completions/mean_length": 576.625, "completions/mean_terminated_length": 576.625, "completions/min_length": 322.0, "completions/min_terminated_length": 322.0, "epoch": 0.12771825607298187, "frac_reward_zero_std": 0.0, "grad_norm": 1.0859375, "kl": 0.021841128589585423, "learning_rate": 7.518799999999999e-06, "loss": -0.0479, "num_tokens": 55903275.0, "reward": 2.7085070610046387, "reward_std": 0.27952468395233154, "rewards/reward_fn/mean": 2.7085070610046387, "rewards/reward_fn/std": 0.2795247435569763, "step": 1204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 426.0, "completions/max_terminated_length": 426.0, "completions/mean_length": 250.96875, "completions/mean_terminated_length": 250.96875, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 0.12782433435875676, "frac_reward_zero_std": 0.0, "grad_norm": 1.984375, "kl": 0.02591366902925074, "learning_rate": 7.518399999999999e-06, "loss": -0.0142, "num_tokens": 55954634.0, "reward": 3.8671493530273438, "reward_std": 0.4651517868041992, "rewards/reward_fn/mean": 3.8671493530273438, "rewards/reward_fn/std": 0.46515172719955444, "step": 1205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 998.0, "completions/max_terminated_length": 998.0, "completions/mean_length": 159.65625, "completions/mean_terminated_length": 159.65625, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "epoch": 0.12793041264453167, "frac_reward_zero_std": 0.0, "grad_norm": 1.9453125, "kl": 0.031389163341373205, "learning_rate": 7.518e-06, "loss": -0.0059, "num_tokens": 55992159.0, "reward": 3.941786527633667, "reward_std": 0.22907233238220215, "rewards/reward_fn/mean": 3.941786527633667, "rewards/reward_fn/std": 0.22907236218452454, "step": 1206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1829.0, "completions/max_terminated_length": 1829.0, "completions/mean_length": 441.4375, "completions/mean_terminated_length": 441.4375, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "epoch": 0.12803649093030656, "frac_reward_zero_std": 0.0, "grad_norm": 1.5078125, "kl": 0.021549456054344773, "learning_rate": 7.5176e-06, "loss": -0.0619, "num_tokens": 56038477.0, "reward": 1.9434212446212769, "reward_std": 0.4184891879558563, "rewards/reward_fn/mean": 1.9434212446212769, "rewards/reward_fn/std": 0.41848915815353394, "step": 1207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 357.0, "completions/max_terminated_length": 357.0, "completions/mean_length": 252.65625, "completions/mean_terminated_length": 252.65625, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.12814256921608147, "frac_reward_zero_std": 1.0, "grad_norm": 0.09521484375, "kl": 0.022802669554948807, "learning_rate": 7.5172e-06, "loss": 0.0009, "num_tokens": 56080226.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 219.0, "completions/max_terminated_length": 219.0, "completions/mean_length": 135.28125, "completions/mean_terminated_length": 135.28125, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "epoch": 0.12824864750185638, "frac_reward_zero_std": 0.0, "grad_norm": 2.46875, "kl": 0.029397084843367338, "learning_rate": 7.5168e-06, "loss": 0.0055, "num_tokens": 56099915.0, "reward": 3.7214303016662598, "reward_std": 0.748846709728241, "rewards/reward_fn/mean": 3.7214303016662598, "rewards/reward_fn/std": 0.748846709728241, "step": 1209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 843.0, "completions/max_terminated_length": 843.0, "completions/mean_length": 223.21875, "completions/mean_terminated_length": 223.21875, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 0.12835472578763127, "frac_reward_zero_std": 1.0, "grad_norm": 0.083984375, "kl": 0.019289852352812886, "learning_rate": 7.5164e-06, "loss": 0.0008, "num_tokens": 56141554.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1602.0, "completions/max_terminated_length": 1602.0, "completions/mean_length": 383.5, "completions/mean_terminated_length": 383.5, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 0.12846080407340618, "frac_reward_zero_std": 0.0, "grad_norm": 1.8671875, "kl": 0.021108913468196988, "learning_rate": 7.516e-06, "loss": 0.1666, "num_tokens": 56186946.0, "reward": 2.6080212593078613, "reward_std": 0.37626245617866516, "rewards/reward_fn/mean": 2.6080212593078613, "rewards/reward_fn/std": 0.37626245617866516, "step": 1211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1582.0, "completions/max_terminated_length": 1582.0, "completions/mean_length": 310.46875, "completions/mean_terminated_length": 310.46875, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 0.12856688235918107, "frac_reward_zero_std": 0.0, "grad_norm": 2.078125, "kl": 0.031106388196349144, "learning_rate": 7.5156e-06, "loss": 0.0356, "num_tokens": 56225809.0, "reward": 3.812455654144287, "reward_std": 0.5331630110740662, "rewards/reward_fn/mean": 3.812455654144287, "rewards/reward_fn/std": 0.5331630110740662, "step": 1212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1278.0, "completions/max_terminated_length": 1278.0, "completions/mean_length": 234.3125, "completions/mean_terminated_length": 234.3125, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 0.12867296064495598, "frac_reward_zero_std": 1.0, "grad_norm": 0.08837890625, "kl": 0.02681015362031758, "learning_rate": 7.5152e-06, "loss": 0.0011, "num_tokens": 56271995.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 733.0, "completions/max_terminated_length": 733.0, "completions/mean_length": 182.875, "completions/mean_terminated_length": 182.875, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.12877903893073087, "frac_reward_zero_std": 0.0, "grad_norm": 2.046875, "kl": 0.02213688869960606, "learning_rate": 7.5148e-06, "loss": -0.0091, "num_tokens": 56311767.0, "reward": 3.972648859024048, "reward_std": 0.15472157299518585, "rewards/reward_fn/mean": 3.972648859024048, "rewards/reward_fn/std": 0.15472158789634705, "step": 1214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 478.0, "completions/max_terminated_length": 478.0, "completions/mean_length": 173.0, "completions/mean_terminated_length": 173.0, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.12888511721650578, "frac_reward_zero_std": 1.0, "grad_norm": 0.103515625, "kl": 0.021893349941819906, "learning_rate": 7.5144e-06, "loss": 0.0009, "num_tokens": 56351543.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 886.0, "completions/max_terminated_length": 886.0, "completions/mean_length": 373.15625, "completions/mean_terminated_length": 373.15625, "completions/min_length": 247.0, "completions/min_terminated_length": 247.0, "epoch": 0.1289911955022807, "frac_reward_zero_std": 0.0, "grad_norm": 1.40625, "kl": 0.018810822628438473, "learning_rate": 7.5139999999999995e-06, "loss": 0.023, "num_tokens": 56409276.0, "reward": 2.859402656555176, "reward_std": 0.30087754130363464, "rewards/reward_fn/mean": 2.859402656555176, "rewards/reward_fn/std": 0.30087754130363464, "step": 1216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1885.0, "completions/mean_length": 824.4375, "completions/mean_terminated_length": 784.9677124023438, "completions/min_length": 437.0, "completions/min_terminated_length": 437.0, "epoch": 0.12909727378805558, "frac_reward_zero_std": 0.0, "grad_norm": 1.109375, "kl": 0.019085350446403027, "learning_rate": 7.5135999999999995e-06, "loss": 0.0753, "num_tokens": 56473194.0, "reward": 2.612720012664795, "reward_std": 0.8761619925498962, "rewards/reward_fn/mean": 2.612720012664795, "rewards/reward_fn/std": 0.8761619329452515, "step": 1217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1181.0, "completions/max_terminated_length": 1181.0, "completions/mean_length": 309.84375, "completions/mean_terminated_length": 309.84375, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 0.1292033520738305, "frac_reward_zero_std": 0.0, "grad_norm": 2.1875, "kl": 0.025362204294651747, "learning_rate": 7.5132e-06, "loss": 0.1876, "num_tokens": 56515589.0, "reward": 3.943398952484131, "reward_std": 0.22278046607971191, "rewards/reward_fn/mean": 3.943398952484131, "rewards/reward_fn/std": 0.22278045117855072, "step": 1218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1469.0, "completions/max_terminated_length": 1469.0, "completions/mean_length": 458.5625, "completions/mean_terminated_length": 458.5625, "completions/min_length": 239.0, "completions/min_terminated_length": 239.0, "epoch": 0.12930943035960538, "frac_reward_zero_std": 0.0, "grad_norm": 1.578125, "kl": 0.019134088302962482, "learning_rate": 7.5128e-06, "loss": 0.0589, "num_tokens": 56561207.0, "reward": 2.810291051864624, "reward_std": 0.20179764926433563, "rewards/reward_fn/mean": 2.810291051864624, "rewards/reward_fn/std": 0.20179766416549683, "step": 1219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 173.0, "completions/max_terminated_length": 173.0, "completions/mean_length": 123.375, "completions/mean_terminated_length": 123.375, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.1294155086453803, "frac_reward_zero_std": 0.0, "grad_norm": 2.34375, "kl": 0.031952549470588565, "learning_rate": 7.5124e-06, "loss": 0.0337, "num_tokens": 56600451.0, "reward": 2.794665813446045, "reward_std": 0.021768808364868164, "rewards/reward_fn/mean": 2.794665813446045, "rewards/reward_fn/std": 0.02176877297461033, "step": 1220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 541.0, "completions/max_terminated_length": 541.0, "completions/mean_length": 154.96875, "completions/mean_terminated_length": 154.96875, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.1295215869311552, "frac_reward_zero_std": 1.0, "grad_norm": 0.10498046875, "kl": 0.02546194172464311, "learning_rate": 7.511999999999999e-06, "loss": 0.001, "num_tokens": 56625634.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 849.0, "completions/max_terminated_length": 849.0, "completions/mean_length": 224.4375, "completions/mean_terminated_length": 224.4375, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 0.1296276652169301, "frac_reward_zero_std": 1.0, "grad_norm": 0.1015625, "kl": 0.02249453659169376, "learning_rate": 7.511599999999999e-06, "loss": 0.0009, "num_tokens": 56664080.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 337.0, "completions/max_terminated_length": 337.0, "completions/mean_length": 250.90625, "completions/mean_terminated_length": 250.90625, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 0.129733743502705, "frac_reward_zero_std": 0.0, "grad_norm": 1.8671875, "kl": 0.0323591826017946, "learning_rate": 7.511199999999999e-06, "loss": -0.0297, "num_tokens": 56708525.0, "reward": 2.691908359527588, "reward_std": 0.03684841841459274, "rewards/reward_fn/mean": 2.691908359527588, "rewards/reward_fn/std": 0.03684840723872185, "step": 1223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1036.0, "completions/max_terminated_length": 1036.0, "completions/mean_length": 304.40625, "completions/mean_terminated_length": 304.40625, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "epoch": 0.1298398217884799, "frac_reward_zero_std": 0.0, "grad_norm": 1.6328125, "kl": 0.02477300027385354, "learning_rate": 7.510799999999999e-06, "loss": 0.0578, "num_tokens": 56751418.0, "reward": 3.2451024055480957, "reward_std": 0.5951552391052246, "rewards/reward_fn/mean": 3.2451024055480957, "rewards/reward_fn/std": 0.5951551795005798, "step": 1224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 490.0, "completions/max_terminated_length": 490.0, "completions/mean_length": 221.0, "completions/mean_terminated_length": 221.0, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 0.1299459000742548, "frac_reward_zero_std": 0.0, "grad_norm": 2.0625, "kl": 0.02321016346104443, "learning_rate": 7.510399999999999e-06, "loss": 0.0499, "num_tokens": 56774074.0, "reward": 3.972011089324951, "reward_std": 0.1583283692598343, "rewards/reward_fn/mean": 3.972011089324951, "rewards/reward_fn/std": 0.1583283543586731, "step": 1225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 990.0, "completions/max_terminated_length": 990.0, "completions/mean_length": 232.59375, "completions/mean_terminated_length": 232.59375, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.1300519783600297, "frac_reward_zero_std": 0.0, "grad_norm": 1.890625, "kl": 0.027153413044288754, "learning_rate": 7.509999999999999e-06, "loss": 0.0753, "num_tokens": 56813293.0, "reward": 2.9659242630004883, "reward_std": 0.0341903492808342, "rewards/reward_fn/mean": 2.9659242630004883, "rewards/reward_fn/std": 0.0341903492808342, "step": 1226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1277.0, "completions/max_terminated_length": 1277.0, "completions/mean_length": 396.53125, "completions/mean_terminated_length": 396.53125, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.1301580566458046, "frac_reward_zero_std": 0.0, "grad_norm": 1.65625, "kl": 0.022763823391869664, "learning_rate": 7.509599999999999e-06, "loss": 0.0602, "num_tokens": 56865662.0, "reward": 3.668447494506836, "reward_std": 0.6870428919792175, "rewards/reward_fn/mean": 3.668447494506836, "rewards/reward_fn/std": 0.6870428323745728, "step": 1227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 770.0, "completions/max_terminated_length": 770.0, "completions/mean_length": 259.125, "completions/mean_terminated_length": 259.125, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 0.1302641349315795, "frac_reward_zero_std": 0.0, "grad_norm": 1.453125, "kl": 0.02840983378700912, "learning_rate": 7.509199999999999e-06, "loss": 0.001, "num_tokens": 56907266.0, "reward": 3.0267953872680664, "reward_std": 0.18145422637462616, "rewards/reward_fn/mean": 3.0267953872680664, "rewards/reward_fn/std": 0.18145416676998138, "step": 1228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1084.0, "completions/max_terminated_length": 1084.0, "completions/mean_length": 509.75, "completions/mean_terminated_length": 509.75, "completions/min_length": 253.0, "completions/min_terminated_length": 253.0, "epoch": 0.1303702132173544, "frac_reward_zero_std": 0.0, "grad_norm": 0.98046875, "kl": 0.021488810423761606, "learning_rate": 7.508799999999999e-06, "loss": -0.0026, "num_tokens": 56957146.0, "reward": 2.784040927886963, "reward_std": 0.219995379447937, "rewards/reward_fn/mean": 2.784040927886963, "rewards/reward_fn/std": 0.21999536454677582, "step": 1229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1945.0, "completions/max_terminated_length": 1945.0, "completions/mean_length": 609.0, "completions/mean_terminated_length": 609.0, "completions/min_length": 332.0, "completions/min_terminated_length": 332.0, "epoch": 0.1304762915031293, "frac_reward_zero_std": 0.0, "grad_norm": 1.2265625, "kl": 0.022416489897295833, "learning_rate": 7.5084e-06, "loss": 0.1618, "num_tokens": 57012602.0, "reward": 3.59625244140625, "reward_std": 0.8699040412902832, "rewards/reward_fn/mean": 3.59625244140625, "rewards/reward_fn/std": 0.8699040412902832, "step": 1230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 488.0, "completions/max_terminated_length": 488.0, "completions/mean_length": 319.96875, "completions/mean_terminated_length": 319.96875, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "epoch": 0.13058236978890422, "frac_reward_zero_std": 0.0, "grad_norm": 1.921875, "kl": 0.025042463559657335, "learning_rate": 7.508e-06, "loss": 0.003, "num_tokens": 57064185.0, "reward": 3.126904010772705, "reward_std": 0.5139070749282837, "rewards/reward_fn/mean": 3.126904010772705, "rewards/reward_fn/std": 0.5139070749282837, "step": 1231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 774.0, "completions/max_terminated_length": 774.0, "completions/mean_length": 170.34375, "completions/mean_terminated_length": 170.34375, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.1306884480746791, "frac_reward_zero_std": 0.0, "grad_norm": 2.15625, "kl": 0.024505676236003637, "learning_rate": 7.5076e-06, "loss": 0.07, "num_tokens": 57102692.0, "reward": 3.0724833011627197, "reward_std": 0.03876578435301781, "rewards/reward_fn/mean": 3.0724833011627197, "rewards/reward_fn/std": 0.03876576945185661, "step": 1232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 553.0, "completions/max_terminated_length": 553.0, "completions/mean_length": 105.03125, "completions/mean_terminated_length": 105.03125, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "epoch": 0.13079452636045402, "frac_reward_zero_std": 1.0, "grad_norm": 0.126953125, "kl": 0.01858626154717058, "learning_rate": 7.5072e-06, "loss": 0.0007, "num_tokens": 57171781.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 381.0, "completions/max_terminated_length": 381.0, "completions/mean_length": 90.375, "completions/mean_terminated_length": 90.375, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "epoch": 0.1309006046462289, "frac_reward_zero_std": 1.0, "grad_norm": 0.1396484375, "kl": 0.027592405676841736, "learning_rate": 7.5068e-06, "loss": 0.0011, "num_tokens": 57202545.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1151.0, "completions/max_terminated_length": 1151.0, "completions/mean_length": 732.09375, "completions/mean_terminated_length": 732.09375, "completions/min_length": 424.0, "completions/min_terminated_length": 424.0, "epoch": 0.13100668293200382, "frac_reward_zero_std": 0.0, "grad_norm": 0.984375, "kl": 0.019199739210307598, "learning_rate": 7.5064e-06, "loss": 0.0475, "num_tokens": 57265332.0, "reward": 2.7055556774139404, "reward_std": 0.27973470091819763, "rewards/reward_fn/mean": 2.7055556774139404, "rewards/reward_fn/std": 0.27973467111587524, "step": 1235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 537.0, "completions/max_terminated_length": 537.0, "completions/mean_length": 364.5, "completions/mean_terminated_length": 364.5, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "epoch": 0.13111276121777873, "frac_reward_zero_std": 0.0, "grad_norm": 1.6015625, "kl": 0.028548541711643338, "learning_rate": 7.506e-06, "loss": -0.02, "num_tokens": 57320836.0, "reward": 2.947634696960449, "reward_std": 0.29028210043907166, "rewards/reward_fn/mean": 2.947634696960449, "rewards/reward_fn/std": 0.29028213024139404, "step": 1236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 697.0, "completions/max_terminated_length": 697.0, "completions/mean_length": 192.84375, "completions/mean_terminated_length": 192.84375, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 0.13121883950355362, "frac_reward_zero_std": 1.0, "grad_norm": 0.1279296875, "kl": 0.029802830889821053, "learning_rate": 7.5056e-06, "loss": 0.0012, "num_tokens": 57362975.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1456.0, "completions/max_terminated_length": 1456.0, "completions/mean_length": 424.59375, "completions/mean_terminated_length": 424.59375, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 0.13132491778932853, "frac_reward_zero_std": 0.0, "grad_norm": 1.4140625, "kl": 0.021809349535033107, "learning_rate": 7.5052e-06, "loss": -0.0274, "num_tokens": 57420690.0, "reward": 3.653506278991699, "reward_std": 0.635819137096405, "rewards/reward_fn/mean": 3.653506278991699, "rewards/reward_fn/std": 0.6358190774917603, "step": 1238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 490.0, "completions/max_terminated_length": 490.0, "completions/mean_length": 248.40625, "completions/mean_terminated_length": 248.40625, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 0.13143099607510342, "frac_reward_zero_std": 0.0, "grad_norm": 1.625, "kl": 0.026568999979645014, "learning_rate": 7.5048e-06, "loss": 0.054, "num_tokens": 57458239.0, "reward": 2.9668123722076416, "reward_std": 0.20671309530735016, "rewards/reward_fn/mean": 2.9668123722076416, "rewards/reward_fn/std": 0.20671308040618896, "step": 1239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 980.0, "completions/max_terminated_length": 980.0, "completions/mean_length": 183.4375, "completions/mean_terminated_length": 183.4375, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 0.13153707436087833, "frac_reward_zero_std": 0.0, "grad_norm": 2.15625, "kl": 0.02611563727259636, "learning_rate": 7.5044e-06, "loss": 0.1308, "num_tokens": 57515629.0, "reward": 3.3227896690368652, "reward_std": 0.5340291857719421, "rewards/reward_fn/mean": 3.3227896690368652, "rewards/reward_fn/std": 0.5340291857719421, "step": 1240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 578.0, "completions/max_terminated_length": 578.0, "completions/mean_length": 172.84375, "completions/mean_terminated_length": 172.84375, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 0.13164315264665322, "frac_reward_zero_std": 0.0, "grad_norm": 2.375, "kl": 0.02926379698328674, "learning_rate": 7.503999999999999e-06, "loss": 0.0133, "num_tokens": 57562088.0, "reward": 3.874748706817627, "reward_std": 0.452591210603714, "rewards/reward_fn/mean": 3.874748706817627, "rewards/reward_fn/std": 0.452591210603714, "step": 1241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 901.0, "completions/max_terminated_length": 901.0, "completions/mean_length": 212.15625, "completions/mean_terminated_length": 212.15625, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "epoch": 0.13174923093242813, "frac_reward_zero_std": 0.0, "grad_norm": 2.6875, "kl": 0.03235385986045003, "learning_rate": 7.5036e-06, "loss": 0.0707, "num_tokens": 57610029.0, "reward": 3.885643720626831, "reward_std": 0.3627666234970093, "rewards/reward_fn/mean": 3.885643720626831, "rewards/reward_fn/std": 0.3627666234970093, "step": 1242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1720.0, "completions/mean_length": 711.53125, "completions/mean_terminated_length": 668.4193115234375, "completions/min_length": 284.0, "completions/min_terminated_length": 284.0, "epoch": 0.13185530921820304, "frac_reward_zero_std": 0.0, "grad_norm": 1.546875, "kl": 0.019830133765935898, "learning_rate": 7.5032e-06, "loss": 0.2383, "num_tokens": 57672158.0, "reward": 2.4883432388305664, "reward_std": 0.572623610496521, "rewards/reward_fn/mean": 2.4883432388305664, "rewards/reward_fn/std": 0.5726235508918762, "step": 1243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1099.0, "completions/max_terminated_length": 1099.0, "completions/mean_length": 352.71875, "completions/mean_terminated_length": 352.71875, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.13196138750397793, "frac_reward_zero_std": 0.0, "grad_norm": 1.5234375, "kl": 0.02547502121888101, "learning_rate": 7.5027999999999996e-06, "loss": 0.0176, "num_tokens": 57717909.0, "reward": 2.8250460624694824, "reward_std": 0.2939043641090393, "rewards/reward_fn/mean": 2.8250460624694824, "rewards/reward_fn/std": 0.2939044237136841, "step": 1244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 691.0, "completions/max_terminated_length": 691.0, "completions/mean_length": 167.5625, "completions/mean_terminated_length": 167.5625, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "epoch": 0.13206746578975284, "frac_reward_zero_std": 0.0, "grad_norm": 3.25, "kl": 0.04957319051027298, "learning_rate": 7.5023999999999995e-06, "loss": 0.1431, "num_tokens": 57772551.0, "reward": 3.9671456813812256, "reward_std": 0.1858520656824112, "rewards/reward_fn/mean": 3.9671456813812256, "rewards/reward_fn/std": 0.1858520358800888, "step": 1245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1102.0, "completions/max_terminated_length": 1102.0, "completions/mean_length": 408.5625, "completions/mean_terminated_length": 408.5625, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 0.13217354407552773, "frac_reward_zero_std": 0.0, "grad_norm": 1.6484375, "kl": 0.028883446706458926, "learning_rate": 7.5019999999999995e-06, "loss": 0.0568, "num_tokens": 57820825.0, "reward": 2.7001430988311768, "reward_std": 0.3382115960121155, "rewards/reward_fn/mean": 2.7001430988311768, "rewards/reward_fn/std": 0.33821165561676025, "step": 1246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1814.0, "completions/max_terminated_length": 1814.0, "completions/mean_length": 494.78125, "completions/mean_terminated_length": 494.78125, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "epoch": 0.13227962236130264, "frac_reward_zero_std": 0.0, "grad_norm": 1.5078125, "kl": 0.018464128370396793, "learning_rate": 7.5015999999999995e-06, "loss": -0.1013, "num_tokens": 57858418.0, "reward": 3.8081459999084473, "reward_std": 0.5380722284317017, "rewards/reward_fn/mean": 3.8081459999084473, "rewards/reward_fn/std": 0.5380722284317017, "step": 1247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 102.0, "completions/max_terminated_length": 102.0, "completions/mean_length": 79.84375, "completions/mean_terminated_length": 79.84375, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "epoch": 0.13238570064707755, "frac_reward_zero_std": 1.0, "grad_norm": 0.1943359375, "kl": 0.02420707419514656, "learning_rate": 7.5011999999999994e-06, "loss": 0.001, "num_tokens": 57898733.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1284.0, "completions/max_terminated_length": 1284.0, "completions/mean_length": 336.15625, "completions/mean_terminated_length": 336.15625, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 0.13249177893285244, "frac_reward_zero_std": 0.0, "grad_norm": 1.1796875, "kl": 0.02854512631893158, "learning_rate": 7.500799999999999e-06, "loss": -0.0838, "num_tokens": 57938002.0, "reward": 2.6425585746765137, "reward_std": 0.25894415378570557, "rewards/reward_fn/mean": 2.6425585746765137, "rewards/reward_fn/std": 0.25894415378570557, "step": 1249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 444.0, "completions/max_terminated_length": 444.0, "completions/mean_length": 294.5, "completions/mean_terminated_length": 294.5, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "epoch": 0.13259785721862735, "frac_reward_zero_std": 0.0, "grad_norm": 1.625, "kl": 0.02842319617047906, "learning_rate": 7.500399999999999e-06, "loss": 0.063, "num_tokens": 57993954.0, "reward": 3.9350175857543945, "reward_std": 0.2557204067707062, "rewards/reward_fn/mean": 3.9350175857543945, "rewards/reward_fn/std": 0.2557204067707062, "step": 1250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 171.0, "completions/max_terminated_length": 171.0, "completions/mean_length": 103.03125, "completions/mean_terminated_length": 103.03125, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "epoch": 0.13270393550440224, "frac_reward_zero_std": 0.0, "grad_norm": 3.046875, "kl": 0.02735895407386124, "learning_rate": 7.499999999999999e-06, "loss": 0.0611, "num_tokens": 58034563.0, "reward": 3.108828544616699, "reward_std": 0.05664081871509552, "rewards/reward_fn/mean": 3.108828544616699, "rewards/reward_fn/std": 0.056640833616256714, "step": 1251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1172.0, "completions/max_terminated_length": 1172.0, "completions/mean_length": 227.5625, "completions/mean_terminated_length": 227.5625, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 0.13281001379017715, "frac_reward_zero_std": 0.0, "grad_norm": 2.234375, "kl": 0.027604984818026423, "learning_rate": 7.499599999999999e-06, "loss": 0.1806, "num_tokens": 58079445.0, "reward": 3.8578529357910156, "reward_std": 0.475358247756958, "rewards/reward_fn/mean": 3.8578529357910156, "rewards/reward_fn/std": 0.4753582775592804, "step": 1252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 782.0, "completions/max_terminated_length": 782.0, "completions/mean_length": 311.09375, "completions/mean_terminated_length": 311.09375, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "epoch": 0.13291609207595206, "frac_reward_zero_std": 0.0, "grad_norm": 1.6015625, "kl": 0.01987571455538273, "learning_rate": 7.4992e-06, "loss": 0.0234, "num_tokens": 58131000.0, "reward": 3.4816336631774902, "reward_std": 0.716206431388855, "rewards/reward_fn/mean": 3.4816336631774902, "rewards/reward_fn/std": 0.716206431388855, "step": 1253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1183.0, "completions/max_terminated_length": 1183.0, "completions/mean_length": 362.34375, "completions/mean_terminated_length": 362.34375, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "epoch": 0.13302217036172695, "frac_reward_zero_std": 0.0, "grad_norm": 1.6328125, "kl": 0.027468676446005702, "learning_rate": 7.4988e-06, "loss": 0.014, "num_tokens": 58186275.0, "reward": 3.3233084678649902, "reward_std": 0.533751368522644, "rewards/reward_fn/mean": 3.3233084678649902, "rewards/reward_fn/std": 0.533751368522644, "step": 1254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 579.0, "completions/max_terminated_length": 579.0, "completions/mean_length": 285.59375, "completions/mean_terminated_length": 285.59375, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "epoch": 0.13312824864750186, "frac_reward_zero_std": 0.0, "grad_norm": 1.9453125, "kl": 0.02612379170022905, "learning_rate": 7.4984e-06, "loss": -0.0235, "num_tokens": 58236950.0, "reward": 3.6018028259277344, "reward_std": 0.6388559341430664, "rewards/reward_fn/mean": 3.6018028259277344, "rewards/reward_fn/std": 0.6388558745384216, "step": 1255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 781.0, "completions/max_terminated_length": 781.0, "completions/mean_length": 199.28125, "completions/mean_terminated_length": 199.28125, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.13323432693327675, "frac_reward_zero_std": 0.0, "grad_norm": 1.8046875, "kl": 0.027978347148746252, "learning_rate": 7.498e-06, "loss": 0.0033, "num_tokens": 58273119.0, "reward": 2.9400501251220703, "reward_std": 0.0498286671936512, "rewards/reward_fn/mean": 2.9400501251220703, "rewards/reward_fn/std": 0.04982864111661911, "step": 1256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 289.0, "completions/max_terminated_length": 289.0, "completions/mean_length": 166.90625, "completions/mean_terminated_length": 166.90625, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 0.13334040521905166, "frac_reward_zero_std": 0.0, "grad_norm": 2.3125, "kl": 0.027146222535520792, "learning_rate": 7.4976e-06, "loss": -0.0098, "num_tokens": 58309596.0, "reward": 2.928208112716675, "reward_std": 0.5002910494804382, "rewards/reward_fn/mean": 2.928208112716675, "rewards/reward_fn/std": 0.5002910494804382, "step": 1257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 877.0, "completions/max_terminated_length": 877.0, "completions/mean_length": 172.46875, "completions/mean_terminated_length": 172.46875, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "epoch": 0.13344648350482657, "frac_reward_zero_std": 0.0, "grad_norm": 2.109375, "kl": 0.026685474440455437, "learning_rate": 7.4972e-06, "loss": 0.1586, "num_tokens": 58352171.0, "reward": 3.931187629699707, "reward_std": 0.3892618715763092, "rewards/reward_fn/mean": 3.931187629699707, "rewards/reward_fn/std": 0.3892618715763092, "step": 1258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 839.0, "completions/max_terminated_length": 839.0, "completions/mean_length": 243.625, "completions/mean_terminated_length": 243.625, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "epoch": 0.13355256179060146, "frac_reward_zero_std": 0.0, "grad_norm": 0.8671875, "kl": 0.0268127650488168, "learning_rate": 7.4968e-06, "loss": -0.1437, "num_tokens": 58390175.0, "reward": 3.875, "reward_std": 0.7071067690849304, "rewards/reward_fn/mean": 3.875, "rewards/reward_fn/std": 0.7071067690849304, "step": 1259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 780.0, "completions/max_terminated_length": 780.0, "completions/mean_length": 240.625, "completions/mean_terminated_length": 240.625, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 0.13365864007637637, "frac_reward_zero_std": 0.0, "grad_norm": 2.34375, "kl": 0.03086657985113561, "learning_rate": 7.4964e-06, "loss": 0.0395, "num_tokens": 58439379.0, "reward": 3.4151363372802734, "reward_std": 0.5244680643081665, "rewards/reward_fn/mean": 3.4151363372802734, "rewards/reward_fn/std": 0.5244680643081665, "step": 1260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1492.0, "completions/max_terminated_length": 1492.0, "completions/mean_length": 280.65625, "completions/mean_terminated_length": 280.65625, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 0.13376471836215126, "frac_reward_zero_std": 1.0, "grad_norm": 0.12890625, "kl": 0.0308777317404747, "learning_rate": 7.496e-06, "loss": 0.0012, "num_tokens": 58485320.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 363.0, "completions/max_terminated_length": 363.0, "completions/mean_length": 236.625, "completions/mean_terminated_length": 236.625, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 0.13387079664792617, "frac_reward_zero_std": 0.0, "grad_norm": 2.03125, "kl": 0.026496544247493148, "learning_rate": 7.495599999999999e-06, "loss": 0.0164, "num_tokens": 58542108.0, "reward": 3.928636074066162, "reward_std": 0.403695672750473, "rewards/reward_fn/mean": 3.928636074066162, "rewards/reward_fn/std": 0.403695672750473, "step": 1262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 370.0, "completions/max_terminated_length": 370.0, "completions/mean_length": 192.34375, "completions/mean_terminated_length": 192.34375, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "epoch": 0.13397687493370108, "frac_reward_zero_std": 0.0, "grad_norm": 2.078125, "kl": 0.02878828765824437, "learning_rate": 7.495199999999999e-06, "loss": 0.0331, "num_tokens": 58591783.0, "reward": 3.8793630599975586, "reward_std": 0.3811015188694, "rewards/reward_fn/mean": 3.8793630599975586, "rewards/reward_fn/std": 0.3811015486717224, "step": 1263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 901.0, "completions/max_terminated_length": 901.0, "completions/mean_length": 282.125, "completions/mean_terminated_length": 282.125, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "epoch": 0.13408295321947597, "frac_reward_zero_std": 1.0, "grad_norm": 0.1142578125, "kl": 0.024802331812679768, "learning_rate": 7.494799999999999e-06, "loss": 0.001, "num_tokens": 58633995.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 510.0, "completions/max_terminated_length": 510.0, "completions/mean_length": 201.53125, "completions/mean_terminated_length": 201.53125, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 0.13418903150525088, "frac_reward_zero_std": 0.0, "grad_norm": 2.375, "kl": 0.03025827999226749, "learning_rate": 7.4944e-06, "loss": 0.0333, "num_tokens": 58664124.0, "reward": 3.811546564102173, "reward_std": 0.534115731716156, "rewards/reward_fn/mean": 3.811546564102173, "rewards/reward_fn/std": 0.534115731716156, "step": 1265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 829.0, "completions/max_terminated_length": 829.0, "completions/mean_length": 377.78125, "completions/mean_terminated_length": 377.78125, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 0.13429510979102577, "frac_reward_zero_std": 0.0, "grad_norm": 1.7578125, "kl": 0.03353354521095753, "learning_rate": 7.494e-06, "loss": 0.0983, "num_tokens": 58715669.0, "reward": 2.4701292514801025, "reward_std": 0.6296804547309875, "rewards/reward_fn/mean": 2.4701292514801025, "rewards/reward_fn/std": 0.6296803951263428, "step": 1266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 498.0, "completions/max_terminated_length": 498.0, "completions/mean_length": 308.3125, "completions/mean_terminated_length": 308.3125, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.13440118807680068, "frac_reward_zero_std": 0.0, "grad_norm": 1.625, "kl": 0.026489258743822575, "learning_rate": 7.4936e-06, "loss": 0.0136, "num_tokens": 58760319.0, "reward": 3.2096259593963623, "reward_std": 0.4842040240764618, "rewards/reward_fn/mean": 3.2096259593963623, "rewards/reward_fn/std": 0.4842039942741394, "step": 1267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 483.0, "completions/max_terminated_length": 483.0, "completions/mean_length": 148.53125, "completions/mean_terminated_length": 148.53125, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "epoch": 0.13450726636257557, "frac_reward_zero_std": 1.0, "grad_norm": 0.140625, "kl": 0.02966041606850922, "learning_rate": 7.4932e-06, "loss": 0.0012, "num_tokens": 58790704.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 465.0, "completions/max_terminated_length": 465.0, "completions/mean_length": 283.28125, "completions/mean_terminated_length": 283.28125, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 0.13461334464835048, "frac_reward_zero_std": 0.0, "grad_norm": 1.53125, "kl": 0.030821983935311437, "learning_rate": 7.4928e-06, "loss": -0.0253, "num_tokens": 58820473.0, "reward": 3.7869110107421875, "reward_std": 0.49617645144462585, "rewards/reward_fn/mean": 3.7869110107421875, "rewards/reward_fn/std": 0.49617645144462585, "step": 1269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 522.0, "completions/max_terminated_length": 522.0, "completions/mean_length": 303.4375, "completions/mean_terminated_length": 303.4375, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 0.1347194229341254, "frac_reward_zero_std": 0.0, "grad_norm": 1.3125, "kl": 0.01821422518696636, "learning_rate": 7.4924e-06, "loss": 0.0152, "num_tokens": 58866439.0, "reward": 2.632956027984619, "reward_std": 0.3445678651332855, "rewards/reward_fn/mean": 2.632956027984619, "rewards/reward_fn/std": 0.3445678651332855, "step": 1270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 722.0, "completions/max_terminated_length": 722.0, "completions/mean_length": 402.5625, "completions/mean_terminated_length": 402.5625, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "epoch": 0.13482550121990028, "frac_reward_zero_std": 0.0, "grad_norm": 1.4375, "kl": 0.02504706336185336, "learning_rate": 7.492e-06, "loss": -0.0248, "num_tokens": 58941273.0, "reward": 3.4776172637939453, "reward_std": 0.641873836517334, "rewards/reward_fn/mean": 3.4776172637939453, "rewards/reward_fn/std": 0.641873836517334, "step": 1271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 207.0, "completions/max_terminated_length": 207.0, "completions/mean_length": 144.8125, "completions/mean_terminated_length": 144.8125, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.1349315795056752, "frac_reward_zero_std": 1.0, "grad_norm": 0.1328125, "kl": 0.02622751286253333, "learning_rate": 7.4915999999999996e-06, "loss": 0.001, "num_tokens": 58969779.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 700.0, "completions/max_terminated_length": 700.0, "completions/mean_length": 367.40625, "completions/mean_terminated_length": 367.40625, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "epoch": 0.13503765779145008, "frac_reward_zero_std": 0.0, "grad_norm": 1.328125, "kl": 0.024884506594389677, "learning_rate": 7.4911999999999995e-06, "loss": -0.0294, "num_tokens": 59026048.0, "reward": 2.75225830078125, "reward_std": 0.27699360251426697, "rewards/reward_fn/mean": 2.75225830078125, "rewards/reward_fn/std": 0.27699360251426697, "step": 1273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 535.0, "completions/max_terminated_length": 535.0, "completions/mean_length": 177.15625, "completions/mean_terminated_length": 177.15625, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "epoch": 0.135143736077225, "frac_reward_zero_std": 1.0, "grad_norm": 0.11181640625, "kl": 0.02919724863022566, "learning_rate": 7.4907999999999995e-06, "loss": 0.0012, "num_tokens": 59062149.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 841.0, "completions/max_terminated_length": 841.0, "completions/mean_length": 512.53125, "completions/mean_terminated_length": 512.53125, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "epoch": 0.1352498143629999, "frac_reward_zero_std": 0.0, "grad_norm": 1.15625, "kl": 0.017439111368730664, "learning_rate": 7.4903999999999995e-06, "loss": -0.0313, "num_tokens": 59114102.0, "reward": 2.5705909729003906, "reward_std": 0.33944466710090637, "rewards/reward_fn/mean": 2.5705909729003906, "rewards/reward_fn/std": 0.33944469690322876, "step": 1275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1666.0, "completions/max_terminated_length": 1666.0, "completions/mean_length": 330.25, "completions/mean_terminated_length": 330.25, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.1353558926487748, "frac_reward_zero_std": 0.0, "grad_norm": 1.7734375, "kl": 0.025731251807883382, "learning_rate": 7.49e-06, "loss": 0.0415, "num_tokens": 59176222.0, "reward": 3.0980734825134277, "reward_std": 0.9913315176963806, "rewards/reward_fn/mean": 3.0980734825134277, "rewards/reward_fn/std": 0.9913315176963806, "step": 1276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 543.0, "completions/max_terminated_length": 543.0, "completions/mean_length": 137.84375, "completions/mean_terminated_length": 137.84375, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.1354619709345497, "frac_reward_zero_std": 1.0, "grad_norm": 0.1357421875, "kl": 0.025162109872326255, "learning_rate": 7.4896e-06, "loss": 0.001, "num_tokens": 59223577.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 904.0, "completions/max_terminated_length": 904.0, "completions/mean_length": 290.25, "completions/mean_terminated_length": 290.25, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 0.1355680492203246, "frac_reward_zero_std": 0.0, "grad_norm": 1.8828125, "kl": 0.024209644412621856, "learning_rate": 7.4892e-06, "loss": 0.0241, "num_tokens": 59251009.0, "reward": 2.991244077682495, "reward_std": 0.04524612799286842, "rewards/reward_fn/mean": 2.991244077682495, "rewards/reward_fn/std": 0.04524614289402962, "step": 1278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 416.0, "completions/max_terminated_length": 416.0, "completions/mean_length": 273.5, "completions/mean_terminated_length": 273.5, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "epoch": 0.1356741275060995, "frac_reward_zero_std": 0.0, "grad_norm": 1.578125, "kl": 0.023455350194126368, "learning_rate": 7.4888e-06, "loss": 0.0716, "num_tokens": 59296689.0, "reward": 3.328552722930908, "reward_std": 0.5668790936470032, "rewards/reward_fn/mean": 3.328552722930908, "rewards/reward_fn/std": 0.5668790936470032, "step": 1279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 292.0, "completions/max_terminated_length": 292.0, "completions/mean_length": 204.96875, "completions/mean_terminated_length": 204.96875, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 0.13578020579187441, "frac_reward_zero_std": 0.0, "grad_norm": 1.8046875, "kl": 0.024695158703252673, "learning_rate": 7.4884e-06, "loss": 0.0307, "num_tokens": 59339440.0, "reward": 3.457145929336548, "reward_std": 0.5878257751464844, "rewards/reward_fn/mean": 3.457145929336548, "rewards/reward_fn/std": 0.5878257155418396, "step": 1280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 787.0, "completions/max_terminated_length": 787.0, "completions/mean_length": 205.71875, "completions/mean_terminated_length": 205.71875, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 0.1358862840776493, "frac_reward_zero_std": 0.0, "grad_norm": 2.078125, "kl": 0.025261733680963516, "learning_rate": 7.488e-06, "loss": -0.0178, "num_tokens": 59368871.0, "reward": 2.85054349899292, "reward_std": 0.06259602308273315, "rewards/reward_fn/mean": 2.85054349899292, "rewards/reward_fn/std": 0.06259601563215256, "step": 1281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 526.0, "completions/max_terminated_length": 526.0, "completions/mean_length": 172.75, "completions/mean_terminated_length": 172.75, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "epoch": 0.1359923623634242, "frac_reward_zero_std": 0.0, "grad_norm": 2.21875, "kl": 0.02319705649279058, "learning_rate": 7.487599999999999e-06, "loss": 0.038, "num_tokens": 59405055.0, "reward": 3.5303797721862793, "reward_std": 0.5783197283744812, "rewards/reward_fn/mean": 3.5303797721862793, "rewards/reward_fn/std": 0.5783197283744812, "step": 1282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 912.0, "completions/max_terminated_length": 912.0, "completions/mean_length": 354.125, "completions/mean_terminated_length": 354.125, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 0.1360984406491991, "frac_reward_zero_std": 0.0, "grad_norm": 1.953125, "kl": 0.03816063771955669, "learning_rate": 7.487199999999999e-06, "loss": 0.0339, "num_tokens": 59451427.0, "reward": 2.811558723449707, "reward_std": 1.0576351881027222, "rewards/reward_fn/mean": 2.811558723449707, "rewards/reward_fn/std": 1.0576351881027222, "step": 1283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 355.0, "completions/max_terminated_length": 355.0, "completions/mean_length": 219.96875, "completions/mean_terminated_length": 219.96875, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 0.136204518934974, "frac_reward_zero_std": 0.0, "grad_norm": 2.5, "kl": 0.018938904628157616, "learning_rate": 7.486799999999999e-06, "loss": 0.0891, "num_tokens": 59491266.0, "reward": 3.9295401573181152, "reward_std": 0.3985815644264221, "rewards/reward_fn/mean": 3.9295401573181152, "rewards/reward_fn/std": 0.39858150482177734, "step": 1284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 522.0, "completions/max_terminated_length": 522.0, "completions/mean_length": 205.3125, "completions/mean_terminated_length": 205.3125, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.13631059722074892, "frac_reward_zero_std": 0.0, "grad_norm": 2.90625, "kl": 0.025979549856856465, "learning_rate": 7.486399999999999e-06, "loss": 0.1455, "num_tokens": 59569580.0, "reward": 3.9671754837036133, "reward_std": 0.1856841892004013, "rewards/reward_fn/mean": 3.9671754837036133, "rewards/reward_fn/std": 0.1856841742992401, "step": 1285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 437.0, "completions/max_terminated_length": 437.0, "completions/mean_length": 118.34375, "completions/mean_terminated_length": 118.34375, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 0.1364166755065238, "frac_reward_zero_std": 1.0, "grad_norm": 0.123046875, "kl": 0.025476978393271565, "learning_rate": 7.485999999999999e-06, "loss": 0.001, "num_tokens": 59606135.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1040.0, "completions/max_terminated_length": 1040.0, "completions/mean_length": 259.28125, "completions/mean_terminated_length": 259.28125, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 0.13652275379229872, "frac_reward_zero_std": 0.0, "grad_norm": 1.5546875, "kl": 0.028418211033567786, "learning_rate": 7.485599999999999e-06, "loss": 0.1128, "num_tokens": 59657664.0, "reward": 3.1655662059783936, "reward_std": 0.32365211844444275, "rewards/reward_fn/mean": 3.1655662059783936, "rewards/reward_fn/std": 0.32365208864212036, "step": 1287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 498.0, "completions/max_terminated_length": 498.0, "completions/mean_length": 261.03125, "completions/mean_terminated_length": 261.03125, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 0.1366288320780736, "frac_reward_zero_std": 0.0, "grad_norm": 2.0, "kl": 0.027075995225459337, "learning_rate": 7.4852e-06, "loss": 0.0743, "num_tokens": 59699745.0, "reward": 3.0787363052368164, "reward_std": 0.5415990948677063, "rewards/reward_fn/mean": 3.0787363052368164, "rewards/reward_fn/std": 0.5415990948677063, "step": 1288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 844.0, "completions/max_terminated_length": 844.0, "completions/mean_length": 228.78125, "completions/mean_terminated_length": 228.78125, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "epoch": 0.13673491036384852, "frac_reward_zero_std": 0.0, "grad_norm": 1.875, "kl": 0.0208129589445889, "learning_rate": 7.4848e-06, "loss": 0.0565, "num_tokens": 59754874.0, "reward": 2.737701177597046, "reward_std": 0.17712976038455963, "rewards/reward_fn/mean": 2.737701177597046, "rewards/reward_fn/std": 0.17712976038455963, "step": 1289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 939.0, "completions/max_terminated_length": 939.0, "completions/mean_length": 359.625, "completions/mean_terminated_length": 359.625, "completions/min_length": 243.0, "completions/min_terminated_length": 243.0, "epoch": 0.13684098864962344, "frac_reward_zero_std": 1.0, "grad_norm": 0.060791015625, "kl": 0.019323077285662293, "learning_rate": 7.4844e-06, "loss": 0.0008, "num_tokens": 59804078.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 891.0, "completions/max_terminated_length": 891.0, "completions/mean_length": 244.53125, "completions/mean_terminated_length": 244.53125, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 0.13694706693539832, "frac_reward_zero_std": 0.0, "grad_norm": 2.296875, "kl": 0.02193894237279892, "learning_rate": 7.484e-06, "loss": -0.012, "num_tokens": 59849343.0, "reward": 3.662727117538452, "reward_std": 0.8044856190681458, "rewards/reward_fn/mean": 3.662727117538452, "rewards/reward_fn/std": 0.804485559463501, "step": 1291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 137.0, "completions/max_terminated_length": 137.0, "completions/mean_length": 101.15625, "completions/mean_terminated_length": 101.15625, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "epoch": 0.13705314522117323, "frac_reward_zero_std": 0.0, "grad_norm": 2.921875, "kl": 0.024025865364819765, "learning_rate": 7.4836e-06, "loss": 0.083, "num_tokens": 59890820.0, "reward": 3.874630928039551, "reward_std": 0.3960571885108948, "rewards/reward_fn/mean": 3.874630928039551, "rewards/reward_fn/std": 0.3960571885108948, "step": 1292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 614.0, "completions/max_terminated_length": 614.0, "completions/mean_length": 192.78125, "completions/mean_terminated_length": 192.78125, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 0.13715922350694812, "frac_reward_zero_std": 1.0, "grad_norm": 0.1064453125, "kl": 0.022730932221747935, "learning_rate": 7.4832e-06, "loss": 0.0009, "num_tokens": 59930685.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 336.0, "completions/max_terminated_length": 336.0, "completions/mean_length": 175.3125, "completions/mean_terminated_length": 175.3125, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 0.13726530179272303, "frac_reward_zero_std": 1.0, "grad_norm": 0.10595703125, "kl": 0.024798734579235315, "learning_rate": 7.4828e-06, "loss": 0.001, "num_tokens": 59966855.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 668.0, "completions/max_terminated_length": 668.0, "completions/mean_length": 208.1875, "completions/mean_terminated_length": 208.1875, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "epoch": 0.13737138007849792, "frac_reward_zero_std": 0.0, "grad_norm": 1.828125, "kl": 0.022808075416833162, "learning_rate": 7.4824e-06, "loss": 0.0435, "num_tokens": 60016717.0, "reward": 1.9026248455047607, "reward_std": 0.4392467141151428, "rewards/reward_fn/mean": 1.9026248455047607, "rewards/reward_fn/std": 0.43924665451049805, "step": 1295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1085.0, "completions/max_terminated_length": 1085.0, "completions/mean_length": 309.0625, "completions/mean_terminated_length": 309.0625, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 0.13747745836427283, "frac_reward_zero_std": 0.0, "grad_norm": 1.7109375, "kl": 0.023748031351715326, "learning_rate": 7.482e-06, "loss": 0.0011, "num_tokens": 60077295.0, "reward": 3.4045333862304688, "reward_std": 0.6057592630386353, "rewards/reward_fn/mean": 3.4045333862304688, "rewards/reward_fn/std": 0.60575932264328, "step": 1296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1140.0, "completions/max_terminated_length": 1140.0, "completions/mean_length": 183.625, "completions/mean_terminated_length": 183.625, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.13758353665004774, "frac_reward_zero_std": 0.0, "grad_norm": 2.90625, "kl": 0.03300858614966273, "learning_rate": 7.4816e-06, "loss": 0.0822, "num_tokens": 60100131.0, "reward": 2.8976831436157227, "reward_std": 0.07260072231292725, "rewards/reward_fn/mean": 2.8976831436157227, "rewards/reward_fn/std": 0.07260074466466904, "step": 1297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1612.0, "completions/max_terminated_length": 1612.0, "completions/mean_length": 478.875, "completions/mean_terminated_length": 478.875, "completions/min_length": 279.0, "completions/min_terminated_length": 279.0, "epoch": 0.13768961493582263, "frac_reward_zero_std": 0.0, "grad_norm": 1.4921875, "kl": 0.02038547326810658, "learning_rate": 7.4812e-06, "loss": 0.0507, "num_tokens": 60155679.0, "reward": 2.7392239570617676, "reward_std": 0.3335033655166626, "rewards/reward_fn/mean": 2.7392239570617676, "rewards/reward_fn/std": 0.3335033059120178, "step": 1298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 779.0, "completions/max_terminated_length": 779.0, "completions/mean_length": 351.28125, "completions/mean_terminated_length": 351.28125, "completions/min_length": 256.0, "completions/min_terminated_length": 256.0, "epoch": 0.13779569322159754, "frac_reward_zero_std": 0.0, "grad_norm": 1.484375, "kl": 0.017921574064530432, "learning_rate": 7.4808e-06, "loss": -0.0526, "num_tokens": 60211464.0, "reward": 2.7742135524749756, "reward_std": 0.037488196045160294, "rewards/reward_fn/mean": 2.7742135524749756, "rewards/reward_fn/std": 0.037488240748643875, "step": 1299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 566.0, "completions/max_terminated_length": 566.0, "completions/mean_length": 391.25, "completions/mean_terminated_length": 391.25, "completions/min_length": 282.0, "completions/min_terminated_length": 282.0, "epoch": 0.13790177150737243, "frac_reward_zero_std": 0.0, "grad_norm": 1.1796875, "kl": 0.022719235508702695, "learning_rate": 7.4804e-06, "loss": 0.0432, "num_tokens": 60258448.0, "reward": 3.5938987731933594, "reward_std": 0.6974779367446899, "rewards/reward_fn/mean": 3.5938987731933594, "rewards/reward_fn/std": 0.6974778771400452, "step": 1300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 156.0, "completions/max_terminated_length": 156.0, "completions/mean_length": 94.6875, "completions/mean_terminated_length": 94.6875, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "epoch": 0.13800784979314734, "frac_reward_zero_std": 1.0, "grad_norm": 0.1533203125, "kl": 0.027633204823359847, "learning_rate": 7.48e-06, "loss": 0.0011, "num_tokens": 60320838.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 751.0, "completions/max_terminated_length": 751.0, "completions/mean_length": 201.4375, "completions/mean_terminated_length": 201.4375, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.13811392807892225, "frac_reward_zero_std": 1.0, "grad_norm": 0.1416015625, "kl": 0.029265858931466937, "learning_rate": 7.4795999999999995e-06, "loss": 0.0012, "num_tokens": 60363828.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 169.0, "completions/max_terminated_length": 169.0, "completions/mean_length": 116.25, "completions/mean_terminated_length": 116.25, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "epoch": 0.13822000636469714, "frac_reward_zero_std": 1.0, "grad_norm": 0.1552734375, "kl": 0.026726818876340985, "learning_rate": 7.4791999999999995e-06, "loss": 0.0011, "num_tokens": 60398460.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 476.0, "completions/max_terminated_length": 476.0, "completions/mean_length": 111.375, "completions/mean_terminated_length": 111.375, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "epoch": 0.13832608465047205, "frac_reward_zero_std": 1.0, "grad_norm": 0.12353515625, "kl": 0.02441283850930631, "learning_rate": 7.4787999999999994e-06, "loss": 0.001, "num_tokens": 60439816.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 590.0, "completions/max_terminated_length": 590.0, "completions/mean_length": 100.9375, "completions/mean_terminated_length": 100.9375, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "epoch": 0.13843216293624694, "frac_reward_zero_std": 0.0, "grad_norm": 3.515625, "kl": 0.03212942089885473, "learning_rate": 7.478399999999999e-06, "loss": -0.0653, "num_tokens": 60477830.0, "reward": 2.844021797180176, "reward_std": 0.21652944386005402, "rewards/reward_fn/mean": 2.844021797180176, "rewards/reward_fn/std": 0.21652939915657043, "step": 1305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 703.0, "completions/max_terminated_length": 703.0, "completions/mean_length": 373.34375, "completions/mean_terminated_length": 373.34375, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "epoch": 0.13853824122202185, "frac_reward_zero_std": 0.0, "grad_norm": 1.3984375, "kl": 0.02917315112426877, "learning_rate": 7.477999999999999e-06, "loss": -0.0192, "num_tokens": 60510897.0, "reward": 3.6083593368530273, "reward_std": 0.5928052067756653, "rewards/reward_fn/mean": 3.6083593368530273, "rewards/reward_fn/std": 0.5928052067756653, "step": 1306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1326.0, "completions/mean_length": 453.25, "completions/mean_terminated_length": 401.8064270019531, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 0.13864431950779676, "frac_reward_zero_std": 0.0, "grad_norm": 1.46875, "kl": 0.02317165257409215, "learning_rate": 7.477599999999999e-06, "loss": 0.1673, "num_tokens": 60555865.0, "reward": 2.9343154430389404, "reward_std": 0.6307179927825928, "rewards/reward_fn/mean": 2.9343154430389404, "rewards/reward_fn/std": 0.6307179927825928, "step": 1307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1059.0, "completions/max_terminated_length": 1059.0, "completions/mean_length": 324.1875, "completions/mean_terminated_length": 324.1875, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.13875039779357165, "frac_reward_zero_std": 0.0, "grad_norm": 1.796875, "kl": 0.027127522975206375, "learning_rate": 7.477199999999999e-06, "loss": 0.0713, "num_tokens": 60586719.0, "reward": 2.7872352600097656, "reward_std": 0.6089009642601013, "rewards/reward_fn/mean": 2.7872352600097656, "rewards/reward_fn/std": 0.6089009642601013, "step": 1308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1908.0, "completions/max_terminated_length": 1908.0, "completions/mean_length": 667.625, "completions/mean_terminated_length": 667.625, "completions/min_length": 350.0, "completions/min_terminated_length": 350.0, "epoch": 0.13885647607934656, "frac_reward_zero_std": 0.0, "grad_norm": 1.2109375, "kl": 0.01850100071169436, "learning_rate": 7.476799999999999e-06, "loss": 0.0534, "num_tokens": 60646515.0, "reward": 2.3431787490844727, "reward_std": 0.45147812366485596, "rewards/reward_fn/mean": 2.3431787490844727, "rewards/reward_fn/std": 0.45147812366485596, "step": 1309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 706.0, "completions/max_terminated_length": 706.0, "completions/mean_length": 328.8125, "completions/mean_terminated_length": 328.8125, "completions/min_length": 238.0, "completions/min_terminated_length": 238.0, "epoch": 0.13896255436512145, "frac_reward_zero_std": 0.0, "grad_norm": 1.484375, "kl": 0.020504693733528256, "learning_rate": 7.476399999999999e-06, "loss": 0.0613, "num_tokens": 60691949.0, "reward": 1.6473358869552612, "reward_std": 0.04363499581813812, "rewards/reward_fn/mean": 1.6473358869552612, "rewards/reward_fn/std": 0.04363495483994484, "step": 1310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 317.0, "completions/max_terminated_length": 317.0, "completions/mean_length": 226.5, "completions/mean_terminated_length": 226.5, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 0.13906863265089636, "frac_reward_zero_std": 0.0, "grad_norm": 2.046875, "kl": 0.02825233223848045, "learning_rate": 7.475999999999999e-06, "loss": -0.0526, "num_tokens": 60732285.0, "reward": 2.9058597087860107, "reward_std": 0.06823138147592545, "rewards/reward_fn/mean": 2.9058597087860107, "rewards/reward_fn/std": 0.06823134422302246, "step": 1311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 572.0, "completions/max_terminated_length": 572.0, "completions/mean_length": 247.03125, "completions/mean_terminated_length": 247.03125, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.13917471093667128, "frac_reward_zero_std": 0.0, "grad_norm": 1.65625, "kl": 0.021199521608650684, "learning_rate": 7.4756e-06, "loss": 0.1534, "num_tokens": 60771326.0, "reward": 2.8149824142456055, "reward_std": 0.04035944491624832, "rewards/reward_fn/mean": 2.8149824142456055, "rewards/reward_fn/std": 0.04035947099328041, "step": 1312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 220.71875, "completions/mean_terminated_length": 220.71875, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "epoch": 0.13928078922244616, "frac_reward_zero_std": 0.0, "grad_norm": 1.625, "kl": 0.020940072368830442, "learning_rate": 7.4752e-06, "loss": 0.0157, "num_tokens": 60827989.0, "reward": 2.911080837249756, "reward_std": 0.20522548258304596, "rewards/reward_fn/mean": 2.911080837249756, "rewards/reward_fn/std": 0.20522546768188477, "step": 1313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 330.0, "completions/max_terminated_length": 330.0, "completions/mean_length": 196.90625, "completions/mean_terminated_length": 196.90625, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 0.13938686750822107, "frac_reward_zero_std": 0.0, "grad_norm": 2.40625, "kl": 0.030910142930224538, "learning_rate": 7.4748e-06, "loss": -0.0081, "num_tokens": 60851026.0, "reward": 3.170276403427124, "reward_std": 0.3668881952762604, "rewards/reward_fn/mean": 3.170276403427124, "rewards/reward_fn/std": 0.3668881952762604, "step": 1314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 408.0, "completions/max_terminated_length": 408.0, "completions/mean_length": 275.0, "completions/mean_terminated_length": 275.0, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 0.13949294579399596, "frac_reward_zero_std": 0.0, "grad_norm": 1.8125, "kl": 0.025190659100189805, "learning_rate": 7.4744e-06, "loss": 0.0444, "num_tokens": 60897106.0, "reward": 2.8724751472473145, "reward_std": 0.056788910180330276, "rewards/reward_fn/mean": 2.8724751472473145, "rewards/reward_fn/std": 0.05678891763091087, "step": 1315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 868.0, "completions/mean_length": 426.65625, "completions/mean_terminated_length": 374.3548278808594, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "epoch": 0.13959902407977087, "frac_reward_zero_std": 0.0, "grad_norm": 1.5390625, "kl": 0.02236508228816092, "learning_rate": 7.474e-06, "loss": 0.1468, "num_tokens": 60950503.0, "reward": 2.9462509155273438, "reward_std": 0.50715571641922, "rewards/reward_fn/mean": 2.9462509155273438, "rewards/reward_fn/std": 0.5071556568145752, "step": 1316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 646.0, "completions/max_terminated_length": 646.0, "completions/mean_length": 139.5625, "completions/mean_terminated_length": 139.5625, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "epoch": 0.13970510236554579, "frac_reward_zero_std": 1.0, "grad_norm": 0.14453125, "kl": 0.03153642802499235, "learning_rate": 7.4736e-06, "loss": 0.0013, "num_tokens": 60990713.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 309.0, "completions/max_terminated_length": 309.0, "completions/mean_length": 201.34375, "completions/mean_terminated_length": 201.34375, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "epoch": 0.13981118065132067, "frac_reward_zero_std": 0.0, "grad_norm": 2.140625, "kl": 0.029545513913035393, "learning_rate": 7.4732e-06, "loss": 0.0299, "num_tokens": 61044836.0, "reward": 2.9463093280792236, "reward_std": 0.05201994255185127, "rewards/reward_fn/mean": 2.9463093280792236, "rewards/reward_fn/std": 0.052019957453012466, "step": 1318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 687.0, "completions/max_terminated_length": 687.0, "completions/mean_length": 283.15625, "completions/mean_terminated_length": 283.15625, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "epoch": 0.13991725893709558, "frac_reward_zero_std": 0.0, "grad_norm": 1.625, "kl": 0.024044741643592715, "learning_rate": 7.4728e-06, "loss": -0.0858, "num_tokens": 61075273.0, "reward": 2.9099972248077393, "reward_std": 0.876579761505127, "rewards/reward_fn/mean": 2.9099972248077393, "rewards/reward_fn/std": 0.8765797019004822, "step": 1319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 581.0, "completions/max_terminated_length": 581.0, "completions/mean_length": 148.71875, "completions/mean_terminated_length": 148.71875, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "epoch": 0.14002333722287047, "frac_reward_zero_std": 0.0, "grad_norm": 1.7421875, "kl": 0.015515311155468225, "learning_rate": 7.4724e-06, "loss": 0.0582, "num_tokens": 61122720.0, "reward": 2.7426884174346924, "reward_std": 0.03458679839968681, "rewards/reward_fn/mean": 2.7426884174346924, "rewards/reward_fn/std": 0.034586794674396515, "step": 1320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 520.0, "completions/max_terminated_length": 520.0, "completions/mean_length": 128.75, "completions/mean_terminated_length": 128.75, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "epoch": 0.14012941550864538, "frac_reward_zero_std": 1.0, "grad_norm": 0.11181640625, "kl": 0.02550445985980332, "learning_rate": 7.472e-06, "loss": 0.001, "num_tokens": 61180632.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 670.0, "completions/max_terminated_length": 670.0, "completions/mean_length": 274.625, "completions/mean_terminated_length": 274.625, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "epoch": 0.14023549379442027, "frac_reward_zero_std": 0.0, "grad_norm": 1.609375, "kl": 0.031397337559610605, "learning_rate": 7.471599999999999e-06, "loss": 0.0942, "num_tokens": 61231660.0, "reward": 3.165767192840576, "reward_std": 0.5331368446350098, "rewards/reward_fn/mean": 3.165767192840576, "rewards/reward_fn/std": 0.533136785030365, "step": 1322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1335.0, "completions/max_terminated_length": 1335.0, "completions/mean_length": 357.84375, "completions/mean_terminated_length": 357.84375, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "epoch": 0.14034157208019518, "frac_reward_zero_std": 0.0, "grad_norm": 1.6015625, "kl": 0.02645385661162436, "learning_rate": 7.471199999999999e-06, "loss": 0.0052, "num_tokens": 61274439.0, "reward": 3.297321319580078, "reward_std": 1.0589929819107056, "rewards/reward_fn/mean": 3.297321319580078, "rewards/reward_fn/std": 1.0589929819107056, "step": 1323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 526.0, "completions/max_terminated_length": 526.0, "completions/mean_length": 326.40625, "completions/mean_terminated_length": 326.40625, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "epoch": 0.1404476503659701, "frac_reward_zero_std": 0.0, "grad_norm": 1.75, "kl": 0.026967764366418123, "learning_rate": 7.4708e-06, "loss": -0.0404, "num_tokens": 61319092.0, "reward": 3.5227808952331543, "reward_std": 0.747248113155365, "rewards/reward_fn/mean": 3.5227808952331543, "rewards/reward_fn/std": 0.747248113155365, "step": 1324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1725.0, "completions/max_terminated_length": 1725.0, "completions/mean_length": 355.0, "completions/mean_terminated_length": 355.0, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 0.14055372865174498, "frac_reward_zero_std": 0.0, "grad_norm": 2.109375, "kl": 0.03049283567816019, "learning_rate": 7.4704e-06, "loss": 0.1093, "num_tokens": 61363092.0, "reward": 3.722357749938965, "reward_std": 0.5337069034576416, "rewards/reward_fn/mean": 3.722357749938965, "rewards/reward_fn/std": 0.5337069034576416, "step": 1325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 226.0, "completions/max_terminated_length": 226.0, "completions/mean_length": 112.28125, "completions/mean_terminated_length": 112.28125, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 0.1406598069375199, "frac_reward_zero_std": 0.0, "grad_norm": 2.890625, "kl": 0.021817383356392384, "learning_rate": 7.47e-06, "loss": -0.0305, "num_tokens": 61409597.0, "reward": 3.9790706634521484, "reward_std": 0.11839355528354645, "rewards/reward_fn/mean": 3.9790706634521484, "rewards/reward_fn/std": 0.11839357018470764, "step": 1326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 304.0, "completions/max_terminated_length": 304.0, "completions/mean_length": 197.15625, "completions/mean_terminated_length": 197.15625, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 0.14076588522329478, "frac_reward_zero_std": 0.0, "grad_norm": 2.3125, "kl": 0.028691569808870554, "learning_rate": 7.4696e-06, "loss": 0.0097, "num_tokens": 61445026.0, "reward": 3.0395777225494385, "reward_std": 0.4235535264015198, "rewards/reward_fn/mean": 3.0395777225494385, "rewards/reward_fn/std": 0.4235535264015198, "step": 1327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 803.0, "completions/max_terminated_length": 803.0, "completions/mean_length": 205.90625, "completions/mean_terminated_length": 205.90625, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 0.1408719635090697, "frac_reward_zero_std": 0.0, "grad_norm": 1.734375, "kl": 0.024880185024812818, "learning_rate": 7.4691999999999996e-06, "loss": -0.0302, "num_tokens": 61468447.0, "reward": 3.9752869606018066, "reward_std": 0.1397986114025116, "rewards/reward_fn/mean": 3.9752869606018066, "rewards/reward_fn/std": 0.1397986114025116, "step": 1328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 790.0, "completions/max_terminated_length": 790.0, "completions/mean_length": 193.125, "completions/mean_terminated_length": 193.125, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.1409780417948446, "frac_reward_zero_std": 1.0, "grad_norm": 0.1328125, "kl": 0.03259215364232659, "learning_rate": 7.4687999999999995e-06, "loss": 0.0013, "num_tokens": 61505507.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 585.0, "completions/max_terminated_length": 585.0, "completions/mean_length": 284.15625, "completions/mean_terminated_length": 284.15625, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 0.1410841200806195, "frac_reward_zero_std": 0.0, "grad_norm": 1.9921875, "kl": 0.0419064718298614, "learning_rate": 7.4683999999999995e-06, "loss": -0.0339, "num_tokens": 61549448.0, "reward": 2.5116405487060547, "reward_std": 0.5236536860466003, "rewards/reward_fn/mean": 2.5116405487060547, "rewards/reward_fn/std": 0.5236537456512451, "step": 1330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 533.0, "completions/max_terminated_length": 533.0, "completions/mean_length": 153.84375, "completions/mean_terminated_length": 153.84375, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "epoch": 0.1411901983663944, "frac_reward_zero_std": 0.0, "grad_norm": 2.40625, "kl": 0.025083963526412845, "learning_rate": 7.4679999999999995e-06, "loss": -0.0651, "num_tokens": 61573539.0, "reward": 3.4854817390441895, "reward_std": 0.4917638599872589, "rewards/reward_fn/mean": 3.4854817390441895, "rewards/reward_fn/std": 0.4917638599872589, "step": 1331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 539.0, "completions/max_terminated_length": 539.0, "completions/mean_length": 112.65625, "completions/mean_terminated_length": 112.65625, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "epoch": 0.1412962766521693, "frac_reward_zero_std": 1.0, "grad_norm": 0.1845703125, "kl": 0.02348939247895032, "learning_rate": 7.4675999999999994e-06, "loss": 0.0009, "num_tokens": 61641112.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1583.0, "completions/max_terminated_length": 1583.0, "completions/mean_length": 600.40625, "completions/mean_terminated_length": 600.40625, "completions/min_length": 303.0, "completions/min_terminated_length": 303.0, "epoch": 0.1414023549379442, "frac_reward_zero_std": 0.0, "grad_norm": 1.171875, "kl": 0.020444039721041918, "learning_rate": 7.467199999999999e-06, "loss": -0.0601, "num_tokens": 61695973.0, "reward": 3.559945583343506, "reward_std": 0.6751914620399475, "rewards/reward_fn/mean": 3.559945583343506, "rewards/reward_fn/std": 0.6751914620399475, "step": 1333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1166.0, "completions/max_terminated_length": 1166.0, "completions/mean_length": 402.90625, "completions/mean_terminated_length": 402.90625, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 0.14150843322371912, "frac_reward_zero_std": 0.0, "grad_norm": 1.546875, "kl": 0.022967512253671885, "learning_rate": 7.466799999999999e-06, "loss": 0.0097, "num_tokens": 61746978.0, "reward": 3.1534409523010254, "reward_std": 0.715872049331665, "rewards/reward_fn/mean": 3.1534409523010254, "rewards/reward_fn/std": 0.7158719897270203, "step": 1334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 422.0, "completions/max_terminated_length": 422.0, "completions/mean_length": 230.6875, "completions/mean_terminated_length": 230.6875, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.141614511509494, "frac_reward_zero_std": 0.0, "grad_norm": 1.9921875, "kl": 0.031314355321228504, "learning_rate": 7.4664e-06, "loss": 0.0287, "num_tokens": 61796344.0, "reward": 3.0475525856018066, "reward_std": 0.18288551270961761, "rewards/reward_fn/mean": 3.0475525856018066, "rewards/reward_fn/std": 0.18288545310497284, "step": 1335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 589.0, "completions/max_terminated_length": 589.0, "completions/mean_length": 165.125, "completions/mean_terminated_length": 165.125, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.14172058979526891, "frac_reward_zero_std": 1.0, "grad_norm": 0.0908203125, "kl": 0.021358093596063554, "learning_rate": 7.466e-06, "loss": 0.0009, "num_tokens": 61823100.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1696.0, "completions/max_terminated_length": 1696.0, "completions/mean_length": 483.21875, "completions/mean_terminated_length": 483.21875, "completions/min_length": 257.0, "completions/min_terminated_length": 257.0, "epoch": 0.1418266680810438, "frac_reward_zero_std": 0.0, "grad_norm": 1.375, "kl": 0.02461231197230518, "learning_rate": 7.4656e-06, "loss": -0.0173, "num_tokens": 61868451.0, "reward": 3.531749725341797, "reward_std": 0.7565776705741882, "rewards/reward_fn/mean": 3.531749725341797, "rewards/reward_fn/std": 0.7565776705741882, "step": 1337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 870.0, "completions/max_terminated_length": 870.0, "completions/mean_length": 239.625, "completions/mean_terminated_length": 239.625, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 0.1419327463668187, "frac_reward_zero_std": 0.0, "grad_norm": 2.234375, "kl": 0.022753535537049174, "learning_rate": 7.4652e-06, "loss": 0.0128, "num_tokens": 61899671.0, "reward": 3.928053855895996, "reward_std": 0.4069896936416626, "rewards/reward_fn/mean": 3.928053855895996, "rewards/reward_fn/std": 0.4069896936416626, "step": 1338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1521.0, "completions/max_terminated_length": 1521.0, "completions/mean_length": 296.0625, "completions/mean_terminated_length": 296.0625, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "epoch": 0.14203882465259363, "frac_reward_zero_std": 0.0, "grad_norm": 1.6328125, "kl": 0.020033373264595866, "learning_rate": 7.4648e-06, "loss": -0.1692, "num_tokens": 61940665.0, "reward": 3.5547611713409424, "reward_std": 0.5535134077072144, "rewards/reward_fn/mean": 3.5547611713409424, "rewards/reward_fn/std": 0.5535133481025696, "step": 1339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 599.0, "completions/max_terminated_length": 599.0, "completions/mean_length": 273.875, "completions/mean_terminated_length": 273.875, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 0.1421449029383685, "frac_reward_zero_std": 0.0, "grad_norm": 1.703125, "kl": 0.019615008728578687, "learning_rate": 7.4644e-06, "loss": -0.0367, "num_tokens": 61986261.0, "reward": 3.111820697784424, "reward_std": 0.4804832935333252, "rewards/reward_fn/mean": 3.111820697784424, "rewards/reward_fn/std": 0.4804832339286804, "step": 1340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 573.0, "completions/max_terminated_length": 573.0, "completions/mean_length": 203.0, "completions/mean_terminated_length": 203.0, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 0.14225098122414342, "frac_reward_zero_std": 1.0, "grad_norm": 0.1103515625, "kl": 0.020947684068232775, "learning_rate": 7.464e-06, "loss": 0.0008, "num_tokens": 62027157.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 686.0, "completions/max_terminated_length": 686.0, "completions/mean_length": 392.96875, "completions/mean_terminated_length": 392.96875, "completions/min_length": 246.0, "completions/min_terminated_length": 246.0, "epoch": 0.1423570595099183, "frac_reward_zero_std": 0.0, "grad_norm": 1.375, "kl": 0.021076830103993416, "learning_rate": 7.463599999999999e-06, "loss": 0.0762, "num_tokens": 62076052.0, "reward": 2.854060173034668, "reward_std": 0.03796735033392906, "rewards/reward_fn/mean": 2.854060173034668, "rewards/reward_fn/std": 0.03796736150979996, "step": 1342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1348.0, "completions/max_terminated_length": 1348.0, "completions/mean_length": 373.6875, "completions/mean_terminated_length": 373.6875, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 0.14246313779569322, "frac_reward_zero_std": 0.0, "grad_norm": 1.5, "kl": 0.01754242356400937, "learning_rate": 7.463199999999999e-06, "loss": 0.1616, "num_tokens": 62128586.0, "reward": 2.9887051582336426, "reward_std": 0.04001903906464577, "rewards/reward_fn/mean": 2.9887051582336426, "rewards/reward_fn/std": 0.04001903906464577, "step": 1343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1015.0, "completions/max_terminated_length": 1015.0, "completions/mean_length": 348.9375, "completions/mean_terminated_length": 348.9375, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "epoch": 0.14256921608146814, "frac_reward_zero_std": 0.0, "grad_norm": 1.234375, "kl": 0.022171859396621585, "learning_rate": 7.462799999999999e-06, "loss": -0.0195, "num_tokens": 62174152.0, "reward": 3.848252296447754, "reward_std": 0.40802672505378723, "rewards/reward_fn/mean": 3.848252296447754, "rewards/reward_fn/std": 0.40802669525146484, "step": 1344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 894.0, "completions/max_terminated_length": 894.0, "completions/mean_length": 297.03125, "completions/mean_terminated_length": 297.03125, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "epoch": 0.14267529436724302, "frac_reward_zero_std": 0.0, "grad_norm": 1.8515625, "kl": 0.02330731856636703, "learning_rate": 7.462399999999999e-06, "loss": 0.1368, "num_tokens": 62215305.0, "reward": 3.5138566493988037, "reward_std": 0.5279030203819275, "rewards/reward_fn/mean": 3.5138566493988037, "rewards/reward_fn/std": 0.5279030203819275, "step": 1345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 373.0, "completions/max_terminated_length": 373.0, "completions/mean_length": 197.75, "completions/mean_terminated_length": 197.75, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 0.14278137265301794, "frac_reward_zero_std": 0.0, "grad_norm": 1.984375, "kl": 0.024088377133011818, "learning_rate": 7.461999999999999e-06, "loss": 0.1033, "num_tokens": 62256449.0, "reward": 2.7401795387268066, "reward_std": 0.04600340500473976, "rewards/reward_fn/mean": 2.7401795387268066, "rewards/reward_fn/std": 0.04600339010357857, "step": 1346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 634.0, "completions/max_terminated_length": 634.0, "completions/mean_length": 241.09375, "completions/mean_terminated_length": 241.09375, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 0.14288745093879282, "frac_reward_zero_std": 0.0, "grad_norm": 1.3984375, "kl": 0.025438385782763362, "learning_rate": 7.4616e-06, "loss": 0.0087, "num_tokens": 62301764.0, "reward": 2.7687675952911377, "reward_std": 0.29931241273880005, "rewards/reward_fn/mean": 2.7687675952911377, "rewards/reward_fn/std": 0.29931241273880005, "step": 1347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1402.0, "completions/mean_length": 411.78125, "completions/mean_terminated_length": 359.0, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "epoch": 0.14299352922456773, "frac_reward_zero_std": 0.0, "grad_norm": 1.96875, "kl": 0.02156771393492818, "learning_rate": 7.4612e-06, "loss": 0.297, "num_tokens": 62350557.0, "reward": 2.8675591945648193, "reward_std": 0.6260008215904236, "rewards/reward_fn/mean": 2.8675591945648193, "rewards/reward_fn/std": 0.626000702381134, "step": 1348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1563.0, "completions/max_terminated_length": 1563.0, "completions/mean_length": 456.625, "completions/mean_terminated_length": 456.625, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 0.14309960751034262, "frac_reward_zero_std": 0.0, "grad_norm": 1.53125, "kl": 0.025939938612282276, "learning_rate": 7.4608e-06, "loss": 0.0314, "num_tokens": 62397009.0, "reward": 2.5841550827026367, "reward_std": 0.6542167067527771, "rewards/reward_fn/mean": 2.5841550827026367, "rewards/reward_fn/std": 0.6542167067527771, "step": 1349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 607.0, "completions/max_terminated_length": 607.0, "completions/mean_length": 198.34375, "completions/mean_terminated_length": 198.34375, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.14320568579611753, "frac_reward_zero_std": 0.0, "grad_norm": 2.234375, "kl": 0.030676579335704446, "learning_rate": 7.4604e-06, "loss": 0.0779, "num_tokens": 62434076.0, "reward": 3.877383232116699, "reward_std": 0.29128482937812805, "rewards/reward_fn/mean": 3.877383232116699, "rewards/reward_fn/std": 0.29128485918045044, "step": 1350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 235.0, "completions/max_terminated_length": 235.0, "completions/mean_length": 121.3125, "completions/mean_terminated_length": 121.3125, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "epoch": 0.14331176408189245, "frac_reward_zero_std": 0.0, "grad_norm": 2.9375, "kl": 0.02987167122773826, "learning_rate": 7.46e-06, "loss": 0.0955, "num_tokens": 62480742.0, "reward": 3.9019250869750977, "reward_std": 0.3102291226387024, "rewards/reward_fn/mean": 3.9019250869750977, "rewards/reward_fn/std": 0.3102291226387024, "step": 1351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 954.0, "completions/max_terminated_length": 954.0, "completions/mean_length": 254.5, "completions/mean_terminated_length": 254.5, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 0.14341784236766733, "frac_reward_zero_std": 0.0, "grad_norm": 1.734375, "kl": 0.019354867981746793, "learning_rate": 7.4596e-06, "loss": -0.0729, "num_tokens": 62527126.0, "reward": 3.7116589546203613, "reward_std": 0.7752918601036072, "rewards/reward_fn/mean": 3.7116589546203613, "rewards/reward_fn/std": 0.7752918004989624, "step": 1352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 270.0, "completions/max_terminated_length": 270.0, "completions/mean_length": 185.0625, "completions/mean_terminated_length": 185.0625, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 0.14352392065344224, "frac_reward_zero_std": 1.0, "grad_norm": 0.228515625, "kl": 0.027313646278344095, "learning_rate": 7.4592e-06, "loss": 0.0011, "num_tokens": 62587512.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 355.0, "completions/max_terminated_length": 355.0, "completions/mean_length": 241.75, "completions/mean_terminated_length": 241.75, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "epoch": 0.14362999893921713, "frac_reward_zero_std": 0.0, "grad_norm": 2.515625, "kl": 0.03514736890792847, "learning_rate": 7.4588e-06, "loss": 0.0202, "num_tokens": 62632432.0, "reward": 3.8626348972320557, "reward_std": 0.5405560731887817, "rewards/reward_fn/mean": 3.8626348972320557, "rewards/reward_fn/std": 0.5405560731887817, "step": 1354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 556.0, "completions/max_terminated_length": 556.0, "completions/mean_length": 239.65625, "completions/mean_terminated_length": 239.65625, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 0.14373607722499204, "frac_reward_zero_std": 0.0, "grad_norm": 1.984375, "kl": 0.02487537218257785, "learning_rate": 7.4584e-06, "loss": 0.1213, "num_tokens": 62670565.0, "reward": 2.8300230503082275, "reward_std": 0.05813976749777794, "rewards/reward_fn/mean": 2.8300230503082275, "rewards/reward_fn/std": 0.05813978984951973, "step": 1355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1358.0, "completions/mean_length": 670.15625, "completions/mean_terminated_length": 625.7096557617188, "completions/min_length": 339.0, "completions/min_terminated_length": 339.0, "epoch": 0.14384215551076696, "frac_reward_zero_std": 0.0, "grad_norm": 1.03125, "kl": 0.01770936872344464, "learning_rate": 7.4579999999999996e-06, "loss": 0.0846, "num_tokens": 62727050.0, "reward": 2.2211742401123047, "reward_std": 0.5450964570045471, "rewards/reward_fn/mean": 2.2211742401123047, "rewards/reward_fn/std": 0.5450963973999023, "step": 1356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 801.0, "completions/max_terminated_length": 801.0, "completions/mean_length": 405.46875, "completions/mean_terminated_length": 405.46875, "completions/min_length": 240.0, "completions/min_terminated_length": 240.0, "epoch": 0.14394823379654184, "frac_reward_zero_std": 0.0, "grad_norm": 1.3203125, "kl": 0.023628632072359324, "learning_rate": 7.4575999999999995e-06, "loss": -0.0418, "num_tokens": 62782905.0, "reward": 2.8233463764190674, "reward_std": 0.1016487330198288, "rewards/reward_fn/mean": 2.8233463764190674, "rewards/reward_fn/std": 0.10164876282215118, "step": 1357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1025.0, "completions/max_terminated_length": 1025.0, "completions/mean_length": 351.5625, "completions/mean_terminated_length": 351.5625, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "epoch": 0.14405431208231675, "frac_reward_zero_std": 0.0, "grad_norm": 1.34375, "kl": 0.018899488961324096, "learning_rate": 7.4571999999999995e-06, "loss": 0.022, "num_tokens": 62821035.0, "reward": 3.8341493606567383, "reward_std": 0.48483148217201233, "rewards/reward_fn/mean": 3.8341493606567383, "rewards/reward_fn/std": 0.48483148217201233, "step": 1358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1431.0, "completions/max_terminated_length": 1431.0, "completions/mean_length": 360.09375, "completions/mean_terminated_length": 360.09375, "completions/min_length": 225.0, "completions/min_terminated_length": 225.0, "epoch": 0.14416039036809164, "frac_reward_zero_std": 0.0, "grad_norm": 1.578125, "kl": 0.023247625678777695, "learning_rate": 7.4568e-06, "loss": -0.0009, "num_tokens": 62881390.0, "reward": 3.9614830017089844, "reward_std": 0.21788454055786133, "rewards/reward_fn/mean": 3.9614830017089844, "rewards/reward_fn/std": 0.2178845852613449, "step": 1359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1867.0, "completions/max_terminated_length": 1867.0, "completions/mean_length": 254.59375, "completions/mean_terminated_length": 254.59375, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "epoch": 0.14426646865386655, "frac_reward_zero_std": 0.0, "grad_norm": 1.8046875, "kl": 0.02458102209493518, "learning_rate": 7.4564e-06, "loss": -0.1313, "num_tokens": 62935649.0, "reward": 2.5300705432891846, "reward_std": 0.38768091797828674, "rewards/reward_fn/mean": 2.5300705432891846, "rewards/reward_fn/std": 0.38768091797828674, "step": 1360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 596.0, "completions/max_terminated_length": 596.0, "completions/mean_length": 181.6875, "completions/mean_terminated_length": 181.6875, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 0.14437254693964147, "frac_reward_zero_std": 1.0, "grad_norm": 0.08447265625, "kl": 0.018385571893304586, "learning_rate": 7.456e-06, "loss": 0.0007, "num_tokens": 62963543.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1670.0, "completions/max_terminated_length": 1670.0, "completions/mean_length": 371.875, "completions/mean_terminated_length": 371.875, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 0.14447862522541635, "frac_reward_zero_std": 0.0, "grad_norm": 1.53125, "kl": 0.022040294017642736, "learning_rate": 7.455599999999999e-06, "loss": 0.0123, "num_tokens": 63015603.0, "reward": 2.984866142272949, "reward_std": 0.4643891155719757, "rewards/reward_fn/mean": 2.984866142272949, "rewards/reward_fn/std": 0.4643890857696533, "step": 1362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 347.0, "completions/max_terminated_length": 347.0, "completions/mean_length": 218.3125, "completions/mean_terminated_length": 218.3125, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 0.14458470351119126, "frac_reward_zero_std": 0.0, "grad_norm": 1.7265625, "kl": 0.02735855709761381, "learning_rate": 7.455199999999999e-06, "loss": 0.0325, "num_tokens": 63057917.0, "reward": 3.982016086578369, "reward_std": 0.1017315685749054, "rewards/reward_fn/mean": 3.982016086578369, "rewards/reward_fn/std": 0.1017315685749054, "step": 1363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1252.0, "completions/max_terminated_length": 1252.0, "completions/mean_length": 300.25, "completions/mean_terminated_length": 300.25, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "epoch": 0.14469078179696615, "frac_reward_zero_std": 0.0, "grad_norm": 1.609375, "kl": 0.027118426747620106, "learning_rate": 7.454799999999999e-06, "loss": 0.0445, "num_tokens": 63103237.0, "reward": 2.8830158710479736, "reward_std": 0.4302554130554199, "rewards/reward_fn/mean": 2.8830158710479736, "rewards/reward_fn/std": 0.4302554130554199, "step": 1364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1625.0, "completions/mean_length": 698.09375, "completions/mean_terminated_length": 608.1000366210938, "completions/min_length": 290.0, "completions/min_terminated_length": 290.0, "epoch": 0.14479686008274106, "frac_reward_zero_std": 0.0, "grad_norm": 1.234375, "kl": 0.02318622707389295, "learning_rate": 7.454399999999999e-06, "loss": 0.2151, "num_tokens": 63185608.0, "reward": 2.6326918601989746, "reward_std": 0.7804985046386719, "rewards/reward_fn/mean": 2.6326918601989746, "rewards/reward_fn/std": 0.7804984450340271, "step": 1365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 690.0, "completions/max_terminated_length": 690.0, "completions/mean_length": 341.96875, "completions/mean_terminated_length": 341.96875, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "epoch": 0.14490293836851598, "frac_reward_zero_std": 0.0, "grad_norm": 1.625, "kl": 0.02670998009853065, "learning_rate": 7.453999999999999e-06, "loss": -0.0004, "num_tokens": 63233831.0, "reward": 2.599997043609619, "reward_std": 0.41961222887039185, "rewards/reward_fn/mean": 2.599997043609619, "rewards/reward_fn/std": 0.41961225867271423, "step": 1366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1778.0, "completions/max_terminated_length": 1778.0, "completions/mean_length": 575.65625, "completions/mean_terminated_length": 575.65625, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "epoch": 0.14500901665429086, "frac_reward_zero_std": 0.0, "grad_norm": 1.4765625, "kl": 0.02400823961943388, "learning_rate": 7.453599999999999e-06, "loss": -0.0017, "num_tokens": 63285564.0, "reward": 2.339834451675415, "reward_std": 0.6554206013679504, "rewards/reward_fn/mean": 2.339834451675415, "rewards/reward_fn/std": 0.6554206013679504, "step": 1367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 394.0, "completions/max_terminated_length": 394.0, "completions/mean_length": 202.46875, "completions/mean_terminated_length": 202.46875, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.14511509494006578, "frac_reward_zero_std": 0.0, "grad_norm": 2.015625, "kl": 0.025389327201992273, "learning_rate": 7.453199999999999e-06, "loss": 0.0224, "num_tokens": 63332843.0, "reward": 3.357534408569336, "reward_std": 0.9463339447975159, "rewards/reward_fn/mean": 3.357534408569336, "rewards/reward_fn/std": 0.9463339447975159, "step": 1368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 375.0, "completions/max_terminated_length": 375.0, "completions/mean_length": 259.5625, "completions/mean_terminated_length": 259.5625, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 0.14522117322584066, "frac_reward_zero_std": 0.0, "grad_norm": 2.078125, "kl": 0.029223348945379257, "learning_rate": 7.452799999999999e-06, "loss": 0.071, "num_tokens": 63381533.0, "reward": 3.966054916381836, "reward_std": 0.19202205538749695, "rewards/reward_fn/mean": 3.966054916381836, "rewards/reward_fn/std": 0.19202204048633575, "step": 1369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 567.0, "completions/max_terminated_length": 567.0, "completions/mean_length": 361.6875, "completions/mean_terminated_length": 361.6875, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "epoch": 0.14532725151161557, "frac_reward_zero_std": 0.0, "grad_norm": 1.5078125, "kl": 0.027676680823788047, "learning_rate": 7.452399999999999e-06, "loss": 0.0042, "num_tokens": 63430131.0, "reward": 3.385025978088379, "reward_std": 0.6906515955924988, "rewards/reward_fn/mean": 3.385025978088379, "rewards/reward_fn/std": 0.690651535987854, "step": 1370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1436.0, "completions/max_terminated_length": 1436.0, "completions/mean_length": 266.1875, "completions/mean_terminated_length": 266.1875, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 0.1454333297973905, "frac_reward_zero_std": 0.0, "grad_norm": 1.0390625, "kl": 0.031021540984511375, "learning_rate": 7.452e-06, "loss": -0.0983, "num_tokens": 63458841.0, "reward": 3.9695372581481934, "reward_std": 0.17232412099838257, "rewards/reward_fn/mean": 3.9695372581481934, "rewards/reward_fn/std": 0.17232413589954376, "step": 1371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 870.0, "completions/max_terminated_length": 870.0, "completions/mean_length": 255.28125, "completions/mean_terminated_length": 255.28125, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "epoch": 0.14553940808316537, "frac_reward_zero_std": 0.0, "grad_norm": 1.6484375, "kl": 0.025465503567829728, "learning_rate": 7.4516e-06, "loss": -0.0179, "num_tokens": 63502786.0, "reward": 2.7831175327301025, "reward_std": 0.0531310960650444, "rewards/reward_fn/mean": 2.7831175327301025, "rewards/reward_fn/std": 0.0531311109662056, "step": 1372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1434.0, "completions/max_terminated_length": 1434.0, "completions/mean_length": 353.78125, "completions/mean_terminated_length": 353.78125, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "epoch": 0.14564548636894029, "frac_reward_zero_std": 0.0, "grad_norm": 1.6640625, "kl": 0.020204479689709842, "learning_rate": 7.4512e-06, "loss": -0.1409, "num_tokens": 63544891.0, "reward": 2.8671717643737793, "reward_std": 0.0845094546675682, "rewards/reward_fn/mean": 2.8671717643737793, "rewards/reward_fn/std": 0.08450954407453537, "step": 1373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 267.0, "completions/max_terminated_length": 267.0, "completions/mean_length": 163.90625, "completions/mean_terminated_length": 163.90625, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "epoch": 0.14575156465471517, "frac_reward_zero_std": 0.0, "grad_norm": 2.0625, "kl": 0.028385634068399668, "learning_rate": 7.4508e-06, "loss": 0.0794, "num_tokens": 63568504.0, "reward": 3.8483242988586426, "reward_std": 0.35819515585899353, "rewards/reward_fn/mean": 3.8483242988586426, "rewards/reward_fn/std": 0.35819512605667114, "step": 1374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1143.0, "completions/max_terminated_length": 1143.0, "completions/mean_length": 325.21875, "completions/mean_terminated_length": 325.21875, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "epoch": 0.14585764294049008, "frac_reward_zero_std": 0.0, "grad_norm": 1.7421875, "kl": 0.025249871658161283, "learning_rate": 7.4504e-06, "loss": 0.046, "num_tokens": 63615071.0, "reward": 3.6527295112609863, "reward_std": 0.4878491461277008, "rewards/reward_fn/mean": 3.6527295112609863, "rewards/reward_fn/std": 0.4878491461277008, "step": 1375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1115.0, "completions/max_terminated_length": 1115.0, "completions/mean_length": 345.09375, "completions/mean_terminated_length": 345.09375, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.14596372122626497, "frac_reward_zero_std": 0.0, "grad_norm": 1.625, "kl": 0.028095704270526767, "learning_rate": 7.45e-06, "loss": 0.0772, "num_tokens": 63656162.0, "reward": 3.285367012023926, "reward_std": 0.9010963439941406, "rewards/reward_fn/mean": 3.285367012023926, "rewards/reward_fn/std": 0.9010962843894958, "step": 1376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 425.0, "completions/max_terminated_length": 425.0, "completions/mean_length": 288.65625, "completions/mean_terminated_length": 288.65625, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 0.14606979951203988, "frac_reward_zero_std": 0.0, "grad_norm": 1.6328125, "kl": 0.028274354292079806, "learning_rate": 7.4496e-06, "loss": -0.0781, "num_tokens": 63712439.0, "reward": 3.466069459915161, "reward_std": 0.9559970498085022, "rewards/reward_fn/mean": 3.466069459915161, "rewards/reward_fn/std": 0.9559970498085022, "step": 1377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 731.0, "completions/max_terminated_length": 731.0, "completions/mean_length": 259.84375, "completions/mean_terminated_length": 259.84375, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "epoch": 0.1461758777978148, "frac_reward_zero_std": 0.0, "grad_norm": 2.234375, "kl": 0.027279690839350224, "learning_rate": 7.4492e-06, "loss": 0.0394, "num_tokens": 63773586.0, "reward": 3.8966031074523926, "reward_std": 0.43655630946159363, "rewards/reward_fn/mean": 3.8966031074523926, "rewards/reward_fn/std": 0.436556339263916, "step": 1378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 771.0, "completions/max_terminated_length": 771.0, "completions/mean_length": 345.25, "completions/mean_terminated_length": 345.25, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "epoch": 0.14628195608358968, "frac_reward_zero_std": 0.0, "grad_norm": 1.5234375, "kl": 0.027204215060919523, "learning_rate": 7.4488e-06, "loss": -0.05, "num_tokens": 63805786.0, "reward": 3.292912721633911, "reward_std": 0.6367733478546143, "rewards/reward_fn/mean": 3.292912721633911, "rewards/reward_fn/std": 0.6367732882499695, "step": 1379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 639.0, "completions/max_terminated_length": 639.0, "completions/mean_length": 208.3125, "completions/mean_terminated_length": 208.3125, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 0.1463880343693646, "frac_reward_zero_std": 0.0, "grad_norm": 1.8984375, "kl": 0.029668390285223722, "learning_rate": 7.4484e-06, "loss": -0.0387, "num_tokens": 63848260.0, "reward": 3.8188915252685547, "reward_std": 0.5969631671905518, "rewards/reward_fn/mean": 3.8188915252685547, "rewards/reward_fn/std": 0.5969631671905518, "step": 1380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1634.0, "completions/max_terminated_length": 1634.0, "completions/mean_length": 286.4375, "completions/mean_terminated_length": 286.4375, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.14649411265513948, "frac_reward_zero_std": 0.0, "grad_norm": 2.546875, "kl": 0.0279810291249305, "learning_rate": 7.448e-06, "loss": 0.3408, "num_tokens": 63904946.0, "reward": 3.9252352714538574, "reward_std": 0.42293301224708557, "rewards/reward_fn/mean": 3.9252352714538574, "rewards/reward_fn/std": 0.4229329824447632, "step": 1381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 920.0, "completions/max_terminated_length": 920.0, "completions/mean_length": 269.875, "completions/mean_terminated_length": 269.875, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 0.1466001909409144, "frac_reward_zero_std": 0.0, "grad_norm": 2.296875, "kl": 0.02967662224546075, "learning_rate": 7.4476000000000005e-06, "loss": -0.0416, "num_tokens": 63944174.0, "reward": 2.6933159828186035, "reward_std": 0.2893867492675781, "rewards/reward_fn/mean": 2.6933159828186035, "rewards/reward_fn/std": 0.2893867492675781, "step": 1382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1121.0, "completions/max_terminated_length": 1121.0, "completions/mean_length": 368.09375, "completions/mean_terminated_length": 368.09375, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 0.1467062692266893, "frac_reward_zero_std": 0.0, "grad_norm": 1.6640625, "kl": 0.0331441021990031, "learning_rate": 7.4472e-06, "loss": 0.06, "num_tokens": 63986289.0, "reward": 2.744060516357422, "reward_std": 0.042238347232341766, "rewards/reward_fn/mean": 2.744060516357422, "rewards/reward_fn/std": 0.042238280177116394, "step": 1383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 612.0, "completions/max_terminated_length": 612.0, "completions/mean_length": 203.78125, "completions/mean_terminated_length": 203.78125, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 0.1468123475124642, "frac_reward_zero_std": 1.0, "grad_norm": 0.44140625, "kl": 0.03312438074499369, "learning_rate": 7.4468e-06, "loss": 0.0013, "num_tokens": 64038122.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 848.0, "completions/max_terminated_length": 848.0, "completions/mean_length": 193.25, "completions/mean_terminated_length": 193.25, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "epoch": 0.1469184257982391, "frac_reward_zero_std": 0.0, "grad_norm": 2.375, "kl": 0.03683407441712916, "learning_rate": 7.4463999999999996e-06, "loss": 0.1946, "num_tokens": 64075122.0, "reward": 2.924485206604004, "reward_std": 0.07657773792743683, "rewards/reward_fn/mean": 2.924485206604004, "rewards/reward_fn/std": 0.07657775282859802, "step": 1385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1439.0, "completions/max_terminated_length": 1439.0, "completions/mean_length": 390.125, "completions/mean_terminated_length": 390.125, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 0.147024504084014, "frac_reward_zero_std": 0.0, "grad_norm": 1.7734375, "kl": 0.03401468298397958, "learning_rate": 7.4459999999999995e-06, "loss": 0.0587, "num_tokens": 64122710.0, "reward": 3.0861051082611084, "reward_std": 0.702394425868988, "rewards/reward_fn/mean": 3.0861051082611084, "rewards/reward_fn/std": 0.702394425868988, "step": 1386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1095.0, "completions/max_terminated_length": 1095.0, "completions/mean_length": 278.09375, "completions/mean_terminated_length": 278.09375, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 0.1471305823697889, "frac_reward_zero_std": 0.0, "grad_norm": 1.5, "kl": 0.028455683263018727, "learning_rate": 7.4455999999999995e-06, "loss": 0.0261, "num_tokens": 64169593.0, "reward": 3.964691162109375, "reward_std": 0.19973696768283844, "rewards/reward_fn/mean": 3.964691162109375, "rewards/reward_fn/std": 0.19973698258399963, "step": 1387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 568.0, "completions/max_terminated_length": 568.0, "completions/mean_length": 174.03125, "completions/mean_terminated_length": 174.03125, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "epoch": 0.14723666065556382, "frac_reward_zero_std": 0.0, "grad_norm": 2.09375, "kl": 0.03542444505728781, "learning_rate": 7.4451999999999995e-06, "loss": -0.0303, "num_tokens": 64193594.0, "reward": 3.883316993713379, "reward_std": 0.314423143863678, "rewards/reward_fn/mean": 3.883316993713379, "rewards/reward_fn/std": 0.314423143863678, "step": 1388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 500.0, "completions/max_terminated_length": 500.0, "completions/mean_length": 183.5, "completions/mean_terminated_length": 183.5, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 0.1473427389413387, "frac_reward_zero_std": 1.0, "grad_norm": 0.1083984375, "kl": 0.022355019696988165, "learning_rate": 7.4447999999999994e-06, "loss": 0.0009, "num_tokens": 64246154.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 636.0, "completions/max_terminated_length": 636.0, "completions/mean_length": 207.96875, "completions/mean_terminated_length": 207.96875, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 0.14744881722711362, "frac_reward_zero_std": 0.0, "grad_norm": 1.890625, "kl": 0.03179771360009909, "learning_rate": 7.444399999999999e-06, "loss": 0.0461, "num_tokens": 64288105.0, "reward": 3.929905891418457, "reward_std": 0.3965129852294922, "rewards/reward_fn/mean": 3.929905891418457, "rewards/reward_fn/std": 0.3965129852294922, "step": 1390 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 606.0, "completions/max_terminated_length": 606.0, "completions/mean_length": 160.03125, "completions/mean_terminated_length": 160.03125, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.1475548955128885, "frac_reward_zero_std": 1.0, "grad_norm": 0.111328125, "kl": 0.03064539493061602, "learning_rate": 7.443999999999999e-06, "loss": 0.0012, "num_tokens": 64329898.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 940.0, "completions/max_terminated_length": 940.0, "completions/mean_length": 347.625, "completions/mean_terminated_length": 347.625, "completions/min_length": 239.0, "completions/min_terminated_length": 239.0, "epoch": 0.14766097379866341, "frac_reward_zero_std": 0.0, "grad_norm": 1.40625, "kl": 0.020091792102903128, "learning_rate": 7.443599999999999e-06, "loss": 0.0558, "num_tokens": 64377246.0, "reward": 2.9814658164978027, "reward_std": 0.2760011851787567, "rewards/reward_fn/mean": 2.9814658164978027, "rewards/reward_fn/std": 0.2760012149810791, "step": 1392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 835.0, "completions/max_terminated_length": 835.0, "completions/mean_length": 332.21875, "completions/mean_terminated_length": 332.21875, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.14776705208443833, "frac_reward_zero_std": 0.0, "grad_norm": 1.5703125, "kl": 0.026977359084412456, "learning_rate": 7.443199999999999e-06, "loss": 0.0893, "num_tokens": 64425349.0, "reward": 3.6816189289093018, "reward_std": 0.5174549221992493, "rewards/reward_fn/mean": 3.6816189289093018, "rewards/reward_fn/std": 0.5174549221992493, "step": 1393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 545.0, "completions/max_terminated_length": 545.0, "completions/mean_length": 163.125, "completions/mean_terminated_length": 163.125, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.1478731303702132, "frac_reward_zero_std": 0.0, "grad_norm": 2.328125, "kl": 0.03630391927435994, "learning_rate": 7.4428e-06, "loss": 0.1482, "num_tokens": 64472041.0, "reward": 2.634312391281128, "reward_std": 0.2737848460674286, "rewards/reward_fn/mean": 2.634312391281128, "rewards/reward_fn/std": 0.273784875869751, "step": 1394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 910.0, "completions/max_terminated_length": 910.0, "completions/mean_length": 286.0625, "completions/mean_terminated_length": 286.0625, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.14797920865598813, "frac_reward_zero_std": 0.0, "grad_norm": 1.65625, "kl": 0.033690345007926226, "learning_rate": 7.4424e-06, "loss": 0.1175, "num_tokens": 64514219.0, "reward": 2.9914069175720215, "reward_std": 0.0645083636045456, "rewards/reward_fn/mean": 2.9914069175720215, "rewards/reward_fn/std": 0.064508356153965, "step": 1395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 558.0, "completions/max_terminated_length": 558.0, "completions/mean_length": 168.9375, "completions/mean_terminated_length": 168.9375, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.148085286941763, "frac_reward_zero_std": 0.0, "grad_norm": 2.140625, "kl": 0.02721068379469216, "learning_rate": 7.442e-06, "loss": -0.0126, "num_tokens": 64553929.0, "reward": 3.966176986694336, "reward_std": 0.19133220613002777, "rewards/reward_fn/mean": 3.966176986694336, "rewards/reward_fn/std": 0.19133223593235016, "step": 1396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 959.0, "completions/max_terminated_length": 959.0, "completions/mean_length": 379.78125, "completions/mean_terminated_length": 379.78125, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "epoch": 0.14819136522753792, "frac_reward_zero_std": 0.0, "grad_norm": 1.40625, "kl": 0.022854552837088704, "learning_rate": 7.4416e-06, "loss": 0.105, "num_tokens": 64602178.0, "reward": 3.60275936126709, "reward_std": 0.5985162854194641, "rewards/reward_fn/mean": 3.60275936126709, "rewards/reward_fn/std": 0.5985162854194641, "step": 1397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 562.0, "completions/max_terminated_length": 562.0, "completions/mean_length": 244.21875, "completions/mean_terminated_length": 244.21875, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 0.14829744351331284, "frac_reward_zero_std": 1.0, "grad_norm": 0.10986328125, "kl": 0.02474433882161975, "learning_rate": 7.4412e-06, "loss": 0.001, "num_tokens": 64664329.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1701.0, "completions/max_terminated_length": 1701.0, "completions/mean_length": 387.8125, "completions/mean_terminated_length": 387.8125, "completions/min_length": 243.0, "completions/min_terminated_length": 243.0, "epoch": 0.14840352179908772, "frac_reward_zero_std": 0.0, "grad_norm": 1.6171875, "kl": 0.03332933643832803, "learning_rate": 7.4408e-06, "loss": 0.0906, "num_tokens": 64720035.0, "reward": 3.689403533935547, "reward_std": 0.5056177377700806, "rewards/reward_fn/mean": 3.689403533935547, "rewards/reward_fn/std": 0.5056177377700806, "step": 1399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 446.0, "completions/max_terminated_length": 446.0, "completions/mean_length": 173.84375, "completions/mean_terminated_length": 173.84375, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 0.14850960008486264, "frac_reward_zero_std": 0.0, "grad_norm": 2.1875, "kl": 0.0332620891276747, "learning_rate": 7.4404e-06, "loss": -0.0072, "num_tokens": 64769854.0, "reward": 3.5542819499969482, "reward_std": 0.5167267322540283, "rewards/reward_fn/mean": 3.5542819499969482, "rewards/reward_fn/std": 0.5167266726493835, "step": 1400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 762.0, "completions/max_terminated_length": 762.0, "completions/mean_length": 259.8125, "completions/mean_terminated_length": 259.8125, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 0.14861567837063752, "frac_reward_zero_std": 0.0, "grad_norm": 1.8125, "kl": 0.03940536780282855, "learning_rate": 7.44e-06, "loss": 0.0286, "num_tokens": 64810296.0, "reward": 3.590843915939331, "reward_std": 0.5371227860450745, "rewards/reward_fn/mean": 3.590843915939331, "rewards/reward_fn/std": 0.5371227860450745, "step": 1401 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1203.0, "completions/max_terminated_length": 1203.0, "completions/mean_length": 370.21875, "completions/mean_terminated_length": 370.21875, "completions/min_length": 237.0, "completions/min_terminated_length": 237.0, "epoch": 0.14872175665641243, "frac_reward_zero_std": 0.0, "grad_norm": 1.2890625, "kl": 0.025106285698711872, "learning_rate": 7.4396e-06, "loss": -0.0491, "num_tokens": 64859839.0, "reward": 3.833078384399414, "reward_std": 0.48529359698295593, "rewards/reward_fn/mean": 3.833078384399414, "rewards/reward_fn/std": 0.48529356718063354, "step": 1402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 767.0, "completions/mean_length": 468.8125, "completions/mean_terminated_length": 417.8709411621094, "completions/min_length": 249.0, "completions/min_terminated_length": 249.0, "epoch": 0.14882783494218735, "frac_reward_zero_std": 0.0, "grad_norm": 1.7109375, "kl": 0.030068765161558986, "learning_rate": 7.439199999999999e-06, "loss": 0.2232, "num_tokens": 64905561.0, "reward": 3.2034428119659424, "reward_std": 0.8694138526916504, "rewards/reward_fn/mean": 3.2034428119659424, "rewards/reward_fn/std": 0.8694137930870056, "step": 1403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 671.0, "completions/max_terminated_length": 671.0, "completions/mean_length": 229.4375, "completions/mean_terminated_length": 229.4375, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "epoch": 0.14893391322796223, "frac_reward_zero_std": 0.0, "grad_norm": 2.0, "kl": 0.028407067991793156, "learning_rate": 7.438799999999999e-06, "loss": -0.0417, "num_tokens": 64933799.0, "reward": 3.6446433067321777, "reward_std": 0.7922115325927734, "rewards/reward_fn/mean": 3.6446433067321777, "rewards/reward_fn/std": 0.7922114729881287, "step": 1404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 940.0, "completions/max_terminated_length": 940.0, "completions/mean_length": 278.40625, "completions/mean_terminated_length": 278.40625, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 0.14903999151373715, "frac_reward_zero_std": 0.0, "grad_norm": 1.4296875, "kl": 0.027859984431415796, "learning_rate": 7.438399999999999e-06, "loss": -0.0629, "num_tokens": 64963572.0, "reward": 3.877469062805176, "reward_std": 0.3294479548931122, "rewards/reward_fn/mean": 3.877469062805176, "rewards/reward_fn/std": 0.3294479250907898, "step": 1405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1464.0, "completions/max_terminated_length": 1464.0, "completions/mean_length": 463.0625, "completions/mean_terminated_length": 463.0625, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "epoch": 0.14914606979951203, "frac_reward_zero_std": 0.0, "grad_norm": 1.328125, "kl": 0.03835621988400817, "learning_rate": 7.438e-06, "loss": -0.1281, "num_tokens": 65035190.0, "reward": 2.7208948135375977, "reward_std": 0.7465806007385254, "rewards/reward_fn/mean": 2.7208948135375977, "rewards/reward_fn/std": 0.7465806603431702, "step": 1406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1036.0, "completions/max_terminated_length": 1036.0, "completions/mean_length": 334.3125, "completions/mean_terminated_length": 334.3125, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "epoch": 0.14925214808528695, "frac_reward_zero_std": 1.0, "grad_norm": 0.0732421875, "kl": 0.022742543602362275, "learning_rate": 7.4376e-06, "loss": 0.0009, "num_tokens": 65065216.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 301.0, "completions/max_terminated_length": 301.0, "completions/mean_length": 195.125, "completions/mean_terminated_length": 195.125, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.14935822637106183, "frac_reward_zero_std": 1.0, "grad_norm": 0.119140625, "kl": 0.02962300064973533, "learning_rate": 7.4372e-06, "loss": 0.0012, "num_tokens": 65106340.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 350.0, "completions/max_terminated_length": 350.0, "completions/mean_length": 135.65625, "completions/mean_terminated_length": 135.65625, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "epoch": 0.14946430465683674, "frac_reward_zero_std": 1.0, "grad_norm": 0.1357421875, "kl": 0.031661511631682515, "learning_rate": 7.4368e-06, "loss": 0.0013, "num_tokens": 65160281.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 934.0, "completions/max_terminated_length": 934.0, "completions/mean_length": 239.4375, "completions/mean_terminated_length": 239.4375, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.14957038294261166, "frac_reward_zero_std": 0.0, "grad_norm": 1.921875, "kl": 0.033848599530756474, "learning_rate": 7.4364e-06, "loss": 0.1764, "num_tokens": 65201351.0, "reward": 3.1107335090637207, "reward_std": 0.08597031980752945, "rewards/reward_fn/mean": 3.1107335090637207, "rewards/reward_fn/std": 0.08597029000520706, "step": 1410 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 873.0, "completions/max_terminated_length": 873.0, "completions/mean_length": 237.09375, "completions/mean_terminated_length": 237.09375, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "epoch": 0.14967646122838654, "frac_reward_zero_std": 0.0, "grad_norm": 2.1875, "kl": 0.03387490310706198, "learning_rate": 7.436e-06, "loss": -0.0979, "num_tokens": 65241162.0, "reward": 2.9534237384796143, "reward_std": 0.7266772985458374, "rewards/reward_fn/mean": 2.9534237384796143, "rewards/reward_fn/std": 0.7266772389411926, "step": 1411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 487.0, "completions/max_terminated_length": 487.0, "completions/mean_length": 288.375, "completions/mean_terminated_length": 288.375, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 0.14978253951416146, "frac_reward_zero_std": 1.0, "grad_norm": 0.0849609375, "kl": 0.031444058986380696, "learning_rate": 7.4356e-06, "loss": 0.0013, "num_tokens": 65267030.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 911.0, "completions/max_terminated_length": 911.0, "completions/mean_length": 256.375, "completions/mean_terminated_length": 256.375, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 0.14988861779993634, "frac_reward_zero_std": 1.0, "grad_norm": 0.10986328125, "kl": 0.029305062256753445, "learning_rate": 7.4351999999999996e-06, "loss": 0.0012, "num_tokens": 65316098.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 134.0, "completions/max_terminated_length": 134.0, "completions/mean_length": 83.40625, "completions/mean_terminated_length": 83.40625, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "epoch": 0.14999469608571125, "frac_reward_zero_std": 1.0, "grad_norm": 0.1455078125, "kl": 0.02644193370360881, "learning_rate": 7.4347999999999995e-06, "loss": 0.0011, "num_tokens": 65339471.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1957.0, "completions/max_terminated_length": 1957.0, "completions/mean_length": 366.46875, "completions/mean_terminated_length": 366.46875, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 0.15010077437148617, "frac_reward_zero_std": 0.0, "grad_norm": 1.71875, "kl": 0.04253972531296313, "learning_rate": 7.4343999999999995e-06, "loss": -0.2152, "num_tokens": 65380158.0, "reward": 2.7263221740722656, "reward_std": 0.6098658442497253, "rewards/reward_fn/mean": 2.7263221740722656, "rewards/reward_fn/std": 0.6098658442497253, "step": 1415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1767.0, "completions/max_terminated_length": 1767.0, "completions/mean_length": 599.03125, "completions/mean_terminated_length": 599.03125, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "epoch": 0.15020685265726105, "frac_reward_zero_std": 0.0, "grad_norm": 1.1953125, "kl": 0.029143204214051366, "learning_rate": 7.4339999999999995e-06, "loss": 0.0824, "num_tokens": 65433343.0, "reward": 3.3289918899536133, "reward_std": 0.602114200592041, "rewards/reward_fn/mean": 3.3289918899536133, "rewards/reward_fn/std": 0.602114200592041, "step": 1416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 353.0, "completions/max_terminated_length": 353.0, "completions/mean_length": 218.375, "completions/mean_terminated_length": 218.375, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 0.15031293094303597, "frac_reward_zero_std": 0.0, "grad_norm": 1.4453125, "kl": 0.030435963766649365, "learning_rate": 7.4336e-06, "loss": -0.0891, "num_tokens": 65485099.0, "reward": 3.7489049434661865, "reward_std": 0.6941893696784973, "rewards/reward_fn/mean": 3.7489049434661865, "rewards/reward_fn/std": 0.6941893696784973, "step": 1417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 277.0, "completions/max_terminated_length": 277.0, "completions/mean_length": 183.28125, "completions/mean_terminated_length": 183.28125, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 0.15041900922881085, "frac_reward_zero_std": 1.0, "grad_norm": 0.1328125, "kl": 0.03691709297709167, "learning_rate": 7.4332e-06, "loss": 0.0015, "num_tokens": 65524468.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 283.0, "completions/max_terminated_length": 283.0, "completions/mean_length": 109.375, "completions/mean_terminated_length": 109.375, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "epoch": 0.15052508751458576, "frac_reward_zero_std": 1.0, "grad_norm": 0.205078125, "kl": 0.039213865995407104, "learning_rate": 7.4328e-06, "loss": 0.0016, "num_tokens": 65552992.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1900.0, "completions/mean_length": 446.96875, "completions/mean_terminated_length": 395.32257080078125, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 0.15063116580036068, "frac_reward_zero_std": 0.0, "grad_norm": 2.0625, "kl": 0.022784803761169314, "learning_rate": 7.4324e-06, "loss": 0.3249, "num_tokens": 65605791.0, "reward": 2.73966646194458, "reward_std": 0.5022208094596863, "rewards/reward_fn/mean": 2.73966646194458, "rewards/reward_fn/std": 0.5022208094596863, "step": 1420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/max_terminated_length": 351.0, "completions/mean_length": 178.9375, "completions/mean_terminated_length": 178.9375, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "epoch": 0.15073724408613556, "frac_reward_zero_std": 0.0, "grad_norm": 3.53125, "kl": 0.032336236676201224, "learning_rate": 7.432e-06, "loss": 0.1029, "num_tokens": 65642141.0, "reward": 3.7554450035095215, "reward_std": 0.3876785337924957, "rewards/reward_fn/mean": 3.7554450035095215, "rewards/reward_fn/std": 0.38767850399017334, "step": 1421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 652.0, "completions/max_terminated_length": 652.0, "completions/mean_length": 208.1875, "completions/mean_terminated_length": 208.1875, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.15084332237191048, "frac_reward_zero_std": 0.0, "grad_norm": 2.203125, "kl": 0.035623660776764154, "learning_rate": 7.4316e-06, "loss": 0.1096, "num_tokens": 65681603.0, "reward": 2.7092933654785156, "reward_std": 0.28102341294288635, "rewards/reward_fn/mean": 2.7092933654785156, "rewards/reward_fn/std": 0.28102338314056396, "step": 1422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 741.0, "completions/max_terminated_length": 741.0, "completions/mean_length": 158.34375, "completions/mean_terminated_length": 158.34375, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "epoch": 0.15094940065768536, "frac_reward_zero_std": 0.0, "grad_norm": 2.40625, "kl": 0.03153680078685284, "learning_rate": 7.431199999999999e-06, "loss": -0.0881, "num_tokens": 65710254.0, "reward": 3.8465654850006104, "reward_std": 0.3623442053794861, "rewards/reward_fn/mean": 3.8465654850006104, "rewards/reward_fn/std": 0.36234423518180847, "step": 1423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1457.0, "completions/mean_length": 714.375, "completions/mean_terminated_length": 576.413818359375, "completions/min_length": 311.0, "completions/min_terminated_length": 311.0, "epoch": 0.15105547894346028, "frac_reward_zero_std": 0.0, "grad_norm": 0.9453125, "kl": 0.028179930755868554, "learning_rate": 7.430799999999999e-06, "loss": 0.0505, "num_tokens": 65775770.0, "reward": 2.2804367542266846, "reward_std": 0.8255341053009033, "rewards/reward_fn/mean": 2.2804367542266846, "rewards/reward_fn/std": 0.8255340456962585, "step": 1424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 339.0, "completions/max_terminated_length": 339.0, "completions/mean_length": 91.0625, "completions/mean_terminated_length": 91.0625, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "epoch": 0.1511615572292352, "frac_reward_zero_std": 1.0, "grad_norm": 0.1650390625, "kl": 0.029801467899233103, "learning_rate": 7.430399999999999e-06, "loss": 0.0012, "num_tokens": 65809468.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1030.0, "completions/max_terminated_length": 1030.0, "completions/mean_length": 283.5, "completions/mean_terminated_length": 283.5, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 0.15126763551501007, "frac_reward_zero_std": 0.0, "grad_norm": 1.921875, "kl": 0.02984489407390356, "learning_rate": 7.429999999999999e-06, "loss": -0.0261, "num_tokens": 65834380.0, "reward": 3.668632984161377, "reward_std": 0.538867175579071, "rewards/reward_fn/mean": 3.668632984161377, "rewards/reward_fn/std": 0.5388672351837158, "step": 1426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 656.0, "completions/max_terminated_length": 656.0, "completions/mean_length": 238.28125, "completions/mean_terminated_length": 238.28125, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 0.151373713800785, "frac_reward_zero_std": 0.0, "grad_norm": 2.21875, "kl": 0.023655685363337398, "learning_rate": 7.429599999999999e-06, "loss": 0.1598, "num_tokens": 65875157.0, "reward": 3.9262659549713135, "reward_std": 0.41710224747657776, "rewards/reward_fn/mean": 3.9262659549713135, "rewards/reward_fn/std": 0.41710227727890015, "step": 1427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1091.0, "completions/max_terminated_length": 1091.0, "completions/mean_length": 348.1875, "completions/mean_terminated_length": 348.1875, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "epoch": 0.15147979208655987, "frac_reward_zero_std": 0.0, "grad_norm": 1.453125, "kl": 0.02328518428839743, "learning_rate": 7.429199999999999e-06, "loss": -0.0487, "num_tokens": 65924859.0, "reward": 3.6566414833068848, "reward_std": 0.5582861304283142, "rewards/reward_fn/mean": 3.6566414833068848, "rewards/reward_fn/std": 0.5582861304283142, "step": 1428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 963.0, "completions/max_terminated_length": 963.0, "completions/mean_length": 348.0625, "completions/mean_terminated_length": 348.0625, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "epoch": 0.15158587037233479, "frac_reward_zero_std": 0.0, "grad_norm": 1.390625, "kl": 0.022434451850131154, "learning_rate": 7.4288e-06, "loss": 0.0641, "num_tokens": 65965245.0, "reward": 2.7481746673583984, "reward_std": 0.4162106215953827, "rewards/reward_fn/mean": 2.7481746673583984, "rewards/reward_fn/std": 0.4162106215953827, "step": 1429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 432.0, "completions/max_terminated_length": 432.0, "completions/mean_length": 133.71875, "completions/mean_terminated_length": 133.71875, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.1516919486581097, "frac_reward_zero_std": 1.0, "grad_norm": 0.18359375, "kl": 0.04026283789426088, "learning_rate": 7.4284e-06, "loss": 0.0016, "num_tokens": 66012404.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1430 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1083.0, "completions/max_terminated_length": 1083.0, "completions/mean_length": 337.6875, "completions/mean_terminated_length": 337.6875, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 0.15179802694388458, "frac_reward_zero_std": 0.0, "grad_norm": 1.796875, "kl": 0.026995733845978975, "learning_rate": 7.428e-06, "loss": 0.0028, "num_tokens": 66056682.0, "reward": 3.900752544403076, "reward_std": 0.3138922154903412, "rewards/reward_fn/mean": 3.900752544403076, "rewards/reward_fn/std": 0.3138922154903412, "step": 1431 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1176.0, "completions/max_terminated_length": 1176.0, "completions/mean_length": 227.59375, "completions/mean_terminated_length": 227.59375, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 0.1519041052296595, "frac_reward_zero_std": 1.0, "grad_norm": 0.10302734375, "kl": 0.02565616718493402, "learning_rate": 7.4276e-06, "loss": 0.001, "num_tokens": 66096317.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1328.0, "completions/max_terminated_length": 1328.0, "completions/mean_length": 389.125, "completions/mean_terminated_length": 389.125, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "epoch": 0.15201018351543438, "frac_reward_zero_std": 0.0, "grad_norm": 1.5078125, "kl": 0.023474588757380843, "learning_rate": 7.4272e-06, "loss": 0.1392, "num_tokens": 66153665.0, "reward": 3.253533363342285, "reward_std": 0.5899655818939209, "rewards/reward_fn/mean": 3.253533363342285, "rewards/reward_fn/std": 0.5899655818939209, "step": 1433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 268.0, "completions/max_terminated_length": 268.0, "completions/mean_length": 161.28125, "completions/mean_terminated_length": 161.28125, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.1521162618012093, "frac_reward_zero_std": 0.0, "grad_norm": 2.640625, "kl": 0.03629247797653079, "learning_rate": 7.4268e-06, "loss": 0.0981, "num_tokens": 66189610.0, "reward": 3.0274600982666016, "reward_std": 0.05274336412549019, "rewards/reward_fn/mean": 3.0274600982666016, "rewards/reward_fn/std": 0.05274338647723198, "step": 1434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1095.0, "completions/max_terminated_length": 1095.0, "completions/mean_length": 248.15625, "completions/mean_terminated_length": 248.15625, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 0.15222234008698418, "frac_reward_zero_std": 0.0, "grad_norm": 2.203125, "kl": 0.029007033677771688, "learning_rate": 7.4264e-06, "loss": 0.0845, "num_tokens": 66229775.0, "reward": 3.6650195121765137, "reward_std": 0.5066304802894592, "rewards/reward_fn/mean": 3.6650195121765137, "rewards/reward_fn/std": 0.5066304802894592, "step": 1435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 773.0, "completions/max_terminated_length": 773.0, "completions/mean_length": 224.375, "completions/mean_terminated_length": 224.375, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 0.1523284183727591, "frac_reward_zero_std": 0.0, "grad_norm": 2.03125, "kl": 0.023438591044396162, "learning_rate": 7.426e-06, "loss": 0.1349, "num_tokens": 66266011.0, "reward": 3.8179659843444824, "reward_std": 0.3869030177593231, "rewards/reward_fn/mean": 3.8179659843444824, "rewards/reward_fn/std": 0.3869030177593231, "step": 1436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1129.0, "completions/max_terminated_length": 1129.0, "completions/mean_length": 294.5, "completions/mean_terminated_length": 294.5, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "epoch": 0.152434496658534, "frac_reward_zero_std": 0.0, "grad_norm": 1.9609375, "kl": 0.023724337574094534, "learning_rate": 7.4256e-06, "loss": -0.069, "num_tokens": 66319435.0, "reward": 3.4863743782043457, "reward_std": 0.593258798122406, "rewards/reward_fn/mean": 3.4863743782043457, "rewards/reward_fn/std": 0.5932587385177612, "step": 1437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1090.0, "completions/max_terminated_length": 1090.0, "completions/mean_length": 292.6875, "completions/mean_terminated_length": 292.6875, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 0.1525405749443089, "frac_reward_zero_std": 0.0, "grad_norm": 2.046875, "kl": 0.033242793986573815, "learning_rate": 7.4252e-06, "loss": 0.0595, "num_tokens": 66366145.0, "reward": 2.89218807220459, "reward_std": 0.46033063530921936, "rewards/reward_fn/mean": 2.89218807220459, "rewards/reward_fn/std": 0.46033063530921936, "step": 1438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1363.0, "completions/max_terminated_length": 1363.0, "completions/mean_length": 487.8125, "completions/mean_terminated_length": 487.8125, "completions/min_length": 256.0, "completions/min_terminated_length": 256.0, "epoch": 0.1526466532300838, "frac_reward_zero_std": 0.0, "grad_norm": 5.375, "kl": 0.02517133066430688, "learning_rate": 7.4248e-06, "loss": 0.1089, "num_tokens": 66419227.0, "reward": 2.5610084533691406, "reward_std": 0.344752699136734, "rewards/reward_fn/mean": 2.5610084533691406, "rewards/reward_fn/std": 0.344752699136734, "step": 1439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1321.0, "completions/max_terminated_length": 1321.0, "completions/mean_length": 266.875, "completions/mean_terminated_length": 266.875, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 0.1527527315158587, "frac_reward_zero_std": 1.0, "grad_norm": 0.103515625, "kl": 0.03170533524826169, "learning_rate": 7.4244e-06, "loss": 0.0013, "num_tokens": 66465399.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1089.0, "completions/max_terminated_length": 1089.0, "completions/mean_length": 403.78125, "completions/mean_terminated_length": 403.78125, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.1528588098016336, "frac_reward_zero_std": 0.0, "grad_norm": 1.8515625, "kl": 0.03218272537924349, "learning_rate": 7.424e-06, "loss": 0.0298, "num_tokens": 66512848.0, "reward": 2.8888301849365234, "reward_std": 0.0484078973531723, "rewards/reward_fn/mean": 2.8888301849365234, "rewards/reward_fn/std": 0.04840795695781708, "step": 1441 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1320.0, "completions/mean_length": 496.625, "completions/mean_terminated_length": 393.20001220703125, "completions/min_length": 223.0, "completions/min_terminated_length": 223.0, "epoch": 0.15296488808740852, "frac_reward_zero_std": 0.0, "grad_norm": 1.59375, "kl": 0.028265361906960607, "learning_rate": 7.4236e-06, "loss": 0.3739, "num_tokens": 66575492.0, "reward": 3.108246088027954, "reward_std": 1.1263806819915771, "rewards/reward_fn/mean": 3.108246088027954, "rewards/reward_fn/std": 1.1263806819915771, "step": 1442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 352.0, "completions/max_terminated_length": 352.0, "completions/mean_length": 208.375, "completions/mean_terminated_length": 208.375, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.1530709663731834, "frac_reward_zero_std": 0.0, "grad_norm": 1.921875, "kl": 0.028314963448792696, "learning_rate": 7.4231999999999995e-06, "loss": -0.1091, "num_tokens": 66614960.0, "reward": 3.644209146499634, "reward_std": 0.5789510011672974, "rewards/reward_fn/mean": 3.644209146499634, "rewards/reward_fn/std": 0.5789510011672974, "step": 1443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 641.0, "completions/max_terminated_length": 641.0, "completions/mean_length": 217.5625, "completions/mean_terminated_length": 217.5625, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 0.15317704465895832, "frac_reward_zero_std": 0.0, "grad_norm": 1.8046875, "kl": 0.028587039094418287, "learning_rate": 7.4227999999999995e-06, "loss": -0.0666, "num_tokens": 66650338.0, "reward": 3.039794445037842, "reward_std": 0.03578682616353035, "rewards/reward_fn/mean": 3.039794445037842, "rewards/reward_fn/std": 0.035786814987659454, "step": 1444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 618.0, "completions/max_terminated_length": 618.0, "completions/mean_length": 164.09375, "completions/mean_terminated_length": 164.09375, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 0.1532831229447332, "frac_reward_zero_std": 1.0, "grad_norm": 0.1484375, "kl": 0.03915078402496874, "learning_rate": 7.4223999999999994e-06, "loss": 0.0016, "num_tokens": 66706437.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 868.0, "completions/max_terminated_length": 868.0, "completions/mean_length": 211.5, "completions/mean_terminated_length": 211.5, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "epoch": 0.15338920123050812, "frac_reward_zero_std": 1.0, "grad_norm": 0.1376953125, "kl": 0.03331913007423282, "learning_rate": 7.421999999999999e-06, "loss": 0.0013, "num_tokens": 66736949.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 775.0, "completions/max_terminated_length": 775.0, "completions/mean_length": 387.59375, "completions/mean_terminated_length": 387.59375, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "epoch": 0.15349527951628303, "frac_reward_zero_std": 0.0, "grad_norm": 1.390625, "kl": 0.02698996476829052, "learning_rate": 7.421599999999999e-06, "loss": 0.1122, "num_tokens": 66783496.0, "reward": 3.010918617248535, "reward_std": 0.18900462985038757, "rewards/reward_fn/mean": 3.010918617248535, "rewards/reward_fn/std": 0.18900460004806519, "step": 1447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 904.0, "completions/max_terminated_length": 904.0, "completions/mean_length": 294.625, "completions/mean_terminated_length": 294.625, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.15360135780205791, "frac_reward_zero_std": 1.0, "grad_norm": 0.130859375, "kl": 0.03707153582945466, "learning_rate": 7.421199999999999e-06, "loss": 0.0015, "num_tokens": 66830364.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 940.0, "completions/max_terminated_length": 940.0, "completions/mean_length": 505.3125, "completions/mean_terminated_length": 505.3125, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.15370743608783283, "frac_reward_zero_std": 0.0, "grad_norm": 1.5078125, "kl": 0.027398626087233424, "learning_rate": 7.420799999999999e-06, "loss": -0.1571, "num_tokens": 66878758.0, "reward": 2.23410701751709, "reward_std": 0.6652016043663025, "rewards/reward_fn/mean": 2.23410701751709, "rewards/reward_fn/std": 0.6652015447616577, "step": 1449 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 699.0, "completions/max_terminated_length": 699.0, "completions/mean_length": 177.53125, "completions/mean_terminated_length": 177.53125, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 0.1538135143736077, "frac_reward_zero_std": 1.0, "grad_norm": 0.09375, "kl": 0.024694280233234167, "learning_rate": 7.420399999999999e-06, "loss": 0.001, "num_tokens": 66913015.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 140.0, "completions/max_terminated_length": 140.0, "completions/mean_length": 98.28125, "completions/mean_terminated_length": 98.28125, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "epoch": 0.15391959265938263, "frac_reward_zero_std": 1.0, "grad_norm": 0.1591796875, "kl": 0.03571772645227611, "learning_rate": 7.419999999999999e-06, "loss": 0.0014, "num_tokens": 66934880.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1451 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 600.0, "completions/max_terminated_length": 600.0, "completions/mean_length": 349.125, "completions/mean_terminated_length": 349.125, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "epoch": 0.15402567094515754, "frac_reward_zero_std": 1.0, "grad_norm": 0.07177734375, "kl": 0.022626615595072508, "learning_rate": 7.419599999999999e-06, "loss": 0.0009, "num_tokens": 66986308.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 218.0, "completions/max_terminated_length": 218.0, "completions/mean_length": 151.875, "completions/mean_terminated_length": 151.875, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.15413174923093242, "frac_reward_zero_std": 1.0, "grad_norm": 0.14453125, "kl": 0.03107062610797584, "learning_rate": 7.4192e-06, "loss": 0.0012, "num_tokens": 67026176.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 461.0, "completions/max_terminated_length": 461.0, "completions/mean_length": 296.625, "completions/mean_terminated_length": 296.625, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "epoch": 0.15423782751670734, "frac_reward_zero_std": 0.0, "grad_norm": 1.734375, "kl": 0.03296915185637772, "learning_rate": 7.4188e-06, "loss": -0.006, "num_tokens": 67097236.0, "reward": 2.750486373901367, "reward_std": 0.2804044187068939, "rewards/reward_fn/mean": 2.750486373901367, "rewards/reward_fn/std": 0.2804044485092163, "step": 1454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 995.0, "completions/max_terminated_length": 995.0, "completions/mean_length": 259.5625, "completions/mean_terminated_length": 259.5625, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 0.15434390580248222, "frac_reward_zero_std": 0.0, "grad_norm": 2.453125, "kl": 0.030945046106353402, "learning_rate": 7.4184e-06, "loss": -0.0079, "num_tokens": 67147206.0, "reward": 2.977381944656372, "reward_std": 0.19623248279094696, "rewards/reward_fn/mean": 2.977381944656372, "rewards/reward_fn/std": 0.19623248279094696, "step": 1455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 343.0, "completions/max_terminated_length": 343.0, "completions/mean_length": 77.8125, "completions/mean_terminated_length": 77.8125, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 0.15444998408825714, "frac_reward_zero_std": 1.0, "grad_norm": 0.12890625, "kl": 0.021643459796905518, "learning_rate": 7.418e-06, "loss": 0.0009, "num_tokens": 67174976.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1047.0, "completions/max_terminated_length": 1047.0, "completions/mean_length": 401.6875, "completions/mean_terminated_length": 401.6875, "completions/min_length": 258.0, "completions/min_terminated_length": 258.0, "epoch": 0.15455606237403205, "frac_reward_zero_std": 0.0, "grad_norm": 1.1953125, "kl": 0.021151326596736908, "learning_rate": 7.4176e-06, "loss": 0.0389, "num_tokens": 67226486.0, "reward": 3.7367396354675293, "reward_std": 0.5055917501449585, "rewards/reward_fn/mean": 3.7367396354675293, "rewards/reward_fn/std": 0.5055916905403137, "step": 1457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1139.0, "completions/mean_length": 540.5625, "completions/mean_terminated_length": 491.9354553222656, "completions/min_length": 228.0, "completions/min_terminated_length": 228.0, "epoch": 0.15466214065980693, "frac_reward_zero_std": 0.0, "grad_norm": 1.3515625, "kl": 0.028610155452042818, "learning_rate": 7.4172e-06, "loss": 0.1628, "num_tokens": 67281736.0, "reward": 2.7234995365142822, "reward_std": 0.5659182667732239, "rewards/reward_fn/mean": 2.7234995365142822, "rewards/reward_fn/std": 0.5659182667732239, "step": 1458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 715.0, "completions/max_terminated_length": 715.0, "completions/mean_length": 180.71875, "completions/mean_terminated_length": 180.71875, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.15476821894558185, "frac_reward_zero_std": 0.0, "grad_norm": 2.34375, "kl": 0.0297960857860744, "learning_rate": 7.4168e-06, "loss": 0.0607, "num_tokens": 67324383.0, "reward": 3.912855625152588, "reward_std": 0.2752879559993744, "rewards/reward_fn/mean": 3.912855625152588, "rewards/reward_fn/std": 0.275287926197052, "step": 1459 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 118.0, "completions/max_terminated_length": 118.0, "completions/mean_length": 84.0, "completions/mean_terminated_length": 84.0, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "epoch": 0.15487429723135673, "frac_reward_zero_std": 1.0, "grad_norm": 0.1708984375, "kl": 0.028807405149564147, "learning_rate": 7.4164e-06, "loss": 0.0012, "num_tokens": 67354111.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 925.0, "completions/max_terminated_length": 925.0, "completions/mean_length": 197.25, "completions/mean_terminated_length": 197.25, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 0.15498037551713165, "frac_reward_zero_std": 1.0, "grad_norm": 0.10693359375, "kl": 0.028027324238792062, "learning_rate": 7.416e-06, "loss": 0.0011, "num_tokens": 67389799.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1461 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 541.0, "completions/max_terminated_length": 541.0, "completions/mean_length": 243.375, "completions/mean_terminated_length": 243.375, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 0.15508645380290653, "frac_reward_zero_std": 0.0, "grad_norm": 1.203125, "kl": 0.021245840471237898, "learning_rate": 7.4156e-06, "loss": -0.0155, "num_tokens": 67441011.0, "reward": 3.966932773590088, "reward_std": 0.18705597519874573, "rewards/reward_fn/mean": 3.966932773590088, "rewards/reward_fn/std": 0.1870560199022293, "step": 1462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 592.0, "completions/max_terminated_length": 592.0, "completions/mean_length": 364.6875, "completions/mean_terminated_length": 364.6875, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "epoch": 0.15519253208868145, "frac_reward_zero_std": 0.0, "grad_norm": 1.1015625, "kl": 0.026140006259083748, "learning_rate": 7.415199999999999e-06, "loss": -0.0479, "num_tokens": 67468105.0, "reward": 2.71226167678833, "reward_std": 0.1927633434534073, "rewards/reward_fn/mean": 2.71226167678833, "rewards/reward_fn/std": 0.1927633434534073, "step": 1463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 707.0, "completions/max_terminated_length": 707.0, "completions/mean_length": 264.875, "completions/mean_terminated_length": 264.875, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "epoch": 0.15529861037445636, "frac_reward_zero_std": 0.0, "grad_norm": 1.765625, "kl": 0.03174735209904611, "learning_rate": 7.414799999999999e-06, "loss": -0.0027, "num_tokens": 67493285.0, "reward": 2.8976378440856934, "reward_std": 0.04806054010987282, "rewards/reward_fn/mean": 2.8976378440856934, "rewards/reward_fn/std": 0.04806055501103401, "step": 1464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 276.0, "completions/max_terminated_length": 276.0, "completions/mean_length": 191.15625, "completions/mean_terminated_length": 191.15625, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 0.15540468866023124, "frac_reward_zero_std": 0.0, "grad_norm": 2.1875, "kl": 0.027835983550176024, "learning_rate": 7.4144e-06, "loss": 0.0278, "num_tokens": 67536010.0, "reward": 3.7838172912597656, "reward_std": 0.38195741176605225, "rewards/reward_fn/mean": 3.7838172912597656, "rewards/reward_fn/std": 0.38195741176605225, "step": 1465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1234.0, "completions/max_terminated_length": 1234.0, "completions/mean_length": 191.9375, "completions/mean_terminated_length": 191.9375, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "epoch": 0.15551076694600616, "frac_reward_zero_std": 0.0, "grad_norm": 3.265625, "kl": 0.03196986531838775, "learning_rate": 7.414e-06, "loss": 0.1454, "num_tokens": 67579784.0, "reward": 3.9290502071380615, "reward_std": 0.4013527035713196, "rewards/reward_fn/mean": 3.9290502071380615, "rewards/reward_fn/std": 0.4013526737689972, "step": 1466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 970.0, "completions/max_terminated_length": 970.0, "completions/mean_length": 279.4375, "completions/mean_terminated_length": 279.4375, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "epoch": 0.15561684523178104, "frac_reward_zero_std": 0.0, "grad_norm": 1.703125, "kl": 0.02996737160719931, "learning_rate": 7.4136e-06, "loss": -0.0289, "num_tokens": 67628854.0, "reward": 3.9293787479400635, "reward_std": 0.3994941711425781, "rewards/reward_fn/mean": 3.9293787479400635, "rewards/reward_fn/std": 0.3994941711425781, "step": 1467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 798.0, "completions/max_terminated_length": 798.0, "completions/mean_length": 226.03125, "completions/mean_terminated_length": 226.03125, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "epoch": 0.15572292351755596, "frac_reward_zero_std": 1.0, "grad_norm": 0.0869140625, "kl": 0.02296249126084149, "learning_rate": 7.4132e-06, "loss": 0.0009, "num_tokens": 67658199.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 936.0, "completions/max_terminated_length": 936.0, "completions/mean_length": 243.71875, "completions/mean_terminated_length": 243.71875, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 0.15582900180333087, "frac_reward_zero_std": 0.0, "grad_norm": 1.9140625, "kl": 0.026919973781332374, "learning_rate": 7.4127999999999996e-06, "loss": 0.0093, "num_tokens": 67711342.0, "reward": 2.8784685134887695, "reward_std": 0.3173023462295532, "rewards/reward_fn/mean": 2.8784685134887695, "rewards/reward_fn/std": 0.31730228662490845, "step": 1469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 347.0, "completions/max_terminated_length": 347.0, "completions/mean_length": 220.40625, "completions/mean_terminated_length": 220.40625, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 0.15593508008910575, "frac_reward_zero_std": 0.0, "grad_norm": 2.203125, "kl": 0.025421129539608955, "learning_rate": 7.4123999999999995e-06, "loss": 0.0564, "num_tokens": 67759163.0, "reward": 2.9090018272399902, "reward_std": 0.3549058735370636, "rewards/reward_fn/mean": 2.9090018272399902, "rewards/reward_fn/std": 0.3549058437347412, "step": 1470 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 845.0, "completions/max_terminated_length": 845.0, "completions/mean_length": 239.5625, "completions/mean_terminated_length": 239.5625, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 0.15604115837488067, "frac_reward_zero_std": 1.0, "grad_norm": 0.1416015625, "kl": 0.03431223169900477, "learning_rate": 7.4119999999999995e-06, "loss": 0.0014, "num_tokens": 67809389.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1471 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 220.0, "completions/max_terminated_length": 220.0, "completions/mean_length": 169.40625, "completions/mean_terminated_length": 169.40625, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "epoch": 0.15614723666065555, "frac_reward_zero_std": 1.0, "grad_norm": 0.07666015625, "kl": 0.017876636935397983, "learning_rate": 7.4115999999999995e-06, "loss": 0.0007, "num_tokens": 67834682.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 346.0, "completions/max_terminated_length": 346.0, "completions/mean_length": 197.8125, "completions/mean_terminated_length": 197.8125, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 0.15625331494643047, "frac_reward_zero_std": 0.0, "grad_norm": 1.6171875, "kl": 0.026579777244478464, "learning_rate": 7.4111999999999994e-06, "loss": -0.0256, "num_tokens": 67882676.0, "reward": 3.851898670196533, "reward_std": 0.4995054602622986, "rewards/reward_fn/mean": 3.851898670196533, "rewards/reward_fn/std": 0.49950549006462097, "step": 1473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 900.0, "completions/max_terminated_length": 900.0, "completions/mean_length": 230.40625, "completions/mean_terminated_length": 230.40625, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "epoch": 0.15635939323220538, "frac_reward_zero_std": 1.0, "grad_norm": 0.09716796875, "kl": 0.02615373209118843, "learning_rate": 7.410799999999999e-06, "loss": 0.001, "num_tokens": 67922849.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 584.0, "completions/max_terminated_length": 584.0, "completions/mean_length": 175.96875, "completions/mean_terminated_length": 175.96875, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 0.15646547151798026, "frac_reward_zero_std": 0.0, "grad_norm": 2.375, "kl": 0.024530835915356874, "learning_rate": 7.410399999999999e-06, "loss": -0.0158, "num_tokens": 67964992.0, "reward": 3.9716320037841797, "reward_std": 0.1604730784893036, "rewards/reward_fn/mean": 3.9716320037841797, "rewards/reward_fn/std": 0.1604730784893036, "step": 1475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 768.0, "completions/max_terminated_length": 768.0, "completions/mean_length": 182.84375, "completions/mean_terminated_length": 182.84375, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.15657154980375518, "frac_reward_zero_std": 0.0, "grad_norm": 2.203125, "kl": 0.021006754599511623, "learning_rate": 7.41e-06, "loss": 0.0792, "num_tokens": 68009211.0, "reward": 3.662135362625122, "reward_std": 0.509412944316864, "rewards/reward_fn/mean": 3.662135362625122, "rewards/reward_fn/std": 0.509412944316864, "step": 1476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 678.0, "completions/max_terminated_length": 678.0, "completions/mean_length": 222.3125, "completions/mean_terminated_length": 222.3125, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 0.15667762808953006, "frac_reward_zero_std": 1.0, "grad_norm": 0.09814453125, "kl": 0.023102863458916545, "learning_rate": 7.4096e-06, "loss": 0.0009, "num_tokens": 68057541.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1068.0, "completions/max_terminated_length": 1068.0, "completions/mean_length": 308.875, "completions/mean_terminated_length": 308.875, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 0.15678370637530498, "frac_reward_zero_std": 0.0, "grad_norm": 1.609375, "kl": 0.027743780752643943, "learning_rate": 7.4092e-06, "loss": 0.0634, "num_tokens": 68103777.0, "reward": 3.5075292587280273, "reward_std": 0.9456666707992554, "rewards/reward_fn/mean": 3.5075292587280273, "rewards/reward_fn/std": 0.9456667304039001, "step": 1478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1447.0, "completions/mean_length": 574.75, "completions/mean_terminated_length": 527.2257690429688, "completions/min_length": 252.0, "completions/min_terminated_length": 252.0, "epoch": 0.1568897846610799, "frac_reward_zero_std": 0.0, "grad_norm": 1.5625, "kl": 0.02394183212891221, "learning_rate": 7.4088e-06, "loss": 0.1843, "num_tokens": 68157337.0, "reward": 3.4912631511688232, "reward_std": 0.9075994491577148, "rewards/reward_fn/mean": 3.4912631511688232, "rewards/reward_fn/std": 0.9075994491577148, "step": 1479 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1163.0, "completions/max_terminated_length": 1163.0, "completions/mean_length": 253.9375, "completions/mean_terminated_length": 253.9375, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "epoch": 0.15699586294685478, "frac_reward_zero_std": 0.0, "grad_norm": 2.171875, "kl": 0.036285872804000974, "learning_rate": 7.4084e-06, "loss": -0.0445, "num_tokens": 68200823.0, "reward": 3.738661766052246, "reward_std": 0.4272725582122803, "rewards/reward_fn/mean": 3.738661766052246, "rewards/reward_fn/std": 0.4272725582122803, "step": 1480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 942.0, "completions/max_terminated_length": 942.0, "completions/mean_length": 310.25, "completions/mean_terminated_length": 310.25, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 0.1571019412326297, "frac_reward_zero_std": 0.0, "grad_norm": 1.34375, "kl": 0.021707464940845966, "learning_rate": 7.408e-06, "loss": 0.1426, "num_tokens": 68248127.0, "reward": 2.7692253589630127, "reward_std": 0.04598098248243332, "rewards/reward_fn/mean": 2.7692253589630127, "rewards/reward_fn/std": 0.04598100483417511, "step": 1481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 769.0, "completions/max_terminated_length": 769.0, "completions/mean_length": 259.0625, "completions/mean_terminated_length": 259.0625, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "epoch": 0.15720801951840457, "frac_reward_zero_std": 1.0, "grad_norm": 0.0830078125, "kl": 0.025276067899540067, "learning_rate": 7.4076e-06, "loss": 0.001, "num_tokens": 68301793.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1172.0, "completions/max_terminated_length": 1172.0, "completions/mean_length": 295.5625, "completions/mean_terminated_length": 295.5625, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "epoch": 0.1573140978041795, "frac_reward_zero_std": 0.0, "grad_norm": 1.46875, "kl": 0.0196508695371449, "learning_rate": 7.407199999999999e-06, "loss": 0.0384, "num_tokens": 68347571.0, "reward": 3.862781047821045, "reward_std": 0.43722283840179443, "rewards/reward_fn/mean": 3.862781047821045, "rewards/reward_fn/std": 0.43722283840179443, "step": 1483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 864.0, "completions/max_terminated_length": 864.0, "completions/mean_length": 269.6875, "completions/mean_terminated_length": 269.6875, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 0.1574201760899544, "frac_reward_zero_std": 0.0, "grad_norm": 1.7109375, "kl": 0.031088900519534945, "learning_rate": 7.406799999999999e-06, "loss": 0.0976, "num_tokens": 68400873.0, "reward": 2.848465919494629, "reward_std": 0.06664532423019409, "rewards/reward_fn/mean": 2.848465919494629, "rewards/reward_fn/std": 0.0666453167796135, "step": 1484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 922.0, "completions/max_terminated_length": 922.0, "completions/mean_length": 222.40625, "completions/mean_terminated_length": 222.40625, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 0.15752625437572929, "frac_reward_zero_std": 0.0, "grad_norm": 2.640625, "kl": 0.02430442371405661, "learning_rate": 7.406399999999999e-06, "loss": -0.0518, "num_tokens": 68437174.0, "reward": 3.611813545227051, "reward_std": 0.4808964431285858, "rewards/reward_fn/mean": 3.611813545227051, "rewards/reward_fn/std": 0.4808965027332306, "step": 1485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1996.0, "completions/mean_length": 783.9375, "completions/mean_terminated_length": 653.1724243164062, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "epoch": 0.1576323326615042, "frac_reward_zero_std": 0.0, "grad_norm": 1.4140625, "kl": 0.020748317008838058, "learning_rate": 7.405999999999999e-06, "loss": 0.353, "num_tokens": 68507156.0, "reward": 2.5030746459960938, "reward_std": 0.7754137516021729, "rewards/reward_fn/mean": 2.5030746459960938, "rewards/reward_fn/std": 0.7754136919975281, "step": 1486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 847.0, "completions/mean_length": 430.71875, "completions/mean_terminated_length": 378.5483703613281, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "epoch": 0.15773841094727908, "frac_reward_zero_std": 0.0, "grad_norm": 1.5546875, "kl": 0.027423285646364093, "learning_rate": 7.405599999999999e-06, "loss": 0.213, "num_tokens": 68565451.0, "reward": 3.366755247116089, "reward_std": 0.8312036991119385, "rewards/reward_fn/mean": 3.366755247116089, "rewards/reward_fn/std": 0.8312036991119385, "step": 1487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1758.0, "completions/max_terminated_length": 1758.0, "completions/mean_length": 336.375, "completions/mean_terminated_length": 336.375, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.157844489233054, "frac_reward_zero_std": 0.0, "grad_norm": 2.28125, "kl": 0.03361711511388421, "learning_rate": 7.4052e-06, "loss": 0.1764, "num_tokens": 68633303.0, "reward": 3.351813316345215, "reward_std": 0.7536391615867615, "rewards/reward_fn/mean": 3.351813316345215, "rewards/reward_fn/std": 0.7536391615867615, "step": 1488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 859.0, "completions/max_terminated_length": 859.0, "completions/mean_length": 228.21875, "completions/mean_terminated_length": 228.21875, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 0.15795056751882888, "frac_reward_zero_std": 0.0, "grad_norm": 2.328125, "kl": 0.032002222491428256, "learning_rate": 7.4048e-06, "loss": -0.0099, "num_tokens": 68671102.0, "reward": 3.9690937995910645, "reward_std": 0.17483150959014893, "rewards/reward_fn/mean": 3.9690937995910645, "rewards/reward_fn/std": 0.17483149468898773, "step": 1489 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 117.0, "completions/max_terminated_length": 117.0, "completions/mean_length": 76.8125, "completions/mean_terminated_length": 76.8125, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "epoch": 0.1580566458046038, "frac_reward_zero_std": 1.0, "grad_norm": 0.09033203125, "kl": 0.014173948089592159, "learning_rate": 7.4044e-06, "loss": 0.0006, "num_tokens": 68704120.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1490 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 653.0, "completions/max_terminated_length": 653.0, "completions/mean_length": 184.40625, "completions/mean_terminated_length": 184.40625, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "epoch": 0.1581627240903787, "frac_reward_zero_std": 0.0, "grad_norm": 2.109375, "kl": 0.027895437320694327, "learning_rate": 7.404e-06, "loss": -0.071, "num_tokens": 68741221.0, "reward": 3.5616817474365234, "reward_std": 0.7033450603485107, "rewards/reward_fn/mean": 3.5616817474365234, "rewards/reward_fn/std": 0.7033450603485107, "step": 1491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1376.0, "completions/max_terminated_length": 1376.0, "completions/mean_length": 323.90625, "completions/mean_terminated_length": 323.90625, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "epoch": 0.1582688023761536, "frac_reward_zero_std": 0.0, "grad_norm": 1.5, "kl": 0.029311693971976638, "learning_rate": 7.4036e-06, "loss": 0.118, "num_tokens": 68787522.0, "reward": 2.7153244018554688, "reward_std": 0.4882570505142212, "rewards/reward_fn/mean": 2.7153244018554688, "rewards/reward_fn/std": 0.4882570505142212, "step": 1492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 463.0, "completions/max_terminated_length": 463.0, "completions/mean_length": 331.65625, "completions/mean_terminated_length": 331.65625, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "epoch": 0.1583748806619285, "frac_reward_zero_std": 0.0, "grad_norm": 1.359375, "kl": 0.024782405234873295, "learning_rate": 7.4032e-06, "loss": -0.0318, "num_tokens": 68835991.0, "reward": 2.7767367362976074, "reward_std": 0.3278542459011078, "rewards/reward_fn/mean": 2.7767367362976074, "rewards/reward_fn/std": 0.3278542757034302, "step": 1493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1464.0, "completions/max_terminated_length": 1464.0, "completions/mean_length": 321.53125, "completions/mean_terminated_length": 321.53125, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.1584809589477034, "frac_reward_zero_std": 0.0, "grad_norm": 1.8203125, "kl": 0.030189570039510727, "learning_rate": 7.4028e-06, "loss": 0.209, "num_tokens": 68878216.0, "reward": 2.947617292404175, "reward_std": 0.026048338040709496, "rewards/reward_fn/mean": 2.947617292404175, "rewards/reward_fn/std": 0.026048310101032257, "step": 1494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 473.0, "completions/max_terminated_length": 473.0, "completions/mean_length": 236.3125, "completions/mean_terminated_length": 236.3125, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 0.1585870372334783, "frac_reward_zero_std": 1.0, "grad_norm": 0.078125, "kl": 0.023946196772158146, "learning_rate": 7.4024e-06, "loss": 0.001, "num_tokens": 68921106.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 527.0, "completions/max_terminated_length": 527.0, "completions/mean_length": 162.21875, "completions/mean_terminated_length": 162.21875, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "epoch": 0.15869311551925322, "frac_reward_zero_std": 1.0, "grad_norm": 0.1279296875, "kl": 0.026347257429733872, "learning_rate": 7.402e-06, "loss": 0.0011, "num_tokens": 68962841.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 494.0, "completions/max_terminated_length": 494.0, "completions/mean_length": 242.6875, "completions/mean_terminated_length": 242.6875, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 0.1587991938050281, "frac_reward_zero_std": 0.0, "grad_norm": 2.109375, "kl": 0.026496655773371458, "learning_rate": 7.4015999999999996e-06, "loss": -0.0276, "num_tokens": 69005231.0, "reward": 3.8105435371398926, "reward_std": 0.4018127918243408, "rewards/reward_fn/mean": 3.8105435371398926, "rewards/reward_fn/std": 0.40181276202201843, "step": 1497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 476.0, "completions/max_terminated_length": 476.0, "completions/mean_length": 312.21875, "completions/mean_terminated_length": 312.21875, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "epoch": 0.15890527209080302, "frac_reward_zero_std": 0.0, "grad_norm": 1.6953125, "kl": 0.021861796732991934, "learning_rate": 7.4011999999999995e-06, "loss": 0.0307, "num_tokens": 69063862.0, "reward": 2.7844386100769043, "reward_std": 1.1609641313552856, "rewards/reward_fn/mean": 2.7844386100769043, "rewards/reward_fn/std": 1.160964012145996, "step": 1498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1797.0, "completions/mean_length": 708.375, "completions/mean_terminated_length": 619.0667114257812, "completions/min_length": 286.0, "completions/min_terminated_length": 286.0, "epoch": 0.1590113503765779, "frac_reward_zero_std": 0.0, "grad_norm": 1.421875, "kl": 0.029261509189382195, "learning_rate": 7.4007999999999995e-06, "loss": 0.1809, "num_tokens": 69123202.0, "reward": 2.281765937805176, "reward_std": 0.7985396385192871, "rewards/reward_fn/mean": 2.281765937805176, "rewards/reward_fn/std": 0.7985396385192871, "step": 1499 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1075.0, "completions/max_terminated_length": 1075.0, "completions/mean_length": 333.03125, "completions/mean_terminated_length": 333.03125, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "epoch": 0.15911742866235282, "frac_reward_zero_std": 0.0, "grad_norm": 1.9921875, "kl": 0.0297744101844728, "learning_rate": 7.4004e-06, "loss": 0.0043, "num_tokens": 69176387.0, "reward": 2.7405807971954346, "reward_std": 0.37365394830703735, "rewards/reward_fn/mean": 2.7405807971954346, "rewards/reward_fn/std": 0.37365394830703735, "step": 1500 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 2048.0, "completions/max_terminated_length": 968.0, "completions/mean_length": 580.5625, "completions/mean_terminated_length": 482.7333679199219, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.15922350694812773, "frac_reward_zero_std": 0.0, "grad_norm": 1.6484375, "kl": 0.02315366081893444, "learning_rate": 7.4e-06, "loss": 0.381, "num_tokens": 69237973.0, "reward": 2.747545003890991, "reward_std": 0.7242361307144165, "rewards/reward_fn/mean": 2.747545003890991, "rewards/reward_fn/std": 0.7242361307144165, "step": 1501 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 977.0, "completions/max_terminated_length": 977.0, "completions/mean_length": 269.875, "completions/mean_terminated_length": 269.875, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 0.15932958523390262, "frac_reward_zero_std": 1.0, "grad_norm": 0.208984375, "kl": 0.024919069837778807, "learning_rate": 7.3996e-06, "loss": 0.001, "num_tokens": 69265745.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1502 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 750.0, "completions/max_terminated_length": 750.0, "completions/mean_length": 360.5625, "completions/mean_terminated_length": 360.5625, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "epoch": 0.15943566351967753, "frac_reward_zero_std": 0.0, "grad_norm": 1.8984375, "kl": 0.03635862981900573, "learning_rate": 7.3992e-06, "loss": -0.0237, "num_tokens": 69309059.0, "reward": 2.4815585613250732, "reward_std": 0.4622640609741211, "rewards/reward_fn/mean": 2.4815585613250732, "rewards/reward_fn/std": 0.46226412057876587, "step": 1503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 657.0, "completions/max_terminated_length": 657.0, "completions/mean_length": 277.4375, "completions/mean_terminated_length": 277.4375, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "epoch": 0.15954174180545241, "frac_reward_zero_std": 1.0, "grad_norm": 0.078125, "kl": 0.022342822514474392, "learning_rate": 7.398799999999999e-06, "loss": 0.0009, "num_tokens": 69362417.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 674.0, "completions/max_terminated_length": 674.0, "completions/mean_length": 290.0, "completions/mean_terminated_length": 290.0, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.15964782009122733, "frac_reward_zero_std": 0.0, "grad_norm": 1.8671875, "kl": 0.025484284618869424, "learning_rate": 7.398399999999999e-06, "loss": 0.1144, "num_tokens": 69405713.0, "reward": 2.7833831310272217, "reward_std": 0.04491547495126724, "rewards/reward_fn/mean": 2.7833831310272217, "rewards/reward_fn/std": 0.04491545632481575, "step": 1505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 841.0, "completions/max_terminated_length": 841.0, "completions/mean_length": 210.4375, "completions/mean_terminated_length": 210.4375, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 0.15975389837700224, "frac_reward_zero_std": 0.0, "grad_norm": 1.8515625, "kl": 0.02738275215961039, "learning_rate": 7.397999999999999e-06, "loss": -0.0175, "num_tokens": 69449247.0, "reward": 3.937028646469116, "reward_std": 0.24817818403244019, "rewards/reward_fn/mean": 3.937028646469116, "rewards/reward_fn/std": 0.24817822873592377, "step": 1506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 653.0, "completions/max_terminated_length": 653.0, "completions/mean_length": 273.875, "completions/mean_terminated_length": 273.875, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.15985997666277713, "frac_reward_zero_std": 1.0, "grad_norm": 0.080078125, "kl": 0.02079133247025311, "learning_rate": 7.397599999999999e-06, "loss": 0.0008, "num_tokens": 69498619.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 865.0, "completions/max_terminated_length": 865.0, "completions/mean_length": 205.09375, "completions/mean_terminated_length": 205.09375, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 0.15996605494855204, "frac_reward_zero_std": 0.0, "grad_norm": 2.453125, "kl": 0.031059396918863058, "learning_rate": 7.397199999999999e-06, "loss": 0.2326, "num_tokens": 69547902.0, "reward": 3.931945323944092, "reward_std": 0.2682742774486542, "rewards/reward_fn/mean": 3.931945323944092, "rewards/reward_fn/std": 0.2682742774486542, "step": 1508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 964.0, "completions/max_terminated_length": 964.0, "completions/mean_length": 348.375, "completions/mean_terminated_length": 348.375, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 0.16007213323432692, "frac_reward_zero_std": 0.0, "grad_norm": 1.5390625, "kl": 0.025881059700623155, "learning_rate": 7.396799999999999e-06, "loss": -0.0134, "num_tokens": 69599018.0, "reward": 2.5926716327667236, "reward_std": 0.1885869950056076, "rewards/reward_fn/mean": 2.5926716327667236, "rewards/reward_fn/std": 0.1885869950056076, "step": 1509 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 959.0, "completions/max_terminated_length": 959.0, "completions/mean_length": 242.125, "completions/mean_terminated_length": 242.125, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.16017821152010184, "frac_reward_zero_std": 1.0, "grad_norm": 0.08447265625, "kl": 0.020898002781905234, "learning_rate": 7.396399999999999e-06, "loss": 0.0008, "num_tokens": 69646190.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1510 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 884.0, "completions/max_terminated_length": 884.0, "completions/mean_length": 298.71875, "completions/mean_terminated_length": 298.71875, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "epoch": 0.16028428980587675, "frac_reward_zero_std": 0.0, "grad_norm": 1.515625, "kl": 0.02123245596885681, "learning_rate": 7.395999999999999e-06, "loss": -0.0262, "num_tokens": 69696421.0, "reward": 3.7587828636169434, "reward_std": 0.6745774149894714, "rewards/reward_fn/mean": 3.7587828636169434, "rewards/reward_fn/std": 0.6745774745941162, "step": 1511 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 518.0, "completions/max_terminated_length": 518.0, "completions/mean_length": 360.75, "completions/mean_terminated_length": 360.75, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "epoch": 0.16039036809165164, "frac_reward_zero_std": 0.0, "grad_norm": 1.859375, "kl": 0.029440977377817035, "learning_rate": 7.3956e-06, "loss": 0.0316, "num_tokens": 69741565.0, "reward": 3.9266085624694824, "reward_std": 0.41516539454460144, "rewards/reward_fn/mean": 3.9266085624694824, "rewards/reward_fn/std": 0.41516542434692383, "step": 1512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 485.0, "completions/max_terminated_length": 485.0, "completions/mean_length": 177.0625, "completions/mean_terminated_length": 177.0625, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.16049644637742655, "frac_reward_zero_std": 0.0, "grad_norm": 2.296875, "kl": 0.015543692628853023, "learning_rate": 7.3952e-06, "loss": -0.0021, "num_tokens": 69788575.0, "reward": 3.473254680633545, "reward_std": 0.8447170853614807, "rewards/reward_fn/mean": 3.473254680633545, "rewards/reward_fn/std": 0.8447170853614807, "step": 1513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 733.0, "completions/max_terminated_length": 733.0, "completions/mean_length": 410.625, "completions/mean_terminated_length": 410.625, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 0.16060252466320143, "frac_reward_zero_std": 0.0, "grad_norm": 1.8046875, "kl": 0.0255625550635159, "learning_rate": 7.3948e-06, "loss": 0.1049, "num_tokens": 69835827.0, "reward": 2.6098246574401855, "reward_std": 0.272195965051651, "rewards/reward_fn/mean": 2.6098246574401855, "rewards/reward_fn/std": 0.272195965051651, "step": 1514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 859.0, "completions/max_terminated_length": 859.0, "completions/mean_length": 227.625, "completions/mean_terminated_length": 227.625, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 0.16070860294897635, "frac_reward_zero_std": 1.0, "grad_norm": 0.0732421875, "kl": 0.0203583559487015, "learning_rate": 7.3944e-06, "loss": 0.0008, "num_tokens": 69890279.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 268.0, "completions/max_terminated_length": 268.0, "completions/mean_length": 176.90625, "completions/mean_terminated_length": 176.90625, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 0.16081468123475123, "frac_reward_zero_std": 1.0, "grad_norm": 0.08984375, "kl": 0.019284927984699607, "learning_rate": 7.394e-06, "loss": 0.0008, "num_tokens": 69928644.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 259.0, "completions/max_terminated_length": 259.0, "completions/mean_length": 166.0, "completions/mean_terminated_length": 166.0, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 0.16092075952052615, "frac_reward_zero_std": 0.0, "grad_norm": 1.71875, "kl": 0.05426881415769458, "learning_rate": 7.3936e-06, "loss": 0.0055, "num_tokens": 69978820.0, "reward": 3.9703755378723145, "reward_std": 0.16758133471012115, "rewards/reward_fn/mean": 3.9703755378723145, "rewards/reward_fn/std": 0.16758134961128235, "step": 1517 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 807.0, "completions/mean_length": 452.96875, "completions/mean_terminated_length": 401.51611328125, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "epoch": 0.16102683780630106, "frac_reward_zero_std": 0.0, "grad_norm": 1.5703125, "kl": 0.02300673839636147, "learning_rate": 7.3932e-06, "loss": 0.2649, "num_tokens": 70017987.0, "reward": 2.9328155517578125, "reward_std": 0.6929539442062378, "rewards/reward_fn/mean": 2.9328155517578125, "rewards/reward_fn/std": 0.6929539442062378, "step": 1518 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 86.5625, "completions/mean_terminated_length": 86.5625, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "epoch": 0.16113291609207595, "frac_reward_zero_std": 1.0, "grad_norm": 0.083984375, "kl": 0.013300622056704015, "learning_rate": 7.3928e-06, "loss": 0.0005, "num_tokens": 70041269.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1519 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 630.0, "completions/max_terminated_length": 630.0, "completions/mean_length": 251.375, "completions/mean_terminated_length": 251.375, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 0.16123899437785086, "frac_reward_zero_std": 1.0, "grad_norm": 0.07958984375, "kl": 0.020807479857467115, "learning_rate": 7.3924e-06, "loss": 0.0008, "num_tokens": 70104513.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1520 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 505.0, "completions/max_terminated_length": 505.0, "completions/mean_length": 144.375, "completions/mean_terminated_length": 144.375, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "epoch": 0.16134507266362574, "frac_reward_zero_std": 0.0, "grad_norm": 2.3125, "kl": 0.017212234903126955, "learning_rate": 7.392e-06, "loss": 0.1497, "num_tokens": 70155629.0, "reward": 2.998427629470825, "reward_std": 0.039904408156871796, "rewards/reward_fn/mean": 2.998427629470825, "rewards/reward_fn/std": 0.039904408156871796, "step": 1521 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 884.0, "completions/max_terminated_length": 884.0, "completions/mean_length": 270.8125, "completions/mean_terminated_length": 270.8125, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "epoch": 0.16145115094940066, "frac_reward_zero_std": 1.0, "grad_norm": 0.1396484375, "kl": 0.027058127569034696, "learning_rate": 7.3916e-06, "loss": 0.0011, "num_tokens": 70211559.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1306.0, "completions/max_terminated_length": 1306.0, "completions/mean_length": 404.4375, "completions/mean_terminated_length": 404.4375, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "epoch": 0.16155722923517557, "frac_reward_zero_std": 0.0, "grad_norm": 1.5078125, "kl": 0.02585031627677381, "learning_rate": 7.3912000000000005e-06, "loss": -0.0859, "num_tokens": 70285653.0, "reward": 3.6108155250549316, "reward_std": 0.5108808875083923, "rewards/reward_fn/mean": 3.6108155250549316, "rewards/reward_fn/std": 0.5108808279037476, "step": 1523 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 906.0, "completions/max_terminated_length": 906.0, "completions/mean_length": 265.375, "completions/mean_terminated_length": 265.375, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 0.16166330752095046, "frac_reward_zero_std": 0.0, "grad_norm": 1.6875, "kl": 0.021037982078269124, "learning_rate": 7.3908e-06, "loss": -0.0172, "num_tokens": 70327169.0, "reward": 3.0207977294921875, "reward_std": 0.037842877209186554, "rewards/reward_fn/mean": 3.0207977294921875, "rewards/reward_fn/std": 0.03784283623099327, "step": 1524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 507.0, "completions/max_terminated_length": 507.0, "completions/mean_length": 102.78125, "completions/mean_terminated_length": 102.78125, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "epoch": 0.16176938580672537, "frac_reward_zero_std": 0.0, "grad_norm": 2.9375, "kl": 0.02766521042212844, "learning_rate": 7.3904e-06, "loss": -0.0696, "num_tokens": 70366170.0, "reward": 3.8455190658569336, "reward_std": 0.3647652268409729, "rewards/reward_fn/mean": 3.8455190658569336, "rewards/reward_fn/std": 0.3647651970386505, "step": 1525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1333.0, "completions/max_terminated_length": 1333.0, "completions/mean_length": 192.96875, "completions/mean_terminated_length": 192.96875, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 0.16187546409250025, "frac_reward_zero_std": 1.0, "grad_norm": 0.0859375, "kl": 0.020648099714890122, "learning_rate": 7.3899999999999995e-06, "loss": 0.0008, "num_tokens": 70402553.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1526 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 215.0, "completions/max_terminated_length": 215.0, "completions/mean_length": 141.09375, "completions/mean_terminated_length": 141.09375, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "epoch": 0.16198154237827517, "frac_reward_zero_std": 1.0, "grad_norm": 0.140625, "kl": 0.02335872733965516, "learning_rate": 7.3895999999999995e-06, "loss": 0.0009, "num_tokens": 70447708.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 976.0, "completions/max_terminated_length": 976.0, "completions/mean_length": 201.6875, "completions/mean_terminated_length": 201.6875, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 0.16208762066405008, "frac_reward_zero_std": 1.0, "grad_norm": 0.08837890625, "kl": 0.02256969455629587, "learning_rate": 7.3891999999999995e-06, "loss": 0.0009, "num_tokens": 70483954.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1466.0, "completions/max_terminated_length": 1466.0, "completions/mean_length": 353.5625, "completions/mean_terminated_length": 353.5625, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "epoch": 0.16219369894982497, "frac_reward_zero_std": 0.0, "grad_norm": 1.90625, "kl": 0.042810263112187386, "learning_rate": 7.3887999999999995e-06, "loss": 0.0261, "num_tokens": 70534692.0, "reward": 2.9112634658813477, "reward_std": 0.054259590804576874, "rewards/reward_fn/mean": 2.9112634658813477, "rewards/reward_fn/std": 0.054259564727544785, "step": 1529 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 795.0, "completions/max_terminated_length": 795.0, "completions/mean_length": 203.75, "completions/mean_terminated_length": 203.75, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "epoch": 0.16229977723559988, "frac_reward_zero_std": 1.0, "grad_norm": 0.1474609375, "kl": 0.027603084221482277, "learning_rate": 7.3883999999999994e-06, "loss": 0.0011, "num_tokens": 70570908.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1530 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1341.0, "completions/max_terminated_length": 1341.0, "completions/mean_length": 443.78125, "completions/mean_terminated_length": 443.78125, "completions/min_length": 286.0, "completions/min_terminated_length": 286.0, "epoch": 0.16240585552137476, "frac_reward_zero_std": 0.0, "grad_norm": 1.6875, "kl": 0.0299779511988163, "learning_rate": 7.387999999999999e-06, "loss": 0.0348, "num_tokens": 70638805.0, "reward": 2.931946277618408, "reward_std": 0.0676136463880539, "rewards/reward_fn/mean": 2.931946277618408, "rewards/reward_fn/std": 0.06761366128921509, "step": 1531 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1183.0, "completions/max_terminated_length": 1183.0, "completions/mean_length": 322.625, "completions/mean_terminated_length": 322.625, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 0.16251193380714968, "frac_reward_zero_std": 0.0, "grad_norm": 2.015625, "kl": 0.02347099781036377, "learning_rate": 7.387599999999999e-06, "loss": 0.0192, "num_tokens": 70693225.0, "reward": 3.963925838470459, "reward_std": 0.2040664255619049, "rewards/reward_fn/mean": 3.963925838470459, "rewards/reward_fn/std": 0.2040664404630661, "step": 1532 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 685.0, "completions/max_terminated_length": 685.0, "completions/mean_length": 175.59375, "completions/mean_terminated_length": 175.59375, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.1626180120929246, "frac_reward_zero_std": 0.0, "grad_norm": 3.0625, "kl": 0.028762807371094823, "learning_rate": 7.387199999999999e-06, "loss": 0.0637, "num_tokens": 70734460.0, "reward": 3.963350296020508, "reward_std": 0.20732258260250092, "rewards/reward_fn/mean": 3.963350296020508, "rewards/reward_fn/std": 0.20732256770133972, "step": 1533 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 719.0, "completions/max_terminated_length": 719.0, "completions/mean_length": 266.03125, "completions/mean_terminated_length": 266.03125, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 0.16272409037869948, "frac_reward_zero_std": 1.0, "grad_norm": 0.076171875, "kl": 0.018660985631868243, "learning_rate": 7.386799999999999e-06, "loss": 0.0007, "num_tokens": 70788509.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 701.0, "completions/max_terminated_length": 701.0, "completions/mean_length": 164.75, "completions/mean_terminated_length": 164.75, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "epoch": 0.1628301686644744, "frac_reward_zero_std": 0.0, "grad_norm": 2.9375, "kl": 0.027147594606503844, "learning_rate": 7.3864e-06, "loss": 0.1217, "num_tokens": 70835989.0, "reward": 3.89919376373291, "reward_std": 0.3184683322906494, "rewards/reward_fn/mean": 3.89919376373291, "rewards/reward_fn/std": 0.318468302488327, "step": 1535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 167.0, "completions/max_terminated_length": 167.0, "completions/mean_length": 92.625, "completions/mean_terminated_length": 92.625, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "epoch": 0.16293624695024927, "frac_reward_zero_std": 1.0, "grad_norm": 0.126953125, "kl": 0.01960705651436001, "learning_rate": 7.386e-06, "loss": 0.0008, "num_tokens": 70873065.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 431.0, "completions/max_terminated_length": 431.0, "completions/mean_length": 145.71875, "completions/mean_terminated_length": 145.71875, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "epoch": 0.1630423252360242, "frac_reward_zero_std": 1.0, "grad_norm": 0.09716796875, "kl": 0.019639571546576917, "learning_rate": 7.3856e-06, "loss": 0.0008, "num_tokens": 70920544.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 205.0, "completions/max_terminated_length": 205.0, "completions/mean_length": 153.40625, "completions/mean_terminated_length": 153.40625, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.1631484035217991, "frac_reward_zero_std": 1.0, "grad_norm": 0.12060546875, "kl": 0.021374219097197056, "learning_rate": 7.3852e-06, "loss": 0.0009, "num_tokens": 70947181.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1538 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1104.0, "completions/max_terminated_length": 1104.0, "completions/mean_length": 327.65625, "completions/mean_terminated_length": 327.65625, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.163254481807574, "frac_reward_zero_std": 0.0, "grad_norm": 1.625, "kl": 0.030522728571668267, "learning_rate": 7.3848e-06, "loss": 0.0377, "num_tokens": 70987554.0, "reward": 2.9481630325317383, "reward_std": 0.22395570576190948, "rewards/reward_fn/mean": 2.9481630325317383, "rewards/reward_fn/std": 0.22395570576190948, "step": 1539 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 944.0, "completions/max_terminated_length": 944.0, "completions/mean_length": 466.15625, "completions/mean_terminated_length": 466.15625, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "epoch": 0.1633605600933489, "frac_reward_zero_std": 0.0, "grad_norm": 1.6875, "kl": 0.029136600205674767, "learning_rate": 7.3844e-06, "loss": 0.047, "num_tokens": 71050631.0, "reward": 2.643458366394043, "reward_std": 0.3626547157764435, "rewards/reward_fn/mean": 2.643458366394043, "rewards/reward_fn/std": 0.3626546859741211, "step": 1540 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 873.0, "completions/max_terminated_length": 873.0, "completions/mean_length": 280.8125, "completions/mean_terminated_length": 280.8125, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.16346663837912379, "frac_reward_zero_std": 0.0, "grad_norm": 1.859375, "kl": 0.026954283006489277, "learning_rate": 7.384e-06, "loss": 0.0118, "num_tokens": 71082049.0, "reward": 3.7240138053894043, "reward_std": 0.44852492213249207, "rewards/reward_fn/mean": 3.7240138053894043, "rewards/reward_fn/std": 0.4485248923301697, "step": 1541 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1465.0, "completions/max_terminated_length": 1465.0, "completions/mean_length": 354.84375, "completions/mean_terminated_length": 354.84375, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 0.1635727166648987, "frac_reward_zero_std": 0.0, "grad_norm": 0.53515625, "kl": 0.025874186540022492, "learning_rate": 7.3836e-06, "loss": -0.1326, "num_tokens": 71132028.0, "reward": 2.750469207763672, "reward_std": 0.20693431794643402, "rewards/reward_fn/mean": 2.750469207763672, "rewards/reward_fn/std": 0.20693430304527283, "step": 1542 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 400.0, "completions/max_terminated_length": 400.0, "completions/mean_length": 248.59375, "completions/mean_terminated_length": 248.59375, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.16367879495067358, "frac_reward_zero_std": 1.0, "grad_norm": 0.111328125, "kl": 0.0319938138127327, "learning_rate": 7.3832e-06, "loss": 0.0013, "num_tokens": 71177263.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1375.0, "completions/mean_length": 798.59375, "completions/mean_terminated_length": 758.290283203125, "completions/min_length": 410.0, "completions/min_terminated_length": 410.0, "epoch": 0.1637848732364485, "frac_reward_zero_std": 0.0, "grad_norm": 1.1796875, "kl": 0.021971626905724406, "learning_rate": 7.382799999999999e-06, "loss": 0.1853, "num_tokens": 71245058.0, "reward": 2.5030364990234375, "reward_std": 0.6058197021484375, "rewards/reward_fn/mean": 2.5030364990234375, "rewards/reward_fn/std": 0.6058197021484375, "step": 1544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 588.0, "completions/max_terminated_length": 588.0, "completions/mean_length": 197.3125, "completions/mean_terminated_length": 197.3125, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.1638909515222234, "frac_reward_zero_std": 1.0, "grad_norm": 0.09765625, "kl": 0.020507124136202037, "learning_rate": 7.382399999999999e-06, "loss": 0.0008, "num_tokens": 71277932.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 350.0, "completions/max_terminated_length": 350.0, "completions/mean_length": 262.3125, "completions/mean_terminated_length": 262.3125, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "epoch": 0.1639970298079983, "frac_reward_zero_std": 0.0, "grad_norm": 1.734375, "kl": 0.027538377791643143, "learning_rate": 7.381999999999999e-06, "loss": 0.0391, "num_tokens": 71344630.0, "reward": 2.704584836959839, "reward_std": 0.04232628643512726, "rewards/reward_fn/mean": 2.704584836959839, "rewards/reward_fn/std": 0.042326249182224274, "step": 1546 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1643.0, "completions/mean_length": 497.6875, "completions/mean_terminated_length": 447.6773986816406, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.1641031080937732, "frac_reward_zero_std": 0.0, "grad_norm": 1.625, "kl": 0.03232635045424104, "learning_rate": 7.3816e-06, "loss": 0.0815, "num_tokens": 71395532.0, "reward": 2.349479913711548, "reward_std": 0.6918059587478638, "rewards/reward_fn/mean": 2.349479913711548, "rewards/reward_fn/std": 0.6918059587478638, "step": 1547 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/max_terminated_length": 361.0, "completions/mean_length": 255.40625, "completions/mean_terminated_length": 255.40625, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "epoch": 0.1642091863795481, "frac_reward_zero_std": 1.0, "grad_norm": 0.08154296875, "kl": 0.018391662510111928, "learning_rate": 7.3812e-06, "loss": 0.0007, "num_tokens": 71460441.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 921.0, "completions/max_terminated_length": 921.0, "completions/mean_length": 456.8125, "completions/mean_terminated_length": 456.8125, "completions/min_length": 241.0, "completions/min_terminated_length": 241.0, "epoch": 0.164315264665323, "frac_reward_zero_std": 0.0, "grad_norm": 1.453125, "kl": 0.02743516000919044, "learning_rate": 7.3808e-06, "loss": 0.0942, "num_tokens": 71514451.0, "reward": 3.201913356781006, "reward_std": 0.6840846538543701, "rewards/reward_fn/mean": 3.201913356781006, "rewards/reward_fn/std": 0.6840846538543701, "step": 1549 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 828.0, "completions/max_terminated_length": 828.0, "completions/mean_length": 287.1875, "completions/mean_terminated_length": 287.1875, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "epoch": 0.16442134295109792, "frac_reward_zero_std": 1.0, "grad_norm": 0.1015625, "kl": 0.02359214937314391, "learning_rate": 7.3804e-06, "loss": 0.0009, "num_tokens": 71560633.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1550 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1822.0, "completions/max_terminated_length": 1822.0, "completions/mean_length": 379.65625, "completions/mean_terminated_length": 379.65625, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.1645274212368728, "frac_reward_zero_std": 0.0, "grad_norm": 1.7109375, "kl": 0.027676088735461235, "learning_rate": 7.38e-06, "loss": 0.0507, "num_tokens": 71621486.0, "reward": 2.9698777198791504, "reward_std": 0.056896451860666275, "rewards/reward_fn/mean": 2.9698777198791504, "rewards/reward_fn/std": 0.05689648166298866, "step": 1551 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 947.0, "completions/max_terminated_length": 947.0, "completions/mean_length": 294.40625, "completions/mean_terminated_length": 294.40625, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "epoch": 0.16463349952264772, "frac_reward_zero_std": 0.0, "grad_norm": 1.8203125, "kl": 0.022418924141675234, "learning_rate": 7.3796e-06, "loss": 0.0377, "num_tokens": 71668123.0, "reward": 2.8232269287109375, "reward_std": 0.2750570774078369, "rewards/reward_fn/mean": 2.8232269287109375, "rewards/reward_fn/std": 0.2750571072101593, "step": 1552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1089.0, "completions/max_terminated_length": 1089.0, "completions/mean_length": 321.78125, "completions/mean_terminated_length": 321.78125, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "epoch": 0.1647395778084226, "frac_reward_zero_std": 0.0, "grad_norm": 1.640625, "kl": 0.030020848847925663, "learning_rate": 7.3792e-06, "loss": 0.073, "num_tokens": 71715380.0, "reward": 3.7270565032958984, "reward_std": 0.5574728846549988, "rewards/reward_fn/mean": 3.7270565032958984, "rewards/reward_fn/std": 0.5574728846549988, "step": 1553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 1024.0, "completions/mean_length": 282.1875, "completions/mean_terminated_length": 282.1875, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.16484565609419752, "frac_reward_zero_std": 0.0, "grad_norm": 1.734375, "kl": 0.01917832251638174, "learning_rate": 7.3787999999999996e-06, "loss": -0.0677, "num_tokens": 71762618.0, "reward": 3.861374616622925, "reward_std": 0.37316587567329407, "rewards/reward_fn/mean": 3.861374616622925, "rewards/reward_fn/std": 0.37316587567329407, "step": 1554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 357.0, "completions/max_terminated_length": 357.0, "completions/mean_length": 219.625, "completions/mean_terminated_length": 219.625, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 0.16495173437997243, "frac_reward_zero_std": 0.0, "grad_norm": 1.703125, "kl": 0.028847611974924803, "learning_rate": 7.3783999999999995e-06, "loss": -0.0166, "num_tokens": 71807982.0, "reward": 3.9281513690948486, "reward_std": 0.40643757581710815, "rewards/reward_fn/mean": 3.9281513690948486, "rewards/reward_fn/std": 0.40643760561943054, "step": 1555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1117.0, "completions/max_terminated_length": 1117.0, "completions/mean_length": 321.25, "completions/mean_terminated_length": 321.25, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "epoch": 0.16505781266574732, "frac_reward_zero_std": 0.0, "grad_norm": 1.6328125, "kl": 0.03293606429360807, "learning_rate": 7.3779999999999995e-06, "loss": 0.1044, "num_tokens": 71862006.0, "reward": 2.8002994060516357, "reward_std": 0.053148169070482254, "rewards/reward_fn/mean": 2.8002994060516357, "rewards/reward_fn/std": 0.05314814671874046, "step": 1556 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1651.0, "completions/max_terminated_length": 1651.0, "completions/mean_length": 360.21875, "completions/mean_terminated_length": 360.21875, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "epoch": 0.16516389095152223, "frac_reward_zero_std": 0.0, "grad_norm": 1.625, "kl": 0.031179531943053007, "learning_rate": 7.3775999999999995e-06, "loss": 0.0442, "num_tokens": 71911549.0, "reward": 2.779324531555176, "reward_std": 0.4085378348827362, "rewards/reward_fn/mean": 2.779324531555176, "rewards/reward_fn/std": 0.4085378348827362, "step": 1557 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 976.0, "completions/mean_length": 500.25, "completions/mean_terminated_length": 450.32257080078125, "completions/min_length": 243.0, "completions/min_terminated_length": 243.0, "epoch": 0.16526996923729712, "frac_reward_zero_std": 0.0, "grad_norm": 1.5546875, "kl": 0.027880383422598243, "learning_rate": 7.3772e-06, "loss": 0.2164, "num_tokens": 71978277.0, "reward": 2.6912436485290527, "reward_std": 0.5629785060882568, "rewards/reward_fn/mean": 2.6912436485290527, "rewards/reward_fn/std": 0.5629785060882568, "step": 1558 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 856.0, "completions/max_terminated_length": 856.0, "completions/mean_length": 217.65625, "completions/mean_terminated_length": 217.65625, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.16537604752307203, "frac_reward_zero_std": 0.0, "grad_norm": 2.375, "kl": 0.025317950639873743, "learning_rate": 7.3768e-06, "loss": -0.0064, "num_tokens": 72020922.0, "reward": 3.0498218536376953, "reward_std": 0.0369083546102047, "rewards/reward_fn/mean": 3.0498218536376953, "rewards/reward_fn/std": 0.0369083397090435, "step": 1559 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1086.0, "completions/max_terminated_length": 1086.0, "completions/mean_length": 262.90625, "completions/mean_terminated_length": 262.90625, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "epoch": 0.16548212580884694, "frac_reward_zero_std": 0.0, "grad_norm": 1.7890625, "kl": 0.02574088191613555, "learning_rate": 7.3764e-06, "loss": -0.0193, "num_tokens": 72049975.0, "reward": 3.622433662414551, "reward_std": 0.8289299607276917, "rewards/reward_fn/mean": 3.622433662414551, "rewards/reward_fn/std": 0.8289299607276917, "step": 1560 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 417.0, "completions/max_terminated_length": 417.0, "completions/mean_length": 152.625, "completions/mean_terminated_length": 152.625, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.16558820409462183, "frac_reward_zero_std": 1.0, "grad_norm": 0.12353515625, "kl": 0.021092961658723652, "learning_rate": 7.376e-06, "loss": 0.0008, "num_tokens": 72086347.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1561 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 539.0, "completions/max_terminated_length": 539.0, "completions/mean_length": 124.40625, "completions/mean_terminated_length": 124.40625, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "epoch": 0.16569428238039674, "frac_reward_zero_std": 0.0, "grad_norm": 2.71875, "kl": 0.029914353508502245, "learning_rate": 7.3756e-06, "loss": 0.1327, "num_tokens": 72125208.0, "reward": 2.8506991863250732, "reward_std": 0.03070419654250145, "rewards/reward_fn/mean": 2.8506991863250732, "rewards/reward_fn/std": 0.030704230070114136, "step": 1562 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 425.0, "completions/max_terminated_length": 425.0, "completions/mean_length": 271.53125, "completions/mean_terminated_length": 271.53125, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "epoch": 0.16580036066617163, "frac_reward_zero_std": 0.0, "grad_norm": 1.5546875, "kl": 0.03901820583269, "learning_rate": 7.3752e-06, "loss": 0.0406, "num_tokens": 72173097.0, "reward": 3.0672106742858887, "reward_std": 0.3122633397579193, "rewards/reward_fn/mean": 3.0672106742858887, "rewards/reward_fn/std": 0.3122633397579193, "step": 1563 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 548.0, "completions/max_terminated_length": 548.0, "completions/mean_length": 160.21875, "completions/mean_terminated_length": 160.21875, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.16590643895194654, "frac_reward_zero_std": 0.0, "grad_norm": 2.390625, "kl": 0.029396470403298736, "learning_rate": 7.374799999999999e-06, "loss": -0.0484, "num_tokens": 72198128.0, "reward": 3.639256477355957, "reward_std": 0.5450024604797363, "rewards/reward_fn/mean": 3.639256477355957, "rewards/reward_fn/std": 0.5450024604797363, "step": 1564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 616.0, "completions/max_terminated_length": 616.0, "completions/mean_length": 185.875, "completions/mean_terminated_length": 185.875, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 0.16601251723772145, "frac_reward_zero_std": 0.0, "grad_norm": 2.046875, "kl": 0.05406060954555869, "learning_rate": 7.374399999999999e-06, "loss": 0.0989, "num_tokens": 72225964.0, "reward": 3.8368563652038574, "reward_std": 0.3858475089073181, "rewards/reward_fn/mean": 3.8368563652038574, "rewards/reward_fn/std": 0.3858474791049957, "step": 1565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1062.0, "completions/max_terminated_length": 1062.0, "completions/mean_length": 372.65625, "completions/mean_terminated_length": 372.65625, "completions/min_length": 239.0, "completions/min_terminated_length": 239.0, "epoch": 0.16611859552349634, "frac_reward_zero_std": 0.0, "grad_norm": 1.2265625, "kl": 0.022666738834232092, "learning_rate": 7.373999999999999e-06, "loss": -0.0341, "num_tokens": 72285057.0, "reward": 3.9279088973999023, "reward_std": 0.40780818462371826, "rewards/reward_fn/mean": 3.9279088973999023, "rewards/reward_fn/std": 0.40780818462371826, "step": 1566 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1170.0, "completions/mean_length": 389.0, "completions/mean_terminated_length": 335.4838562011719, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "epoch": 0.16622467380927125, "frac_reward_zero_std": 0.0, "grad_norm": 1.953125, "kl": 0.02747956058010459, "learning_rate": 7.373599999999999e-06, "loss": 0.154, "num_tokens": 72340545.0, "reward": 2.9696366786956787, "reward_std": 0.44867783784866333, "rewards/reward_fn/mean": 2.9696366786956787, "rewards/reward_fn/std": 0.44867780804634094, "step": 1567 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 804.0, "completions/max_terminated_length": 804.0, "completions/mean_length": 209.875, "completions/mean_terminated_length": 209.875, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "epoch": 0.16633075209504614, "frac_reward_zero_std": 1.0, "grad_norm": 0.1142578125, "kl": 0.0265498380176723, "learning_rate": 7.373199999999999e-06, "loss": 0.0011, "num_tokens": 72398941.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 503.0, "completions/max_terminated_length": 503.0, "completions/mean_length": 176.28125, "completions/mean_terminated_length": 176.28125, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.16643683038082105, "frac_reward_zero_std": 1.0, "grad_norm": 0.1396484375, "kl": 0.027999462094157934, "learning_rate": 7.372799999999999e-06, "loss": 0.0011, "num_tokens": 72469094.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1569 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 580.0, "completions/max_terminated_length": 580.0, "completions/mean_length": 188.53125, "completions/mean_terminated_length": 188.53125, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 0.16654290866659593, "frac_reward_zero_std": 0.0, "grad_norm": 1.3828125, "kl": 0.028627387015148997, "learning_rate": 7.3724e-06, "loss": -0.1119, "num_tokens": 72509143.0, "reward": 2.8793563842773438, "reward_std": 0.6584159135818481, "rewards/reward_fn/mean": 2.8793563842773438, "rewards/reward_fn/std": 0.6584158539772034, "step": 1570 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1219.0, "completions/max_terminated_length": 1219.0, "completions/mean_length": 220.625, "completions/mean_terminated_length": 220.625, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.16664898695237085, "frac_reward_zero_std": 0.0, "grad_norm": 1.78125, "kl": 0.02936831465922296, "learning_rate": 7.372e-06, "loss": 0.098, "num_tokens": 72537067.0, "reward": 2.8143582344055176, "reward_std": 0.03991476818919182, "rewards/reward_fn/mean": 2.8143582344055176, "rewards/reward_fn/std": 0.03991476073861122, "step": 1571 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1105.0, "completions/max_terminated_length": 1105.0, "completions/mean_length": 256.75, "completions/mean_terminated_length": 256.75, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "epoch": 0.16675506523814576, "frac_reward_zero_std": 1.0, "grad_norm": 0.1005859375, "kl": 0.027146983658894897, "learning_rate": 7.3716e-06, "loss": 0.0011, "num_tokens": 72564003.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1572 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 207.0, "completions/max_terminated_length": 207.0, "completions/mean_length": 150.625, "completions/mean_terminated_length": 150.625, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.16686114352392065, "frac_reward_zero_std": 1.0, "grad_norm": 0.1318359375, "kl": 0.023954558419063687, "learning_rate": 7.3712e-06, "loss": 0.001, "num_tokens": 72630999.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1573 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 426.0, "completions/max_terminated_length": 426.0, "completions/mean_length": 285.28125, "completions/mean_terminated_length": 285.28125, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "epoch": 0.16696722180969556, "frac_reward_zero_std": 0.0, "grad_norm": 1.5625, "kl": 0.027458957163617015, "learning_rate": 7.3708e-06, "loss": 0.0287, "num_tokens": 72683744.0, "reward": 2.6126034259796143, "reward_std": 0.7907987236976624, "rewards/reward_fn/mean": 2.6126034259796143, "rewards/reward_fn/std": 0.7907987236976624, "step": 1574 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1310.0, "completions/max_terminated_length": 1310.0, "completions/mean_length": 438.21875, "completions/mean_terminated_length": 438.21875, "completions/min_length": 239.0, "completions/min_terminated_length": 239.0, "epoch": 0.16707330009547045, "frac_reward_zero_std": 0.0, "grad_norm": 1.3359375, "kl": 0.026026516687124968, "learning_rate": 7.3704e-06, "loss": 0.114, "num_tokens": 72730439.0, "reward": 3.1812829971313477, "reward_std": 0.5636266469955444, "rewards/reward_fn/mean": 3.1812829971313477, "rewards/reward_fn/std": 0.5636265873908997, "step": 1575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 846.0, "completions/max_terminated_length": 846.0, "completions/mean_length": 194.09375, "completions/mean_terminated_length": 194.09375, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "epoch": 0.16717937838124536, "frac_reward_zero_std": 0.0, "grad_norm": 2.28125, "kl": 0.025642277905717492, "learning_rate": 7.37e-06, "loss": 0.0384, "num_tokens": 72779370.0, "reward": 3.8860883712768555, "reward_std": 0.468019962310791, "rewards/reward_fn/mean": 3.8860883712768555, "rewards/reward_fn/std": 0.468019962310791, "step": 1576 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 811.0, "completions/max_terminated_length": 811.0, "completions/mean_length": 213.1875, "completions/mean_terminated_length": 213.1875, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.16728545666702027, "frac_reward_zero_std": 0.0, "grad_norm": 2.5, "kl": 0.03445283626206219, "learning_rate": 7.3696e-06, "loss": -0.2059, "num_tokens": 72815856.0, "reward": 3.205237865447998, "reward_std": 0.4682815372943878, "rewards/reward_fn/mean": 3.205237865447998, "rewards/reward_fn/std": 0.46828150749206543, "step": 1577 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1284.0, "completions/max_terminated_length": 1284.0, "completions/mean_length": 308.40625, "completions/mean_terminated_length": 308.40625, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "epoch": 0.16739153495279516, "frac_reward_zero_std": 1.0, "grad_norm": 0.095703125, "kl": 0.024654962122440338, "learning_rate": 7.3692e-06, "loss": 0.001, "num_tokens": 72863901.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1578 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 518.0, "completions/max_terminated_length": 518.0, "completions/mean_length": 313.21875, "completions/mean_terminated_length": 313.21875, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 0.16749761323857007, "frac_reward_zero_std": 0.0, "grad_norm": 1.796875, "kl": 0.02547546150162816, "learning_rate": 7.3688e-06, "loss": 0.0737, "num_tokens": 72922372.0, "reward": 3.0208816528320312, "reward_std": 0.756123423576355, "rewards/reward_fn/mean": 3.0208816528320312, "rewards/reward_fn/std": 0.7561233639717102, "step": 1579 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 405.0, "completions/max_terminated_length": 405.0, "completions/mean_length": 192.5625, "completions/mean_terminated_length": 192.5625, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "epoch": 0.16760369152434496, "frac_reward_zero_std": 0.0, "grad_norm": 1.4921875, "kl": 0.04026231449097395, "learning_rate": 7.3684e-06, "loss": -0.0245, "num_tokens": 72963222.0, "reward": 3.9631948471069336, "reward_std": 0.2082015424966812, "rewards/reward_fn/mean": 3.9631948471069336, "rewards/reward_fn/std": 0.2082015424966812, "step": 1580 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1534.0, "completions/max_terminated_length": 1534.0, "completions/mean_length": 387.25, "completions/mean_terminated_length": 387.25, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 0.16770976981011987, "frac_reward_zero_std": 0.0, "grad_norm": 1.515625, "kl": 0.029226713813841343, "learning_rate": 7.368e-06, "loss": 0.0179, "num_tokens": 73006558.0, "reward": 2.7275261878967285, "reward_std": 0.326748251914978, "rewards/reward_fn/mean": 2.7275261878967285, "rewards/reward_fn/std": 0.32674822211265564, "step": 1581 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 252.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 127.5, "completions/mean_terminated_length": 127.5, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "epoch": 0.16781584809589478, "frac_reward_zero_std": 1.0, "grad_norm": 0.1591796875, "kl": 0.03317599557340145, "learning_rate": 7.3676e-06, "loss": 0.0013, "num_tokens": 73053486.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1582 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 951.0, "completions/max_terminated_length": 951.0, "completions/mean_length": 281.9375, "completions/mean_terminated_length": 281.9375, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 0.16792192638166967, "frac_reward_zero_std": 1.0, "grad_norm": 0.09716796875, "kl": 0.022451504366472363, "learning_rate": 7.3672e-06, "loss": 0.0009, "num_tokens": 73086188.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1583 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 283.0, "completions/max_terminated_length": 283.0, "completions/mean_length": 163.0625, "completions/mean_terminated_length": 163.0625, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "epoch": 0.16802800466744458, "frac_reward_zero_std": 0.0, "grad_norm": 2.390625, "kl": 0.026922843884676695, "learning_rate": 7.3667999999999995e-06, "loss": 0.0565, "num_tokens": 73119246.0, "reward": 2.860722064971924, "reward_std": 0.0443354956805706, "rewards/reward_fn/mean": 2.860722064971924, "rewards/reward_fn/std": 0.04433548450469971, "step": 1584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1794.0, "completions/max_terminated_length": 1794.0, "completions/mean_length": 516.78125, "completions/mean_terminated_length": 516.78125, "completions/min_length": 223.0, "completions/min_terminated_length": 223.0, "epoch": 0.16813408295321947, "frac_reward_zero_std": 0.0, "grad_norm": 1.609375, "kl": 0.027608325239270926, "learning_rate": 7.3663999999999995e-06, "loss": 0.0582, "num_tokens": 73182151.0, "reward": 3.1305994987487793, "reward_std": 0.9841459393501282, "rewards/reward_fn/mean": 3.1305994987487793, "rewards/reward_fn/std": 0.9841459393501282, "step": 1585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 754.0, "completions/max_terminated_length": 754.0, "completions/mean_length": 214.09375, "completions/mean_terminated_length": 214.09375, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 0.16824016123899438, "frac_reward_zero_std": 0.0, "grad_norm": 1.6875, "kl": 0.02348411502316594, "learning_rate": 7.3659999999999994e-06, "loss": 0.0196, "num_tokens": 73222634.0, "reward": 3.9632420539855957, "reward_std": 0.2079339474439621, "rewards/reward_fn/mean": 3.9632420539855957, "rewards/reward_fn/std": 0.2079339176416397, "step": 1586 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 899.0, "completions/max_terminated_length": 899.0, "completions/mean_length": 314.28125, "completions/mean_terminated_length": 314.28125, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.1683462395247693, "frac_reward_zero_std": 0.0, "grad_norm": 2.0, "kl": 0.0289424117654562, "learning_rate": 7.365599999999999e-06, "loss": 0.1943, "num_tokens": 73276947.0, "reward": 3.456662178039551, "reward_std": 0.8585535883903503, "rewards/reward_fn/mean": 3.456662178039551, "rewards/reward_fn/std": 0.8585535883903503, "step": 1587 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 206.0, "completions/max_terminated_length": 206.0, "completions/mean_length": 123.40625, "completions/mean_terminated_length": 123.40625, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "epoch": 0.16845231781054418, "frac_reward_zero_std": 1.0, "grad_norm": 0.1376953125, "kl": 0.029007577802985907, "learning_rate": 7.365199999999999e-06, "loss": 0.0012, "num_tokens": 73312480.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1588 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1397.0, "completions/max_terminated_length": 1397.0, "completions/mean_length": 364.125, "completions/mean_terminated_length": 364.125, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 0.1685583960963191, "frac_reward_zero_std": 0.0, "grad_norm": 1.6875, "kl": 0.0202113245613873, "learning_rate": 7.364799999999999e-06, "loss": -0.0049, "num_tokens": 73358404.0, "reward": 3.0420591831207275, "reward_std": 0.37027791142463684, "rewards/reward_fn/mean": 3.0420591831207275, "rewards/reward_fn/std": 0.37027788162231445, "step": 1589 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 754.0, "completions/mean_length": 467.5, "completions/mean_terminated_length": 416.51611328125, "completions/min_length": 269.0, "completions/min_terminated_length": 269.0, "epoch": 0.16866447438209398, "frac_reward_zero_std": 0.0, "grad_norm": 1.671875, "kl": 0.019788505509495735, "learning_rate": 7.364399999999999e-06, "loss": 0.2435, "num_tokens": 73411316.0, "reward": 2.620427370071411, "reward_std": 0.519374668598175, "rewards/reward_fn/mean": 2.620427370071411, "rewards/reward_fn/std": 0.519374668598175, "step": 1590 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 893.0, "completions/max_terminated_length": 893.0, "completions/mean_length": 339.25, "completions/mean_terminated_length": 339.25, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "epoch": 0.1687705526678689, "frac_reward_zero_std": 1.0, "grad_norm": 0.1494140625, "kl": 0.03387824585661292, "learning_rate": 7.363999999999999e-06, "loss": 0.0014, "num_tokens": 73453916.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1591 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 450.0, "completions/max_terminated_length": 450.0, "completions/mean_length": 276.1875, "completions/mean_terminated_length": 276.1875, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 0.1688766309536438, "frac_reward_zero_std": 1.0, "grad_norm": 0.09765625, "kl": 0.026950898114591837, "learning_rate": 7.363599999999999e-06, "loss": 0.0011, "num_tokens": 73484098.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1592 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1189.0, "completions/max_terminated_length": 1189.0, "completions/mean_length": 293.84375, "completions/mean_terminated_length": 293.84375, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "epoch": 0.1689827092394187, "frac_reward_zero_std": 0.0, "grad_norm": 1.9609375, "kl": 0.0297443438321352, "learning_rate": 7.363199999999999e-06, "loss": 0.0417, "num_tokens": 73524253.0, "reward": 3.6159801483154297, "reward_std": 0.506695032119751, "rewards/reward_fn/mean": 3.6159801483154297, "rewards/reward_fn/std": 0.506695032119751, "step": 1593 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 813.0, "completions/max_terminated_length": 813.0, "completions/mean_length": 434.375, "completions/mean_terminated_length": 434.375, "completions/min_length": 233.0, "completions/min_terminated_length": 233.0, "epoch": 0.1690887875251936, "frac_reward_zero_std": 0.0, "grad_norm": 1.59375, "kl": 0.022996835177764297, "learning_rate": 7.3628e-06, "loss": -0.0086, "num_tokens": 73581993.0, "reward": 3.1963114738464355, "reward_std": 0.5918622612953186, "rewards/reward_fn/mean": 3.1963114738464355, "rewards/reward_fn/std": 0.5918623208999634, "step": 1594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 273.0, "completions/max_terminated_length": 273.0, "completions/mean_length": 202.3125, "completions/mean_terminated_length": 202.3125, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "epoch": 0.1691948658109685, "frac_reward_zero_std": 0.0, "grad_norm": 1.8359375, "kl": 0.02059872355312109, "learning_rate": 7.3624e-06, "loss": 0.0376, "num_tokens": 73628723.0, "reward": 3.9325742721557617, "reward_std": 0.3814173638820648, "rewards/reward_fn/mean": 3.9325742721557617, "rewards/reward_fn/std": 0.38141733407974243, "step": 1595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 391.0, "completions/max_terminated_length": 391.0, "completions/mean_length": 228.5625, "completions/mean_terminated_length": 228.5625, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 0.1693009440967434, "frac_reward_zero_std": 0.0, "grad_norm": 2.015625, "kl": 0.02786667738109827, "learning_rate": 7.362e-06, "loss": 0.04, "num_tokens": 73666501.0, "reward": 3.7850565910339355, "reward_std": 0.5395143628120422, "rewards/reward_fn/mean": 3.7850565910339355, "rewards/reward_fn/std": 0.5395143628120422, "step": 1596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 993.0, "completions/max_terminated_length": 993.0, "completions/mean_length": 337.75, "completions/mean_terminated_length": 337.75, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "epoch": 0.16940702238251829, "frac_reward_zero_std": 0.0, "grad_norm": 1.0, "kl": 0.02589184185490012, "learning_rate": 7.3616e-06, "loss": -0.0659, "num_tokens": 73713885.0, "reward": 2.7677502632141113, "reward_std": 0.19476144015789032, "rewards/reward_fn/mean": 2.7677502632141113, "rewards/reward_fn/std": 0.1947614550590515, "step": 1597 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1111.0, "completions/max_terminated_length": 1111.0, "completions/mean_length": 375.09375, "completions/mean_terminated_length": 375.09375, "completions/min_length": 237.0, "completions/min_terminated_length": 237.0, "epoch": 0.1695131006682932, "frac_reward_zero_std": 0.0, "grad_norm": 1.34375, "kl": 0.01787739770952612, "learning_rate": 7.3612e-06, "loss": -0.0086, "num_tokens": 73771200.0, "reward": 3.8554279804229736, "reward_std": 0.5688852071762085, "rewards/reward_fn/mean": 3.8554279804229736, "rewards/reward_fn/std": 0.5688852071762085, "step": 1598 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 369.0, "completions/max_terminated_length": 369.0, "completions/mean_length": 238.1875, "completions/mean_terminated_length": 238.1875, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "epoch": 0.1696191789540681, "frac_reward_zero_std": 0.0, "grad_norm": 1.8671875, "kl": 0.024122617673128843, "learning_rate": 7.3608e-06, "loss": -0.0122, "num_tokens": 73835718.0, "reward": 2.7904884815216064, "reward_std": 0.028905630111694336, "rewards/reward_fn/mean": 2.7904884815216064, "rewards/reward_fn/std": 0.02890562266111374, "step": 1599 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 784.0, "completions/max_terminated_length": 784.0, "completions/mean_length": 207.09375, "completions/mean_terminated_length": 207.09375, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 0.169725257239843, "frac_reward_zero_std": 0.0, "grad_norm": 2.015625, "kl": 0.03663892112672329, "learning_rate": 7.3604e-06, "loss": 0.0213, "num_tokens": 73885321.0, "reward": 2.8401732444763184, "reward_std": 0.19660231471061707, "rewards/reward_fn/mean": 2.8401732444763184, "rewards/reward_fn/std": 0.19660234451293945, "step": 1600 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 664.0, "completions/mean_length": 403.5, "completions/mean_terminated_length": 350.45159912109375, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "epoch": 0.1698313355256179, "frac_reward_zero_std": 0.0, "grad_norm": 1.640625, "kl": 0.02747776103205979, "learning_rate": 7.36e-06, "loss": 0.256, "num_tokens": 73948569.0, "reward": 2.654446601867676, "reward_std": 0.600631058216095, "rewards/reward_fn/mean": 2.654446601867676, "rewards/reward_fn/std": 0.600631058216095, "step": 1601 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1447.0, "completions/max_terminated_length": 1447.0, "completions/mean_length": 322.1875, "completions/mean_terminated_length": 322.1875, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 0.1699374138113928, "frac_reward_zero_std": 0.0, "grad_norm": 1.828125, "kl": 0.030739419627934694, "learning_rate": 7.3596e-06, "loss": 0.0544, "num_tokens": 73980255.0, "reward": 3.819499969482422, "reward_std": 0.5079775452613831, "rewards/reward_fn/mean": 3.819499969482422, "rewards/reward_fn/std": 0.5079775452613831, "step": 1602 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 186.0, "completions/max_terminated_length": 186.0, "completions/mean_length": 140.1875, "completions/mean_terminated_length": 140.1875, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.1700434920971677, "frac_reward_zero_std": 1.0, "grad_norm": 0.1416015625, "kl": 0.026381214149296284, "learning_rate": 7.3592e-06, "loss": 0.0011, "num_tokens": 74014885.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1603 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1114.0, "completions/max_terminated_length": 1114.0, "completions/mean_length": 496.78125, "completions/mean_terminated_length": 496.78125, "completions/min_length": 280.0, "completions/min_terminated_length": 280.0, "epoch": 0.17014957038294262, "frac_reward_zero_std": 0.0, "grad_norm": 1.1015625, "kl": 0.023547572316601872, "learning_rate": 7.358799999999999e-06, "loss": 0.017, "num_tokens": 74071262.0, "reward": 3.6152219772338867, "reward_std": 0.6692531108856201, "rewards/reward_fn/mean": 3.6152219772338867, "rewards/reward_fn/std": 0.6692530512809753, "step": 1604 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1870.0, "completions/max_terminated_length": 1870.0, "completions/mean_length": 533.1875, "completions/mean_terminated_length": 533.1875, "completions/min_length": 271.0, "completions/min_terminated_length": 271.0, "epoch": 0.1702556486687175, "frac_reward_zero_std": 0.0, "grad_norm": 1.28125, "kl": 0.02932202792726457, "learning_rate": 7.358399999999999e-06, "loss": -0.0112, "num_tokens": 74124580.0, "reward": 2.6716699600219727, "reward_std": 0.3422979414463043, "rewards/reward_fn/mean": 2.6716699600219727, "rewards/reward_fn/std": 0.34229791164398193, "step": 1605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 828.0, "completions/max_terminated_length": 828.0, "completions/mean_length": 226.53125, "completions/mean_terminated_length": 226.53125, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.17036172695449242, "frac_reward_zero_std": 0.0, "grad_norm": 1.8515625, "kl": 0.02531708194874227, "learning_rate": 7.358e-06, "loss": 0.0259, "num_tokens": 74148757.0, "reward": 2.9681363105773926, "reward_std": 0.04596217721700668, "rewards/reward_fn/mean": 2.9681363105773926, "rewards/reward_fn/std": 0.04596218094229698, "step": 1606 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2008.0, "completions/max_terminated_length": 2008.0, "completions/mean_length": 379.90625, "completions/mean_terminated_length": 379.90625, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "epoch": 0.1704678052402673, "frac_reward_zero_std": 0.0, "grad_norm": 1.6796875, "kl": 0.0387666889000684, "learning_rate": 7.3576e-06, "loss": -0.0632, "num_tokens": 74189490.0, "reward": 2.6473255157470703, "reward_std": 0.4680839478969574, "rewards/reward_fn/mean": 2.6473255157470703, "rewards/reward_fn/std": 0.468083918094635, "step": 1607 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 245.0, "completions/max_terminated_length": 245.0, "completions/mean_length": 151.875, "completions/mean_terminated_length": 151.875, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.17057388352604222, "frac_reward_zero_std": 0.0, "grad_norm": 2.46875, "kl": 0.02067621098831296, "learning_rate": 7.3572e-06, "loss": 0.0394, "num_tokens": 74237422.0, "reward": 3.9700491428375244, "reward_std": 0.16942748427391052, "rewards/reward_fn/mean": 3.9700491428375244, "rewards/reward_fn/std": 0.16942746937274933, "step": 1608 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 917.0, "completions/max_terminated_length": 917.0, "completions/mean_length": 267.5625, "completions/mean_terminated_length": 267.5625, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 0.17067996181181713, "frac_reward_zero_std": 1.0, "grad_norm": 0.33984375, "kl": 0.03733672644011676, "learning_rate": 7.3568e-06, "loss": 0.0015, "num_tokens": 74283328.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1609 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 717.0, "completions/max_terminated_length": 717.0, "completions/mean_length": 193.03125, "completions/mean_terminated_length": 193.03125, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.17078604009759202, "frac_reward_zero_std": 1.0, "grad_norm": 0.11865234375, "kl": 0.028381220996379852, "learning_rate": 7.3563999999999996e-06, "loss": 0.0011, "num_tokens": 74317025.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1610 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 594.0, "completions/max_terminated_length": 594.0, "completions/mean_length": 217.75, "completions/mean_terminated_length": 217.75, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 0.17089211838336693, "frac_reward_zero_std": 1.0, "grad_norm": 0.0830078125, "kl": 0.01850617292802781, "learning_rate": 7.3559999999999995e-06, "loss": 0.0007, "num_tokens": 74370809.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1611 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 762.0, "completions/max_terminated_length": 762.0, "completions/mean_length": 328.8125, "completions/mean_terminated_length": 328.8125, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "epoch": 0.17099819666914182, "frac_reward_zero_std": 0.0, "grad_norm": 1.6328125, "kl": 0.025823280215263367, "learning_rate": 7.3555999999999995e-06, "loss": 0.087, "num_tokens": 74432435.0, "reward": 3.2880189418792725, "reward_std": 0.9963902831077576, "rewards/reward_fn/mean": 3.2880189418792725, "rewards/reward_fn/std": 0.996390163898468, "step": 1612 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 262.0, "completions/max_terminated_length": 262.0, "completions/mean_length": 190.53125, "completions/mean_terminated_length": 190.53125, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "epoch": 0.17110427495491673, "frac_reward_zero_std": 1.0, "grad_norm": 0.12890625, "kl": 0.03128618444316089, "learning_rate": 7.3551999999999995e-06, "loss": 0.0013, "num_tokens": 74462436.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1613 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1551.0, "completions/max_terminated_length": 1551.0, "completions/mean_length": 391.59375, "completions/mean_terminated_length": 391.59375, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.17121035324069164, "frac_reward_zero_std": 0.0, "grad_norm": 1.3984375, "kl": 0.0252360668964684, "learning_rate": 7.3547999999999994e-06, "loss": 0.0505, "num_tokens": 74505847.0, "reward": 2.924363136291504, "reward_std": 0.3568187355995178, "rewards/reward_fn/mean": 2.924363136291504, "rewards/reward_fn/std": 0.3568187355995178, "step": 1614 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1390.0, "completions/max_terminated_length": 1390.0, "completions/mean_length": 415.875, "completions/mean_terminated_length": 415.875, "completions/min_length": 239.0, "completions/min_terminated_length": 239.0, "epoch": 0.17131643152646653, "frac_reward_zero_std": 0.0, "grad_norm": 1.296875, "kl": 0.024169984506443143, "learning_rate": 7.354399999999999e-06, "loss": -0.0145, "num_tokens": 74576595.0, "reward": 2.773521900177002, "reward_std": 0.06824694573879242, "rewards/reward_fn/mean": 2.773521900177002, "rewards/reward_fn/std": 0.06824696063995361, "step": 1615 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1172.0, "completions/max_terminated_length": 1172.0, "completions/mean_length": 470.65625, "completions/mean_terminated_length": 470.65625, "completions/min_length": 241.0, "completions/min_terminated_length": 241.0, "epoch": 0.17142250981224144, "frac_reward_zero_std": 0.0, "grad_norm": 1.3984375, "kl": 0.01885635987855494, "learning_rate": 7.353999999999999e-06, "loss": 0.1074, "num_tokens": 74630760.0, "reward": 3.872809886932373, "reward_std": 0.4551088809967041, "rewards/reward_fn/mean": 3.872809886932373, "rewards/reward_fn/std": 0.4551088511943817, "step": 1616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1438.0, "completions/max_terminated_length": 1438.0, "completions/mean_length": 561.8125, "completions/mean_terminated_length": 561.8125, "completions/min_length": 293.0, "completions/min_terminated_length": 293.0, "epoch": 0.17152858809801633, "frac_reward_zero_std": 0.0, "grad_norm": 1.4296875, "kl": 0.02392402058467269, "learning_rate": 7.3536e-06, "loss": 0.0666, "num_tokens": 74692962.0, "reward": 2.882366895675659, "reward_std": 0.08499280363321304, "rewards/reward_fn/mean": 2.882366895675659, "rewards/reward_fn/std": 0.08499278873205185, "step": 1617 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 610.0, "completions/max_terminated_length": 610.0, "completions/mean_length": 185.90625, "completions/mean_terminated_length": 185.90625, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.17163466638379124, "frac_reward_zero_std": 0.0, "grad_norm": 2.015625, "kl": 0.027309770928695798, "learning_rate": 7.3532e-06, "loss": 0.0287, "num_tokens": 74744191.0, "reward": 3.217087507247925, "reward_std": 0.46121397614479065, "rewards/reward_fn/mean": 3.217087507247925, "rewards/reward_fn/std": 0.46121400594711304, "step": 1618 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 593.0, "completions/max_terminated_length": 593.0, "completions/mean_length": 328.09375, "completions/mean_terminated_length": 328.09375, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "epoch": 0.17174074466956615, "frac_reward_zero_std": 0.0, "grad_norm": 1.6875, "kl": 0.02444523060694337, "learning_rate": 7.3528e-06, "loss": 0.0277, "num_tokens": 74776962.0, "reward": 3.580929756164551, "reward_std": 0.706642210483551, "rewards/reward_fn/mean": 3.580929756164551, "rewards/reward_fn/std": 0.7066422700881958, "step": 1619 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 353.0, "completions/max_terminated_length": 353.0, "completions/mean_length": 232.71875, "completions/mean_terminated_length": 232.71875, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 0.17184682295534104, "frac_reward_zero_std": 0.0, "grad_norm": 1.671875, "kl": 0.02058199269231409, "learning_rate": 7.3524e-06, "loss": 0.0334, "num_tokens": 74814873.0, "reward": 2.8619747161865234, "reward_std": 0.30958980321884155, "rewards/reward_fn/mean": 2.8619747161865234, "rewards/reward_fn/std": 0.30958983302116394, "step": 1620 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1276.0, "completions/max_terminated_length": 1276.0, "completions/mean_length": 184.8125, "completions/mean_terminated_length": 184.8125, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 0.17195290124111595, "frac_reward_zero_std": 0.0, "grad_norm": 2.640625, "kl": 0.03645259817130864, "learning_rate": 7.352e-06, "loss": -0.0075, "num_tokens": 74851827.0, "reward": 3.96586012840271, "reward_std": 0.19312410056591034, "rewards/reward_fn/mean": 3.96586012840271, "rewards/reward_fn/std": 0.19312405586242676, "step": 1621 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 598.0, "completions/max_terminated_length": 598.0, "completions/mean_length": 295.625, "completions/mean_terminated_length": 295.625, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.17205897952689084, "frac_reward_zero_std": 0.0, "grad_norm": 1.8515625, "kl": 0.03457627212628722, "learning_rate": 7.3516e-06, "loss": 0.03, "num_tokens": 74890727.0, "reward": 2.7471697330474854, "reward_std": 0.29337078332901, "rewards/reward_fn/mean": 2.7471697330474854, "rewards/reward_fn/std": 0.2933708131313324, "step": 1622 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1086.0, "completions/max_terminated_length": 1086.0, "completions/mean_length": 182.4375, "completions/mean_terminated_length": 182.4375, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "epoch": 0.17216505781266575, "frac_reward_zero_std": 0.0, "grad_norm": 2.59375, "kl": 0.025542156072333455, "learning_rate": 7.3512e-06, "loss": 0.1191, "num_tokens": 74926453.0, "reward": 3.7256574630737305, "reward_std": 0.7376229763031006, "rewards/reward_fn/mean": 3.7256574630737305, "rewards/reward_fn/std": 0.7376229763031006, "step": 1623 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1104.0, "completions/max_terminated_length": 1104.0, "completions/mean_length": 266.59375, "completions/mean_terminated_length": 266.59375, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "epoch": 0.17227113609844064, "frac_reward_zero_std": 0.0, "grad_norm": 1.53125, "kl": 0.01520739821717143, "learning_rate": 7.350799999999999e-06, "loss": 0.0953, "num_tokens": 74982056.0, "reward": 2.95540714263916, "reward_std": 0.2933211028575897, "rewards/reward_fn/mean": 2.95540714263916, "rewards/reward_fn/std": 0.29332107305526733, "step": 1624 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/max_terminated_length": 361.0, "completions/mean_length": 254.5, "completions/mean_terminated_length": 254.5, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.17237721438421555, "frac_reward_zero_std": 0.0, "grad_norm": 1.6796875, "kl": 0.030610437272116542, "learning_rate": 7.350399999999999e-06, "loss": -0.0358, "num_tokens": 75024792.0, "reward": 1.808734655380249, "reward_std": 0.40144309401512146, "rewards/reward_fn/mean": 1.808734655380249, "rewards/reward_fn/std": 0.4014430642127991, "step": 1625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 657.0, "completions/max_terminated_length": 657.0, "completions/mean_length": 326.96875, "completions/mean_terminated_length": 326.96875, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "epoch": 0.17248329266999046, "frac_reward_zero_std": 0.0, "grad_norm": 1.8125, "kl": 0.02779210708104074, "learning_rate": 7.349999999999999e-06, "loss": 0.1173, "num_tokens": 75070423.0, "reward": 3.8114876747131348, "reward_std": 0.44545185565948486, "rewards/reward_fn/mean": 3.8114876747131348, "rewards/reward_fn/std": 0.4454518258571625, "step": 1626 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1139.0, "completions/max_terminated_length": 1139.0, "completions/mean_length": 374.28125, "completions/mean_terminated_length": 374.28125, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "epoch": 0.17258937095576535, "frac_reward_zero_std": 0.0, "grad_norm": 1.3125, "kl": 0.023398922523483634, "learning_rate": 7.349599999999999e-06, "loss": 0.0652, "num_tokens": 75116768.0, "reward": 2.8657479286193848, "reward_std": 0.4419115483760834, "rewards/reward_fn/mean": 2.8657479286193848, "rewards/reward_fn/std": 0.441911518573761, "step": 1627 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1205.0, "completions/max_terminated_length": 1205.0, "completions/mean_length": 396.71875, "completions/mean_terminated_length": 396.71875, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "epoch": 0.17269544924154026, "frac_reward_zero_std": 0.0, "grad_norm": 1.7734375, "kl": 0.01942450739443302, "learning_rate": 7.349199999999999e-06, "loss": 0.0472, "num_tokens": 75178231.0, "reward": 3.8060476779937744, "reward_std": 0.5507187247276306, "rewards/reward_fn/mean": 3.8060476779937744, "rewards/reward_fn/std": 0.5507186651229858, "step": 1628 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 621.0, "completions/max_terminated_length": 621.0, "completions/mean_length": 199.40625, "completions/mean_terminated_length": 199.40625, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.17280152752731515, "frac_reward_zero_std": 0.0, "grad_norm": 2.78125, "kl": 0.02440522238612175, "learning_rate": 7.3488e-06, "loss": 0.1618, "num_tokens": 75226660.0, "reward": 3.931492328643799, "reward_std": 0.22422750294208527, "rewards/reward_fn/mean": 3.931492328643799, "rewards/reward_fn/std": 0.22422751784324646, "step": 1629 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1092.0, "completions/max_terminated_length": 1092.0, "completions/mean_length": 252.53125, "completions/mean_terminated_length": 252.53125, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 0.17290760581309006, "frac_reward_zero_std": 0.0, "grad_norm": 2.234375, "kl": 0.02909305295906961, "learning_rate": 7.3484e-06, "loss": -0.2047, "num_tokens": 75274133.0, "reward": 3.0297999382019043, "reward_std": 0.32074329257011414, "rewards/reward_fn/mean": 3.0297999382019043, "rewards/reward_fn/std": 0.3207433819770813, "step": 1630 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1720.0, "completions/max_terminated_length": 1720.0, "completions/mean_length": 567.96875, "completions/mean_terminated_length": 567.96875, "completions/min_length": 344.0, "completions/min_terminated_length": 344.0, "epoch": 0.17301368409886497, "frac_reward_zero_std": 0.0, "grad_norm": 1.265625, "kl": 0.022551232716068625, "learning_rate": 7.348e-06, "loss": 0.118, "num_tokens": 75332532.0, "reward": 2.790672779083252, "reward_std": 0.6563798785209656, "rewards/reward_fn/mean": 2.790672779083252, "rewards/reward_fn/std": 0.6563798785209656, "step": 1631 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 291.0, "completions/max_terminated_length": 291.0, "completions/mean_length": 79.46875, "completions/mean_terminated_length": 79.46875, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.17311976238463986, "frac_reward_zero_std": 1.0, "grad_norm": 0.1328125, "kl": 0.01943877385929227, "learning_rate": 7.3476e-06, "loss": 0.0008, "num_tokens": 75374595.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1632 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 375.0, "completions/max_terminated_length": 375.0, "completions/mean_length": 257.625, "completions/mean_terminated_length": 257.625, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "epoch": 0.17322584067041477, "frac_reward_zero_std": 0.0, "grad_norm": 1.7109375, "kl": 0.022583408746868372, "learning_rate": 7.3472e-06, "loss": 0.0228, "num_tokens": 75411671.0, "reward": 2.8476908206939697, "reward_std": 0.033805277198553085, "rewards/reward_fn/mean": 2.8476908206939697, "rewards/reward_fn/std": 0.033805254846811295, "step": 1633 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 816.0, "completions/max_terminated_length": 816.0, "completions/mean_length": 269.0625, "completions/mean_terminated_length": 269.0625, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 0.17333191895618966, "frac_reward_zero_std": 0.0, "grad_norm": 1.6171875, "kl": 0.018412835197523236, "learning_rate": 7.3468e-06, "loss": 0.0389, "num_tokens": 75462937.0, "reward": 2.860079288482666, "reward_std": 0.06922987848520279, "rewards/reward_fn/mean": 2.860079288482666, "rewards/reward_fn/std": 0.06922990828752518, "step": 1634 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 121.65625, "completions/mean_terminated_length": 121.65625, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "epoch": 0.17343799724196457, "frac_reward_zero_std": 1.0, "grad_norm": 0.1552734375, "kl": 0.03100604098290205, "learning_rate": 7.3464e-06, "loss": 0.0012, "num_tokens": 75501678.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1635 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 550.0, "completions/max_terminated_length": 550.0, "completions/mean_length": 123.15625, "completions/mean_terminated_length": 123.15625, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 0.17354407552773948, "frac_reward_zero_std": 1.0, "grad_norm": 0.1376953125, "kl": 0.02859083702787757, "learning_rate": 7.346e-06, "loss": 0.0011, "num_tokens": 75537587.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1636 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 413.0, "completions/max_terminated_length": 413.0, "completions/mean_length": 249.875, "completions/mean_terminated_length": 249.875, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 0.17365015381351437, "frac_reward_zero_std": 0.0, "grad_norm": 1.171875, "kl": 0.02171206660568714, "learning_rate": 7.3456e-06, "loss": -0.0201, "num_tokens": 75581391.0, "reward": 3.929537057876587, "reward_std": 0.3985986113548279, "rewards/reward_fn/mean": 3.929537057876587, "rewards/reward_fn/std": 0.39859864115715027, "step": 1637 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 930.0, "completions/max_terminated_length": 930.0, "completions/mean_length": 135.90625, "completions/mean_terminated_length": 135.90625, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "epoch": 0.17375623209928928, "frac_reward_zero_std": 1.0, "grad_norm": 0.12890625, "kl": 0.027793261338956654, "learning_rate": 7.3451999999999996e-06, "loss": 0.0011, "num_tokens": 75616940.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1638 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 390.0, "completions/max_terminated_length": 390.0, "completions/mean_length": 273.09375, "completions/mean_terminated_length": 273.09375, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "epoch": 0.17386231038506417, "frac_reward_zero_std": 0.0, "grad_norm": 1.828125, "kl": 0.022352764382958412, "learning_rate": 7.3447999999999995e-06, "loss": 0.0293, "num_tokens": 75670383.0, "reward": 3.888477325439453, "reward_std": 0.352896511554718, "rewards/reward_fn/mean": 3.888477325439453, "rewards/reward_fn/std": 0.352896511554718, "step": 1639 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 938.0, "completions/max_terminated_length": 938.0, "completions/mean_length": 292.21875, "completions/mean_terminated_length": 292.21875, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "epoch": 0.17396838867083908, "frac_reward_zero_std": 0.0, "grad_norm": 1.625, "kl": 0.014141359482891858, "learning_rate": 7.3443999999999995e-06, "loss": 0.0457, "num_tokens": 75719382.0, "reward": 3.928811550140381, "reward_std": 0.4027021825313568, "rewards/reward_fn/mean": 3.928811550140381, "rewards/reward_fn/std": 0.4027021825313568, "step": 1640 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 531.0, "completions/mean_length": 367.40625, "completions/mean_terminated_length": 313.19354248046875, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.174074466956614, "frac_reward_zero_std": 0.0, "grad_norm": 1.6796875, "kl": 0.0231416008900851, "learning_rate": 7.344e-06, "loss": 0.1287, "num_tokens": 75788419.0, "reward": 3.5149707794189453, "reward_std": 1.023628830909729, "rewards/reward_fn/mean": 3.5149707794189453, "rewards/reward_fn/std": 1.023628830909729, "step": 1641 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1553.0, "completions/max_terminated_length": 1553.0, "completions/mean_length": 384.71875, "completions/mean_terminated_length": 384.71875, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "epoch": 0.17418054524238888, "frac_reward_zero_std": 0.0, "grad_norm": 1.3046875, "kl": 0.02502290392294526, "learning_rate": 7.3436e-06, "loss": 0.0406, "num_tokens": 75839354.0, "reward": 2.790564775466919, "reward_std": 0.03924685716629028, "rewards/reward_fn/mean": 2.790564775466919, "rewards/reward_fn/std": 0.0392468199133873, "step": 1642 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 726.0, "completions/max_terminated_length": 726.0, "completions/mean_length": 202.78125, "completions/mean_terminated_length": 202.78125, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.1742866235281638, "frac_reward_zero_std": 0.0, "grad_norm": 1.71875, "kl": 0.020574510097503662, "learning_rate": 7.3432e-06, "loss": 0.0457, "num_tokens": 75881331.0, "reward": 1.727787971496582, "reward_std": 0.03321034833788872, "rewards/reward_fn/mean": 1.727787971496582, "rewards/reward_fn/std": 0.033210329711437225, "step": 1643 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1837.0, "completions/max_terminated_length": 1837.0, "completions/mean_length": 405.25, "completions/mean_terminated_length": 405.25, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 0.17439270181393868, "frac_reward_zero_std": 0.0, "grad_norm": 1.5, "kl": 0.022814936703070998, "learning_rate": 7.3428e-06, "loss": 0.029, "num_tokens": 75926363.0, "reward": 3.639138698577881, "reward_std": 0.545224666595459, "rewards/reward_fn/mean": 3.639138698577881, "rewards/reward_fn/std": 0.545224666595459, "step": 1644 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 668.0, "completions/max_terminated_length": 668.0, "completions/mean_length": 204.9375, "completions/mean_terminated_length": 204.9375, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.1744987800997136, "frac_reward_zero_std": 1.0, "grad_norm": 0.08837890625, "kl": 0.01477151014842093, "learning_rate": 7.342399999999999e-06, "loss": 0.0006, "num_tokens": 75985561.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1645 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 486.0, "completions/max_terminated_length": 486.0, "completions/mean_length": 140.0625, "completions/mean_terminated_length": 140.0625, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.1746048583854885, "frac_reward_zero_std": 1.0, "grad_norm": 0.11474609375, "kl": 0.02253897808259353, "learning_rate": 7.341999999999999e-06, "loss": 0.0009, "num_tokens": 76027675.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1646 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 839.0, "completions/max_terminated_length": 839.0, "completions/mean_length": 214.6875, "completions/mean_terminated_length": 214.6875, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.1747109366712634, "frac_reward_zero_std": 1.0, "grad_norm": 0.09130859375, "kl": 0.021346024004742503, "learning_rate": 7.341599999999999e-06, "loss": 0.0009, "num_tokens": 76071761.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1647 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 261.0, "completions/max_terminated_length": 261.0, "completions/mean_length": 214.15625, "completions/mean_terminated_length": 214.15625, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 0.1748170149570383, "frac_reward_zero_std": 1.0, "grad_norm": 0.076171875, "kl": 0.013097033952362835, "learning_rate": 7.341199999999999e-06, "loss": 0.0005, "num_tokens": 76115766.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1648 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1412.0, "completions/max_terminated_length": 1412.0, "completions/mean_length": 461.875, "completions/mean_terminated_length": 461.875, "completions/min_length": 283.0, "completions/min_terminated_length": 283.0, "epoch": 0.1749230932428132, "frac_reward_zero_std": 1.0, "grad_norm": 0.08740234375, "kl": 0.022934141103178263, "learning_rate": 7.340799999999999e-06, "loss": 0.0009, "num_tokens": 76163602.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1649 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 529.0, "completions/max_terminated_length": 529.0, "completions/mean_length": 137.71875, "completions/mean_terminated_length": 137.71875, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "epoch": 0.1750291715285881, "frac_reward_zero_std": 0.0, "grad_norm": 2.90625, "kl": 0.026293474482372403, "learning_rate": 7.340399999999999e-06, "loss": -0.0782, "num_tokens": 76195721.0, "reward": 2.860340118408203, "reward_std": 0.05363810062408447, "rewards/reward_fn/mean": 2.860340118408203, "rewards/reward_fn/std": 0.05363810807466507, "step": 1650 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 987.0, "completions/max_terminated_length": 987.0, "completions/mean_length": 301.78125, "completions/mean_terminated_length": 301.78125, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.175135249814363, "frac_reward_zero_std": 0.0, "grad_norm": 1.6796875, "kl": 0.025250343373045325, "learning_rate": 7.339999999999999e-06, "loss": -0.0705, "num_tokens": 76238914.0, "reward": 3.3528342247009277, "reward_std": 0.5812612771987915, "rewards/reward_fn/mean": 3.3528342247009277, "rewards/reward_fn/std": 0.5812612175941467, "step": 1651 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 690.0, "completions/max_terminated_length": 690.0, "completions/mean_length": 188.625, "completions/mean_terminated_length": 188.625, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "epoch": 0.1752413281001379, "frac_reward_zero_std": 1.0, "grad_norm": 0.09228515625, "kl": 0.02199221565388143, "learning_rate": 7.339599999999999e-06, "loss": 0.0009, "num_tokens": 76281878.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1652 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 701.0, "completions/max_terminated_length": 701.0, "completions/mean_length": 261.1875, "completions/mean_terminated_length": 261.1875, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 0.1753474063859128, "frac_reward_zero_std": 0.0, "grad_norm": 1.890625, "kl": 0.025439413264393806, "learning_rate": 7.3392e-06, "loss": 0.0509, "num_tokens": 76320860.0, "reward": 2.9493305683135986, "reward_std": 0.046215642243623734, "rewards/reward_fn/mean": 2.9493305683135986, "rewards/reward_fn/std": 0.04621569812297821, "step": 1653 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1948.0, "completions/max_terminated_length": 1948.0, "completions/mean_length": 450.1875, "completions/mean_terminated_length": 450.1875, "completions/min_length": 225.0, "completions/min_terminated_length": 225.0, "epoch": 0.1754534846716877, "frac_reward_zero_std": 0.0, "grad_norm": 1.421875, "kl": 0.021173911402001977, "learning_rate": 7.3388e-06, "loss": 0.0012, "num_tokens": 76376802.0, "reward": 3.892618417739868, "reward_std": 0.3393568694591522, "rewards/reward_fn/mean": 3.892618417739868, "rewards/reward_fn/std": 0.3393568992614746, "step": 1654 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 173.0, "completions/max_terminated_length": 173.0, "completions/mean_length": 113.84375, "completions/mean_terminated_length": 113.84375, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "epoch": 0.1755595629574626, "frac_reward_zero_std": 1.0, "grad_norm": 0.1845703125, "kl": 0.030263771768659353, "learning_rate": 7.3384e-06, "loss": 0.0012, "num_tokens": 76405821.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 502.0, "completions/max_terminated_length": 502.0, "completions/mean_length": 270.375, "completions/mean_terminated_length": 270.375, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 0.1756656412432375, "frac_reward_zero_std": 0.0, "grad_norm": 1.796875, "kl": 0.02956866892054677, "learning_rate": 7.338e-06, "loss": -0.0694, "num_tokens": 76456809.0, "reward": 2.06693172454834, "reward_std": 0.4959835708141327, "rewards/reward_fn/mean": 2.06693172454834, "rewards/reward_fn/std": 0.4959836006164551, "step": 1656 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1366.0, "completions/max_terminated_length": 1366.0, "completions/mean_length": 388.28125, "completions/mean_terminated_length": 388.28125, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "epoch": 0.1757717195290124, "frac_reward_zero_std": 0.0, "grad_norm": 1.4921875, "kl": 0.02950794971548021, "learning_rate": 7.3376e-06, "loss": -0.0245, "num_tokens": 76494290.0, "reward": 3.5781798362731934, "reward_std": 0.5563759207725525, "rewards/reward_fn/mean": 3.5781798362731934, "rewards/reward_fn/std": 0.5563759207725525, "step": 1657 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 241.0, "completions/max_terminated_length": 241.0, "completions/mean_length": 188.28125, "completions/mean_terminated_length": 188.28125, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 0.17587779781478732, "frac_reward_zero_std": 1.0, "grad_norm": 0.09228515625, "kl": 0.01824926841072738, "learning_rate": 7.3372e-06, "loss": 0.0007, "num_tokens": 76539323.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1658 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 390.0, "completions/max_terminated_length": 390.0, "completions/mean_length": 235.96875, "completions/mean_terminated_length": 235.96875, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 0.1759838761005622, "frac_reward_zero_std": 0.0, "grad_norm": 1.7265625, "kl": 0.025267949560657144, "learning_rate": 7.3368e-06, "loss": 0.0003, "num_tokens": 76581434.0, "reward": 3.967522382736206, "reward_std": 0.18372122943401337, "rewards/reward_fn/mean": 3.967522382736206, "rewards/reward_fn/std": 0.18372122943401337, "step": 1659 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 813.0, "completions/max_terminated_length": 813.0, "completions/mean_length": 199.46875, "completions/mean_terminated_length": 199.46875, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.17608995438633712, "frac_reward_zero_std": 0.0, "grad_norm": 1.875, "kl": 0.02116774907335639, "learning_rate": 7.3364e-06, "loss": 0.1391, "num_tokens": 76600041.0, "reward": 2.821526288986206, "reward_std": 0.06851638108491898, "rewards/reward_fn/mean": 2.821526288986206, "rewards/reward_fn/std": 0.06851641088724136, "step": 1660 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 445.0, "completions/max_terminated_length": 445.0, "completions/mean_length": 342.59375, "completions/mean_terminated_length": 342.59375, "completions/min_length": 253.0, "completions/min_terminated_length": 253.0, "epoch": 0.176196032672112, "frac_reward_zero_std": 0.0, "grad_norm": 1.3359375, "kl": 0.02219717751722783, "learning_rate": 7.336e-06, "loss": -0.0077, "num_tokens": 76649180.0, "reward": 3.674363613128662, "reward_std": 0.4910111427307129, "rewards/reward_fn/mean": 3.674363613128662, "rewards/reward_fn/std": 0.4910111427307129, "step": 1661 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1291.0, "completions/max_terminated_length": 1291.0, "completions/mean_length": 370.125, "completions/mean_terminated_length": 370.125, "completions/min_length": 239.0, "completions/min_terminated_length": 239.0, "epoch": 0.17630211095788692, "frac_reward_zero_std": 0.0, "grad_norm": 1.71875, "kl": 0.023815440014004707, "learning_rate": 7.3356e-06, "loss": 0.004, "num_tokens": 76700128.0, "reward": 2.9858880043029785, "reward_std": 0.08336754143238068, "rewards/reward_fn/mean": 2.9858880043029785, "rewards/reward_fn/std": 0.08336751163005829, "step": 1662 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1206.0, "completions/max_terminated_length": 1206.0, "completions/mean_length": 459.6875, "completions/mean_terminated_length": 459.6875, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "epoch": 0.17640818924366183, "frac_reward_zero_std": 0.0, "grad_norm": 1.265625, "kl": 0.01935293711721897, "learning_rate": 7.3352e-06, "loss": -0.2065, "num_tokens": 76758582.0, "reward": 3.0257115364074707, "reward_std": 0.9167580604553223, "rewards/reward_fn/mean": 3.0257115364074707, "rewards/reward_fn/std": 0.9167580008506775, "step": 1663 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 808.0, "completions/max_terminated_length": 808.0, "completions/mean_length": 320.25, "completions/mean_terminated_length": 320.25, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "epoch": 0.17651426752943672, "frac_reward_zero_std": 1.0, "grad_norm": 0.07177734375, "kl": 0.02033980656415224, "learning_rate": 7.3348000000000005e-06, "loss": 0.0008, "num_tokens": 76798270.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1664 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 811.0, "completions/max_terminated_length": 811.0, "completions/mean_length": 496.09375, "completions/mean_terminated_length": 496.09375, "completions/min_length": 277.0, "completions/min_terminated_length": 277.0, "epoch": 0.17662034581521163, "frac_reward_zero_std": 0.0, "grad_norm": 1.234375, "kl": 0.020821964601054788, "learning_rate": 7.3344e-06, "loss": 0.0254, "num_tokens": 76859617.0, "reward": 3.4188601970672607, "reward_std": 0.522709846496582, "rewards/reward_fn/mean": 3.4188601970672607, "rewards/reward_fn/std": 0.5227097868919373, "step": 1665 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 345.0, "completions/max_terminated_length": 345.0, "completions/mean_length": 212.25, "completions/mean_terminated_length": 212.25, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "epoch": 0.17672642410098652, "frac_reward_zero_std": 0.0, "grad_norm": 2.0625, "kl": 0.027814438799396157, "learning_rate": 7.334e-06, "loss": 0.0636, "num_tokens": 76905065.0, "reward": 2.8302299976348877, "reward_std": 0.05470141023397446, "rewards/reward_fn/mean": 2.8302299976348877, "rewards/reward_fn/std": 0.054701436311006546, "step": 1666 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1034.0, "completions/max_terminated_length": 1034.0, "completions/mean_length": 355.4375, "completions/mean_terminated_length": 355.4375, "completions/min_length": 246.0, "completions/min_terminated_length": 246.0, "epoch": 0.17683250238676143, "frac_reward_zero_std": 0.0, "grad_norm": 1.3984375, "kl": 0.023907755268737674, "learning_rate": 7.3335999999999995e-06, "loss": 0.0154, "num_tokens": 76972471.0, "reward": 3.9634013175964355, "reward_std": 0.20703355967998505, "rewards/reward_fn/mean": 3.9634013175964355, "rewards/reward_fn/std": 0.20703357458114624, "step": 1667 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 684.0, "completions/max_terminated_length": 684.0, "completions/mean_length": 216.09375, "completions/mean_terminated_length": 216.09375, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 0.17693858067253634, "frac_reward_zero_std": 0.0, "grad_norm": 1.6171875, "kl": 0.03942023706622422, "learning_rate": 7.3331999999999995e-06, "loss": 0.0491, "num_tokens": 77002138.0, "reward": 3.0410289764404297, "reward_std": 0.03184741735458374, "rewards/reward_fn/mean": 3.0410289764404297, "rewards/reward_fn/std": 0.03184738755226135, "step": 1668 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1047.0, "completions/max_terminated_length": 1047.0, "completions/mean_length": 390.9375, "completions/mean_terminated_length": 390.9375, "completions/min_length": 257.0, "completions/min_terminated_length": 257.0, "epoch": 0.17704465895831123, "frac_reward_zero_std": 0.0, "grad_norm": 1.2734375, "kl": 0.023845213698223233, "learning_rate": 7.3327999999999995e-06, "loss": 0.0154, "num_tokens": 77068280.0, "reward": 3.8936331272125244, "reward_std": 0.4476320743560791, "rewards/reward_fn/mean": 3.8936331272125244, "rewards/reward_fn/std": 0.4476320147514343, "step": 1669 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 496.0, "completions/mean_length": 432.8125, "completions/mean_terminated_length": 380.70965576171875, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "epoch": 0.17715073724408614, "frac_reward_zero_std": 0.0, "grad_norm": 0.67578125, "kl": 0.019716886803507805, "learning_rate": 7.3323999999999995e-06, "loss": 0.2615, "num_tokens": 77121362.0, "reward": 2.792949676513672, "reward_std": 0.5104948282241821, "rewards/reward_fn/mean": 2.792949676513672, "rewards/reward_fn/std": 0.5104948878288269, "step": 1670 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 399.0, "completions/max_terminated_length": 399.0, "completions/mean_length": 154.0625, "completions/mean_terminated_length": 154.0625, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.17725681552986103, "frac_reward_zero_std": 1.0, "grad_norm": 0.1474609375, "kl": 0.022587708896026015, "learning_rate": 7.3319999999999994e-06, "loss": 0.0009, "num_tokens": 77158388.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1671 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 195.28125, "completions/mean_terminated_length": 195.28125, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.17736289381563594, "frac_reward_zero_std": 0.0, "grad_norm": 2.203125, "kl": 0.023013029946014285, "learning_rate": 7.331599999999999e-06, "loss": -0.0029, "num_tokens": 77195357.0, "reward": 2.860653877258301, "reward_std": 0.23699697852134705, "rewards/reward_fn/mean": 2.860653877258301, "rewards/reward_fn/std": 0.23699693381786346, "step": 1672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 272.0, "completions/max_terminated_length": 272.0, "completions/mean_length": 163.375, "completions/mean_terminated_length": 163.375, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 0.17746897210141085, "frac_reward_zero_std": 1.0, "grad_norm": 0.10400390625, "kl": 0.021371350390836596, "learning_rate": 7.331199999999999e-06, "loss": 0.0009, "num_tokens": 77235081.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1673 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 545.0, "completions/max_terminated_length": 545.0, "completions/mean_length": 304.4375, "completions/mean_terminated_length": 304.4375, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "epoch": 0.17757505038718574, "frac_reward_zero_std": 0.0, "grad_norm": 1.609375, "kl": 0.023021605564281344, "learning_rate": 7.330799999999999e-06, "loss": 0.043, "num_tokens": 77280023.0, "reward": 2.8056857585906982, "reward_std": 0.30699968338012695, "rewards/reward_fn/mean": 2.8056857585906982, "rewards/reward_fn/std": 0.30699968338012695, "step": 1674 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1218.0, "completions/mean_length": 732.71875, "completions/mean_terminated_length": 690.290283203125, "completions/min_length": 333.0, "completions/min_terminated_length": 333.0, "epoch": 0.17768112867296065, "frac_reward_zero_std": 0.0, "grad_norm": 1.328125, "kl": 0.019517116714268923, "learning_rate": 7.330399999999999e-06, "loss": 0.1688, "num_tokens": 77344622.0, "reward": 2.619640588760376, "reward_std": 0.5425283312797546, "rewards/reward_fn/mean": 2.619640588760376, "rewards/reward_fn/std": 0.5425283312797546, "step": 1675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 672.0, "completions/max_terminated_length": 672.0, "completions/mean_length": 181.34375, "completions/mean_terminated_length": 181.34375, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.17778720695873554, "frac_reward_zero_std": 1.0, "grad_norm": 0.11376953125, "kl": 0.024739502929151058, "learning_rate": 7.33e-06, "loss": 0.001, "num_tokens": 77377721.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1676 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 152.0, "completions/max_terminated_length": 152.0, "completions/mean_length": 100.5, "completions/mean_terminated_length": 100.5, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "epoch": 0.17789328524451045, "frac_reward_zero_std": 1.0, "grad_norm": 0.142578125, "kl": 0.026414725929498672, "learning_rate": 7.3296e-06, "loss": 0.0011, "num_tokens": 77404681.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1677 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1167.0, "completions/max_terminated_length": 1167.0, "completions/mean_length": 275.75, "completions/mean_terminated_length": 275.75, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 0.17799936353028534, "frac_reward_zero_std": 1.0, "grad_norm": 0.11083984375, "kl": 0.02964741620235145, "learning_rate": 7.3292e-06, "loss": 0.0012, "num_tokens": 77443233.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1678 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 978.0, "completions/max_terminated_length": 978.0, "completions/mean_length": 253.375, "completions/mean_terminated_length": 253.375, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "epoch": 0.17810544181606025, "frac_reward_zero_std": 0.0, "grad_norm": 2.375, "kl": 0.02185642090626061, "learning_rate": 7.3288e-06, "loss": 0.0958, "num_tokens": 77487885.0, "reward": 3.9379520416259766, "reward_std": 0.2442624717950821, "rewards/reward_fn/mean": 3.9379520416259766, "rewards/reward_fn/std": 0.2442624568939209, "step": 1679 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1404.0, "completions/max_terminated_length": 1404.0, "completions/mean_length": 439.84375, "completions/mean_terminated_length": 439.84375, "completions/min_length": 286.0, "completions/min_terminated_length": 286.0, "epoch": 0.17821152010183516, "frac_reward_zero_std": 0.0, "grad_norm": 1.4609375, "kl": 0.022376101464033127, "learning_rate": 7.3284e-06, "loss": -0.1057, "num_tokens": 77546440.0, "reward": 2.7930126190185547, "reward_std": 0.36263859272003174, "rewards/reward_fn/mean": 2.7930126190185547, "rewards/reward_fn/std": 0.36263859272003174, "step": 1680 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1977.0, "completions/max_terminated_length": 1977.0, "completions/mean_length": 513.40625, "completions/mean_terminated_length": 513.40625, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "epoch": 0.17831759838761005, "frac_reward_zero_std": 0.0, "grad_norm": 1.2265625, "kl": 0.03152905683964491, "learning_rate": 7.328e-06, "loss": -0.087, "num_tokens": 77591349.0, "reward": 2.629180431365967, "reward_std": 0.29072603583335876, "rewards/reward_fn/mean": 2.629180431365967, "rewards/reward_fn/std": 0.29072603583335876, "step": 1681 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1536.0, "completions/max_terminated_length": 1536.0, "completions/mean_length": 461.46875, "completions/mean_terminated_length": 461.46875, "completions/min_length": 279.0, "completions/min_terminated_length": 279.0, "epoch": 0.17842367667338496, "frac_reward_zero_std": 0.0, "grad_norm": 1.3984375, "kl": 0.02138684457167983, "learning_rate": 7.3276e-06, "loss": 0.1139, "num_tokens": 77646148.0, "reward": 3.8168039321899414, "reward_std": 0.5191534161567688, "rewards/reward_fn/mean": 3.8168039321899414, "rewards/reward_fn/std": 0.5191534161567688, "step": 1682 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 799.0, "completions/max_terminated_length": 799.0, "completions/mean_length": 226.09375, "completions/mean_terminated_length": 226.09375, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 0.17852975495915985, "frac_reward_zero_std": 0.0, "grad_norm": 2.390625, "kl": 0.025132108945399523, "learning_rate": 7.3272e-06, "loss": 0.0778, "num_tokens": 77708775.0, "reward": 2.9388680458068848, "reward_std": 0.24276258051395416, "rewards/reward_fn/mean": 2.9388680458068848, "rewards/reward_fn/std": 0.2427625209093094, "step": 1683 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 986.0, "completions/max_terminated_length": 986.0, "completions/mean_length": 357.28125, "completions/mean_terminated_length": 357.28125, "completions/min_length": 238.0, "completions/min_terminated_length": 238.0, "epoch": 0.17863583324493476, "frac_reward_zero_std": 1.0, "grad_norm": 0.07275390625, "kl": 0.018456691992469132, "learning_rate": 7.3268e-06, "loss": 0.0007, "num_tokens": 77758864.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1684 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 90.0, "completions/max_terminated_length": 90.0, "completions/mean_length": 75.53125, "completions/mean_terminated_length": 75.53125, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "epoch": 0.17874191153070967, "frac_reward_zero_std": 1.0, "grad_norm": 0.1083984375, "kl": 0.010399193910416216, "learning_rate": 7.326399999999999e-06, "loss": 0.0004, "num_tokens": 77786081.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 170.0, "completions/max_terminated_length": 170.0, "completions/mean_length": 126.03125, "completions/mean_terminated_length": 126.03125, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "epoch": 0.17884798981648456, "frac_reward_zero_std": 1.0, "grad_norm": 0.115234375, "kl": 0.0217991154640913, "learning_rate": 7.325999999999999e-06, "loss": 0.0009, "num_tokens": 77822306.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1686 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1886.0, "completions/max_terminated_length": 1886.0, "completions/mean_length": 581.40625, "completions/mean_terminated_length": 581.40625, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.17895406810225947, "frac_reward_zero_std": 0.0, "grad_norm": 1.328125, "kl": 0.025571937672793865, "learning_rate": 7.325599999999999e-06, "loss": -0.0045, "num_tokens": 77876303.0, "reward": 2.0563902854919434, "reward_std": 0.5362445712089539, "rewards/reward_fn/mean": 2.0563902854919434, "rewards/reward_fn/std": 0.5362445712089539, "step": 1687 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 736.0, "completions/max_terminated_length": 736.0, "completions/mean_length": 267.875, "completions/mean_terminated_length": 267.875, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 0.17906014638803436, "frac_reward_zero_std": 0.0, "grad_norm": 1.9765625, "kl": 0.027823872631415725, "learning_rate": 7.3252e-06, "loss": 0.0438, "num_tokens": 77919531.0, "reward": 3.966395854949951, "reward_std": 0.19009362161159515, "rewards/reward_fn/mean": 3.966395854949951, "rewards/reward_fn/std": 0.19009362161159515, "step": 1688 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 905.0, "completions/max_terminated_length": 905.0, "completions/mean_length": 307.46875, "completions/mean_terminated_length": 307.46875, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.17916622467380927, "frac_reward_zero_std": 0.0, "grad_norm": 1.6015625, "kl": 0.02866238821297884, "learning_rate": 7.3248e-06, "loss": -0.063, "num_tokens": 77975002.0, "reward": 3.857930898666382, "reward_std": 0.559036374092102, "rewards/reward_fn/mean": 3.857930898666382, "rewards/reward_fn/std": 0.5590363144874573, "step": 1689 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 841.0, "completions/max_terminated_length": 841.0, "completions/mean_length": 193.375, "completions/mean_terminated_length": 193.375, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.17927230295958418, "frac_reward_zero_std": 1.0, "grad_norm": 0.10205078125, "kl": 0.02129594807047397, "learning_rate": 7.3244e-06, "loss": 0.0009, "num_tokens": 78026790.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1690 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 464.0, "completions/max_terminated_length": 464.0, "completions/mean_length": 240.96875, "completions/mean_terminated_length": 240.96875, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 0.17937838124535907, "frac_reward_zero_std": 0.0, "grad_norm": 2.09375, "kl": 0.020590995671227574, "learning_rate": 7.324e-06, "loss": 0.1261, "num_tokens": 78058405.0, "reward": 2.8747594356536865, "reward_std": 0.06020258367061615, "rewards/reward_fn/mean": 2.8747594356536865, "rewards/reward_fn/std": 0.06020255759358406, "step": 1691 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 462.0, "completions/max_terminated_length": 462.0, "completions/mean_length": 295.15625, "completions/mean_terminated_length": 295.15625, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "epoch": 0.17948445953113398, "frac_reward_zero_std": 0.0, "grad_norm": 1.8828125, "kl": 0.025204677367582917, "learning_rate": 7.3236e-06, "loss": 0.0591, "num_tokens": 78102698.0, "reward": 3.822904348373413, "reward_std": 0.41878360509872437, "rewards/reward_fn/mean": 3.822904348373413, "rewards/reward_fn/std": 0.418783575296402, "step": 1692 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 265.0, "completions/max_terminated_length": 265.0, "completions/mean_length": 159.40625, "completions/mean_terminated_length": 159.40625, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.17959053781690887, "frac_reward_zero_std": 0.0, "grad_norm": 2.625, "kl": 0.03215904091484845, "learning_rate": 7.3232e-06, "loss": 0.1193, "num_tokens": 78143831.0, "reward": 2.951641321182251, "reward_std": 0.02703152783215046, "rewards/reward_fn/mean": 2.951641321182251, "rewards/reward_fn/std": 0.02703148126602173, "step": 1693 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 2048.0, "completions/max_terminated_length": 810.0, "completions/mean_length": 535.96875, "completions/mean_terminated_length": 435.16668701171875, "completions/min_length": 256.0, "completions/min_terminated_length": 256.0, "epoch": 0.17969661610268378, "frac_reward_zero_std": 0.0, "grad_norm": 1.6328125, "kl": 0.025029084412381053, "learning_rate": 7.3228e-06, "loss": 0.3638, "num_tokens": 78197942.0, "reward": 2.569448471069336, "reward_std": 0.7918709516525269, "rewards/reward_fn/mean": 2.569448471069336, "rewards/reward_fn/std": 0.7918709516525269, "step": 1694 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1559.0, "completions/max_terminated_length": 1559.0, "completions/mean_length": 249.1875, "completions/mean_terminated_length": 249.1875, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 0.1798026943884587, "frac_reward_zero_std": 0.0, "grad_norm": 2.25, "kl": 0.028609147295355797, "learning_rate": 7.3223999999999996e-06, "loss": 0.2358, "num_tokens": 78249308.0, "reward": 3.925804615020752, "reward_std": 0.2920267581939697, "rewards/reward_fn/mean": 3.925804615020752, "rewards/reward_fn/std": 0.2920267581939697, "step": 1695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1599.0, "completions/max_terminated_length": 1599.0, "completions/mean_length": 509.09375, "completions/mean_terminated_length": 509.09375, "completions/min_length": 307.0, "completions/min_terminated_length": 307.0, "epoch": 0.17990877267423358, "frac_reward_zero_std": 0.0, "grad_norm": 1.1484375, "kl": 0.01797068677842617, "learning_rate": 7.3219999999999995e-06, "loss": 0.0474, "num_tokens": 78312799.0, "reward": 2.6847097873687744, "reward_std": 0.18323646485805511, "rewards/reward_fn/mean": 2.6847097873687744, "rewards/reward_fn/std": 0.18323642015457153, "step": 1696 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 448.0, "completions/max_terminated_length": 448.0, "completions/mean_length": 176.28125, "completions/mean_terminated_length": 176.28125, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.1800148509600085, "frac_reward_zero_std": 0.0, "grad_norm": 2.125, "kl": 0.026832085801288486, "learning_rate": 7.3215999999999995e-06, "loss": 0.1148, "num_tokens": 78338376.0, "reward": 2.6626052856445312, "reward_std": 0.06326717883348465, "rewards/reward_fn/mean": 2.6626052856445312, "rewards/reward_fn/std": 0.06326717138290405, "step": 1697 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 948.0, "completions/max_terminated_length": 948.0, "completions/mean_length": 326.28125, "completions/mean_terminated_length": 326.28125, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "epoch": 0.18012092924578338, "frac_reward_zero_std": 0.0, "grad_norm": 1.5703125, "kl": 0.023031077347695827, "learning_rate": 7.3211999999999995e-06, "loss": 0.0221, "num_tokens": 78392721.0, "reward": 3.9276814460754395, "reward_std": 0.40909603238105774, "rewards/reward_fn/mean": 3.9276814460754395, "rewards/reward_fn/std": 0.40909600257873535, "step": 1698 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 329.0, "completions/max_terminated_length": 329.0, "completions/mean_length": 219.40625, "completions/mean_terminated_length": 219.40625, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 0.1802270075315583, "frac_reward_zero_std": 0.0, "grad_norm": 2.375, "kl": 0.02422755560837686, "learning_rate": 7.3208e-06, "loss": 0.0462, "num_tokens": 78431102.0, "reward": 3.896008014678955, "reward_std": 0.3290996253490448, "rewards/reward_fn/mean": 3.896008014678955, "rewards/reward_fn/std": 0.3290995657444, "step": 1699 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 2048.0, "completions/max_terminated_length": 475.0, "completions/mean_length": 383.46875, "completions/mean_terminated_length": 272.5, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.1803330858173332, "frac_reward_zero_std": 0.0, "grad_norm": 0.98046875, "kl": 0.018242677440866828, "learning_rate": 7.3204e-06, "loss": 0.3524, "num_tokens": 78500333.0, "reward": 3.309962511062622, "reward_std": 1.0754379034042358, "rewards/reward_fn/mean": 3.309962511062622, "rewards/reward_fn/std": 1.0754379034042358, "step": 1700 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 908.0, "completions/max_terminated_length": 908.0, "completions/mean_length": 347.96875, "completions/mean_terminated_length": 347.96875, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "epoch": 0.1804391641031081, "frac_reward_zero_std": 0.0, "grad_norm": 1.734375, "kl": 0.016349896206520498, "learning_rate": 7.32e-06, "loss": 0.1304, "num_tokens": 78567532.0, "reward": 3.853870153427124, "reward_std": 0.5750675797462463, "rewards/reward_fn/mean": 3.853870153427124, "rewards/reward_fn/std": 0.5750675797462463, "step": 1701 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 558.0, "completions/max_terminated_length": 558.0, "completions/mean_length": 168.375, "completions/mean_terminated_length": 168.375, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "epoch": 0.180545242388883, "frac_reward_zero_std": 1.0, "grad_norm": 0.09814453125, "kl": 0.025941829895600677, "learning_rate": 7.3196e-06, "loss": 0.001, "num_tokens": 78615864.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1702 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 870.0, "completions/max_terminated_length": 870.0, "completions/mean_length": 275.125, "completions/mean_terminated_length": 275.125, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 0.1806513206746579, "frac_reward_zero_std": 1.0, "grad_norm": 0.07421875, "kl": 0.02063106745481491, "learning_rate": 7.3192e-06, "loss": 0.0008, "num_tokens": 78639100.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1703 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 634.0, "completions/max_terminated_length": 634.0, "completions/mean_length": 210.1875, "completions/mean_terminated_length": 210.1875, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 0.1807573989604328, "frac_reward_zero_std": 0.0, "grad_norm": 2.03125, "kl": 0.024305471451953053, "learning_rate": 7.3188e-06, "loss": 0.0736, "num_tokens": 78684962.0, "reward": 3.971139907836914, "reward_std": 0.16325806081295013, "rewards/reward_fn/mean": 3.971139907836914, "rewards/reward_fn/std": 0.16325809061527252, "step": 1704 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1198.0, "completions/max_terminated_length": 1198.0, "completions/mean_length": 321.625, "completions/mean_terminated_length": 321.625, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "epoch": 0.1808634772462077, "frac_reward_zero_std": 0.0, "grad_norm": 1.7109375, "kl": 0.021618828177452087, "learning_rate": 7.318399999999999e-06, "loss": 0.0133, "num_tokens": 78735606.0, "reward": 3.964750289916992, "reward_std": 0.1994020640850067, "rewards/reward_fn/mean": 3.964750289916992, "rewards/reward_fn/std": 0.19940204918384552, "step": 1705 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 249.0, "completions/max_terminated_length": 249.0, "completions/mean_length": 194.53125, "completions/mean_terminated_length": 194.53125, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 0.1809695555319826, "frac_reward_zero_std": 1.0, "grad_norm": 0.0908203125, "kl": 0.017645681044086814, "learning_rate": 7.317999999999999e-06, "loss": 0.0007, "num_tokens": 78767687.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1706 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 258.0, "completions/max_terminated_length": 258.0, "completions/mean_length": 162.25, "completions/mean_terminated_length": 162.25, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 0.18107563381775751, "frac_reward_zero_std": 1.0, "grad_norm": 0.1162109375, "kl": 0.021362949744798243, "learning_rate": 7.317599999999999e-06, "loss": 0.0009, "num_tokens": 78815471.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1707 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 675.0, "completions/max_terminated_length": 675.0, "completions/mean_length": 240.90625, "completions/mean_terminated_length": 240.90625, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 0.1811817121035324, "frac_reward_zero_std": 0.0, "grad_norm": 2.15625, "kl": 0.021373329684138298, "learning_rate": 7.317199999999999e-06, "loss": -0.0466, "num_tokens": 78869164.0, "reward": 1.739349365234375, "reward_std": 0.4158928692340851, "rewards/reward_fn/mean": 1.739349365234375, "rewards/reward_fn/std": 0.4158928096294403, "step": 1708 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1127.0, "completions/max_terminated_length": 1127.0, "completions/mean_length": 254.40625, "completions/mean_terminated_length": 254.40625, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 0.1812877903893073, "frac_reward_zero_std": 0.0, "grad_norm": 1.4140625, "kl": 0.022718794643878937, "learning_rate": 7.316799999999999e-06, "loss": -0.0496, "num_tokens": 78913561.0, "reward": 3.9045119285583496, "reward_std": 0.30215689539909363, "rewards/reward_fn/mean": 3.9045119285583496, "rewards/reward_fn/std": 0.30215683579444885, "step": 1709 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1688.0, "completions/mean_length": 786.8125, "completions/mean_terminated_length": 746.1290283203125, "completions/min_length": 283.0, "completions/min_terminated_length": 283.0, "epoch": 0.1813938686750822, "frac_reward_zero_std": 0.0, "grad_norm": 1.328125, "kl": 0.020097526721656322, "learning_rate": 7.316399999999999e-06, "loss": 0.1906, "num_tokens": 78967091.0, "reward": 2.3898777961730957, "reward_std": 0.6157297492027283, "rewards/reward_fn/mean": 2.3898777961730957, "rewards/reward_fn/std": 0.6157297492027283, "step": 1710 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1443.0, "completions/max_terminated_length": 1443.0, "completions/mean_length": 439.78125, "completions/mean_terminated_length": 439.78125, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 0.1814999469608571, "frac_reward_zero_std": 0.0, "grad_norm": 1.5625, "kl": 0.025020675268024206, "learning_rate": 7.316e-06, "loss": 0.007, "num_tokens": 78999244.0, "reward": 3.8277320861816406, "reward_std": 0.571384847164154, "rewards/reward_fn/mean": 3.8277320861816406, "rewards/reward_fn/std": 0.571384847164154, "step": 1711 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 447.0, "completions/max_terminated_length": 447.0, "completions/mean_length": 107.59375, "completions/mean_terminated_length": 107.59375, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "epoch": 0.18160602524663202, "frac_reward_zero_std": 1.0, "grad_norm": 0.10595703125, "kl": 0.014298198861069977, "learning_rate": 7.3156e-06, "loss": 0.0006, "num_tokens": 79037471.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1712 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1139.0, "completions/max_terminated_length": 1139.0, "completions/mean_length": 489.375, "completions/mean_terminated_length": 489.375, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "epoch": 0.1817121035324069, "frac_reward_zero_std": 0.0, "grad_norm": 1.21875, "kl": 0.019865088164806366, "learning_rate": 7.3152e-06, "loss": -0.0399, "num_tokens": 79087947.0, "reward": 3.5038528442382812, "reward_std": 0.6733208298683167, "rewards/reward_fn/mean": 3.5038528442382812, "rewards/reward_fn/std": 0.6733208298683167, "step": 1713 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1621.0, "completions/mean_length": 738.0625, "completions/mean_terminated_length": 650.7333374023438, "completions/min_length": 298.0, "completions/min_terminated_length": 298.0, "epoch": 0.18181818181818182, "frac_reward_zero_std": 0.0, "grad_norm": 1.5625, "kl": 0.0224718798417598, "learning_rate": 7.3148e-06, "loss": 0.1838, "num_tokens": 79141901.0, "reward": 2.2625229358673096, "reward_std": 0.8606188297271729, "rewards/reward_fn/mean": 2.2625229358673096, "rewards/reward_fn/std": 0.8606187105178833, "step": 1714 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 382.0, "completions/max_terminated_length": 382.0, "completions/mean_length": 270.8125, "completions/mean_terminated_length": 270.8125, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 0.1819242601039567, "frac_reward_zero_std": 1.0, "grad_norm": 0.1005859375, "kl": 0.024445211980491877, "learning_rate": 7.3144e-06, "loss": 0.001, "num_tokens": 79185863.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1562.0, "completions/max_terminated_length": 1562.0, "completions/mean_length": 340.28125, "completions/mean_terminated_length": 340.28125, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 0.18203033838973162, "frac_reward_zero_std": 0.0, "grad_norm": 1.5546875, "kl": 0.02192999073304236, "learning_rate": 7.314e-06, "loss": -0.0481, "num_tokens": 79243696.0, "reward": 3.75516676902771, "reward_std": 0.470197856426239, "rewards/reward_fn/mean": 3.75516676902771, "rewards/reward_fn/std": 0.470197856426239, "step": 1716 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 797.0, "completions/max_terminated_length": 797.0, "completions/mean_length": 412.375, "completions/mean_terminated_length": 412.375, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "epoch": 0.18213641667550654, "frac_reward_zero_std": 0.0, "grad_norm": 1.3359375, "kl": 0.017105724662542343, "learning_rate": 7.3136e-06, "loss": 0.0388, "num_tokens": 79273980.0, "reward": 3.72775936126709, "reward_std": 0.4455103576183319, "rewards/reward_fn/mean": 3.72775936126709, "rewards/reward_fn/std": 0.4455103576183319, "step": 1717 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 870.0, "completions/max_terminated_length": 870.0, "completions/mean_length": 374.875, "completions/mean_terminated_length": 374.875, "completions/min_length": 256.0, "completions/min_terminated_length": 256.0, "epoch": 0.18224249496128142, "frac_reward_zero_std": 0.0, "grad_norm": 1.203125, "kl": 0.01594231475610286, "learning_rate": 7.3132e-06, "loss": 0.0312, "num_tokens": 79324824.0, "reward": 2.8251233100891113, "reward_std": 0.0379062183201313, "rewards/reward_fn/mean": 2.8251233100891113, "rewards/reward_fn/std": 0.03790617734193802, "step": 1718 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 811.0, "completions/max_terminated_length": 811.0, "completions/mean_length": 228.09375, "completions/mean_terminated_length": 228.09375, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 0.18234857324705633, "frac_reward_zero_std": 0.0, "grad_norm": 1.6796875, "kl": 0.02496536774560809, "learning_rate": 7.3128e-06, "loss": 0.0136, "num_tokens": 79382683.0, "reward": 3.88787579536438, "reward_std": 0.4645228385925293, "rewards/reward_fn/mean": 3.88787579536438, "rewards/reward_fn/std": 0.4645228087902069, "step": 1719 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1386.0, "completions/max_terminated_length": 1386.0, "completions/mean_length": 340.71875, "completions/mean_terminated_length": 340.71875, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.18245465153283122, "frac_reward_zero_std": 0.0, "grad_norm": 1.703125, "kl": 0.02118815656285733, "learning_rate": 7.3124e-06, "loss": 0.0761, "num_tokens": 79439858.0, "reward": 2.998422622680664, "reward_std": 0.06069687008857727, "rewards/reward_fn/mean": 2.998422622680664, "rewards/reward_fn/std": 0.060696884989738464, "step": 1720 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1661.0, "completions/max_terminated_length": 1661.0, "completions/mean_length": 414.15625, "completions/mean_terminated_length": 414.15625, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.18256072981860613, "frac_reward_zero_std": 0.0, "grad_norm": 1.7109375, "kl": 0.029617647174745798, "learning_rate": 7.312e-06, "loss": 0.0364, "num_tokens": 79483319.0, "reward": 2.8089561462402344, "reward_std": 0.05686548724770546, "rewards/reward_fn/mean": 2.8089561462402344, "rewards/reward_fn/std": 0.05686549097299576, "step": 1721 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1007.0, "completions/max_terminated_length": 1007.0, "completions/mean_length": 184.8125, "completions/mean_terminated_length": 184.8125, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.18266680810438105, "frac_reward_zero_std": 0.0, "grad_norm": 2.5, "kl": 0.020757826045155525, "learning_rate": 7.3116e-06, "loss": -0.0682, "num_tokens": 79523601.0, "reward": 2.8676366806030273, "reward_std": 0.06009431183338165, "rewards/reward_fn/mean": 2.8676366806030273, "rewards/reward_fn/std": 0.060094304382801056, "step": 1722 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 821.0, "completions/max_terminated_length": 821.0, "completions/mean_length": 275.34375, "completions/mean_terminated_length": 275.34375, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.18277288639015593, "frac_reward_zero_std": 0.0, "grad_norm": 1.6953125, "kl": 0.02504241978749633, "learning_rate": 7.3112e-06, "loss": -0.0056, "num_tokens": 79566236.0, "reward": 2.6496548652648926, "reward_std": 0.2010585367679596, "rewards/reward_fn/mean": 2.6496548652648926, "rewards/reward_fn/std": 0.2010585218667984, "step": 1723 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1770.0, "completions/max_terminated_length": 1770.0, "completions/mean_length": 679.5, "completions/mean_terminated_length": 679.5, "completions/min_length": 354.0, "completions/min_terminated_length": 354.0, "epoch": 0.18287896467593084, "frac_reward_zero_std": 0.0, "grad_norm": 1.546875, "kl": 0.019714355003088713, "learning_rate": 7.3108e-06, "loss": 0.1588, "num_tokens": 79649772.0, "reward": 3.8880138397216797, "reward_std": 0.4623366594314575, "rewards/reward_fn/mean": 3.8880138397216797, "rewards/reward_fn/std": 0.4623366594314575, "step": 1724 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 583.0, "completions/max_terminated_length": 583.0, "completions/mean_length": 177.46875, "completions/mean_terminated_length": 177.46875, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "epoch": 0.18298504296170573, "frac_reward_zero_std": 0.0, "grad_norm": 2.203125, "kl": 0.019116016919724643, "learning_rate": 7.3103999999999995e-06, "loss": -0.0512, "num_tokens": 79694267.0, "reward": 3.8662519454956055, "reward_std": 0.5263176560401917, "rewards/reward_fn/mean": 3.8662519454956055, "rewards/reward_fn/std": 0.5263176560401917, "step": 1725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1104.0, "completions/max_terminated_length": 1104.0, "completions/mean_length": 382.09375, "completions/mean_terminated_length": 382.09375, "completions/min_length": 267.0, "completions/min_terminated_length": 267.0, "epoch": 0.18309112124748064, "frac_reward_zero_std": 0.0, "grad_norm": 1.296875, "kl": 0.017504334566183388, "learning_rate": 7.3099999999999995e-06, "loss": 0.0785, "num_tokens": 79748798.0, "reward": 3.590195655822754, "reward_std": 0.575702428817749, "rewards/reward_fn/mean": 3.590195655822754, "rewards/reward_fn/std": 0.5757024884223938, "step": 1726 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 178.0, "completions/max_terminated_length": 178.0, "completions/mean_length": 129.59375, "completions/mean_terminated_length": 129.59375, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.18319719953325556, "frac_reward_zero_std": 1.0, "grad_norm": 0.068359375, "kl": 0.012138587655499578, "learning_rate": 7.3095999999999994e-06, "loss": 0.0005, "num_tokens": 79783569.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1727 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1365.0, "completions/max_terminated_length": 1365.0, "completions/mean_length": 294.65625, "completions/mean_terminated_length": 294.65625, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "epoch": 0.18330327781903044, "frac_reward_zero_std": 1.0, "grad_norm": 0.07568359375, "kl": 0.021464786026626825, "learning_rate": 7.309199999999999e-06, "loss": 0.0009, "num_tokens": 79829862.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1728 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1002.0, "completions/max_terminated_length": 1002.0, "completions/mean_length": 289.34375, "completions/mean_terminated_length": 289.34375, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 0.18340935610480535, "frac_reward_zero_std": 0.0, "grad_norm": 1.9921875, "kl": 0.032017044024541974, "learning_rate": 7.308799999999999e-06, "loss": 0.1641, "num_tokens": 79871569.0, "reward": 3.642909288406372, "reward_std": 0.6748332977294922, "rewards/reward_fn/mean": 3.642909288406372, "rewards/reward_fn/std": 0.6748332977294922, "step": 1729 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1716.0, "completions/mean_length": 1106.5625, "completions/mean_terminated_length": 889.3077392578125, "completions/min_length": 318.0, "completions/min_terminated_length": 318.0, "epoch": 0.18351543439058024, "frac_reward_zero_std": 0.0, "grad_norm": 1.1171875, "kl": 0.01772727456409484, "learning_rate": 7.308399999999999e-06, "loss": 0.3112, "num_tokens": 79935683.0, "reward": 2.0223124027252197, "reward_std": 1.02803635597229, "rewards/reward_fn/mean": 2.0223124027252197, "rewards/reward_fn/std": 1.02803635597229, "step": 1730 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 226.0, "completions/max_terminated_length": 226.0, "completions/mean_length": 178.65625, "completions/mean_terminated_length": 178.65625, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.18362151267635515, "frac_reward_zero_std": 1.0, "grad_norm": 0.11181640625, "kl": 0.021037452504970133, "learning_rate": 7.307999999999999e-06, "loss": 0.0008, "num_tokens": 79976280.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1731 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 691.0, "completions/max_terminated_length": 691.0, "completions/mean_length": 354.9375, "completions/mean_terminated_length": 354.9375, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "epoch": 0.18372759096213004, "frac_reward_zero_std": 0.0, "grad_norm": 1.90625, "kl": 0.030045943101868033, "learning_rate": 7.307599999999999e-06, "loss": 0.0632, "num_tokens": 80034390.0, "reward": 3.2148056030273438, "reward_std": 0.7227078080177307, "rewards/reward_fn/mean": 3.2148056030273438, "rewards/reward_fn/std": 0.7227078080177307, "step": 1732 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1008.0, "completions/max_terminated_length": 1008.0, "completions/mean_length": 293.21875, "completions/mean_terminated_length": 293.21875, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "epoch": 0.18383366924790495, "frac_reward_zero_std": 0.0, "grad_norm": 1.3671875, "kl": 0.01632210414391011, "learning_rate": 7.307199999999999e-06, "loss": 0.0775, "num_tokens": 80096061.0, "reward": 3.0696334838867188, "reward_std": 0.03446883708238602, "rewards/reward_fn/mean": 3.0696334838867188, "rewards/reward_fn/std": 0.034468866884708405, "step": 1733 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 747.0, "completions/max_terminated_length": 747.0, "completions/mean_length": 285.28125, "completions/mean_terminated_length": 285.28125, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "epoch": 0.18393974753367986, "frac_reward_zero_std": 0.0, "grad_norm": 1.5546875, "kl": 0.021304041147232056, "learning_rate": 7.306799999999999e-06, "loss": -0.0633, "num_tokens": 80153734.0, "reward": 3.1486945152282715, "reward_std": 0.6498160362243652, "rewards/reward_fn/mean": 3.1486945152282715, "rewards/reward_fn/std": 0.6498160362243652, "step": 1734 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1056.0, "completions/max_terminated_length": 1056.0, "completions/mean_length": 246.09375, "completions/mean_terminated_length": 246.09375, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.18404582581945475, "frac_reward_zero_std": 0.0, "grad_norm": 1.6171875, "kl": 0.02485991013236344, "learning_rate": 7.3064e-06, "loss": -0.0315, "num_tokens": 80200777.0, "reward": 3.749239444732666, "reward_std": 0.4819631576538086, "rewards/reward_fn/mean": 3.749239444732666, "rewards/reward_fn/std": 0.4819631576538086, "step": 1735 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1065.0, "completions/max_terminated_length": 1065.0, "completions/mean_length": 242.125, "completions/mean_terminated_length": 242.125, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.18415190410522966, "frac_reward_zero_std": 0.0, "grad_norm": 1.8984375, "kl": 0.023410562425851822, "learning_rate": 7.306e-06, "loss": 0.0159, "num_tokens": 80245261.0, "reward": 3.9407196044921875, "reward_std": 0.2346184253692627, "rewards/reward_fn/mean": 3.9407196044921875, "rewards/reward_fn/std": 0.2346184253692627, "step": 1736 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1550.0, "completions/max_terminated_length": 1550.0, "completions/mean_length": 397.46875, "completions/mean_terminated_length": 397.46875, "completions/min_length": 241.0, "completions/min_terminated_length": 241.0, "epoch": 0.18425798239100455, "frac_reward_zero_std": 0.0, "grad_norm": 1.5390625, "kl": 0.026979970512911677, "learning_rate": 7.3056e-06, "loss": -0.0341, "num_tokens": 80297532.0, "reward": 3.124235153198242, "reward_std": 0.7219064831733704, "rewards/reward_fn/mean": 3.124235153198242, "rewards/reward_fn/std": 0.7219064831733704, "step": 1737 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 457.0, "completions/max_terminated_length": 457.0, "completions/mean_length": 326.0, "completions/mean_terminated_length": 326.0, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "epoch": 0.18436406067677946, "frac_reward_zero_std": 0.0, "grad_norm": 1.5078125, "kl": 0.03350239293649793, "learning_rate": 7.3052e-06, "loss": -0.0451, "num_tokens": 80336412.0, "reward": 2.874659299850464, "reward_std": 0.19675695896148682, "rewards/reward_fn/mean": 2.874659299850464, "rewards/reward_fn/std": 0.19675692915916443, "step": 1738 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1408.0, "completions/max_terminated_length": 1408.0, "completions/mean_length": 400.5, "completions/mean_terminated_length": 400.5, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "epoch": 0.18447013896255438, "frac_reward_zero_std": 0.0, "grad_norm": 1.546875, "kl": 0.021226889453828335, "learning_rate": 7.3048e-06, "loss": 0.0089, "num_tokens": 80377772.0, "reward": 2.802847385406494, "reward_std": 0.04565891623497009, "rewards/reward_fn/mean": 2.802847385406494, "rewards/reward_fn/std": 0.045658860355615616, "step": 1739 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 845.0, "completions/max_terminated_length": 845.0, "completions/mean_length": 396.5, "completions/mean_terminated_length": 396.5, "completions/min_length": 236.0, "completions/min_terminated_length": 236.0, "epoch": 0.18457621724832926, "frac_reward_zero_std": 0.0, "grad_norm": 1.390625, "kl": 0.02231617399957031, "learning_rate": 7.3044e-06, "loss": -0.1365, "num_tokens": 80411356.0, "reward": 3.0051422119140625, "reward_std": 0.4885551333427429, "rewards/reward_fn/mean": 3.0051422119140625, "rewards/reward_fn/std": 0.4885551631450653, "step": 1740 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1446.0, "completions/max_terminated_length": 1446.0, "completions/mean_length": 328.15625, "completions/mean_terminated_length": 328.15625, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.18468229553410417, "frac_reward_zero_std": 0.0, "grad_norm": 1.859375, "kl": 0.02340529253706336, "learning_rate": 7.304e-06, "loss": 0.0033, "num_tokens": 80457441.0, "reward": 3.927124261856079, "reward_std": 0.41224780678749084, "rewards/reward_fn/mean": 3.927124261856079, "rewards/reward_fn/std": 0.41224780678749084, "step": 1741 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1631.0, "completions/max_terminated_length": 1631.0, "completions/mean_length": 397.21875, "completions/mean_terminated_length": 397.21875, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 0.18478837381987906, "frac_reward_zero_std": 0.0, "grad_norm": 1.6015625, "kl": 0.024748707422986627, "learning_rate": 7.3036e-06, "loss": 0.0071, "num_tokens": 80493704.0, "reward": 2.719790458679199, "reward_std": 0.18041294813156128, "rewards/reward_fn/mean": 2.719790458679199, "rewards/reward_fn/std": 0.18041293323040009, "step": 1742 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 265.0, "completions/max_terminated_length": 265.0, "completions/mean_length": 198.84375, "completions/mean_terminated_length": 198.84375, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "epoch": 0.18489445210565397, "frac_reward_zero_std": 1.0, "grad_norm": 0.10888671875, "kl": 0.025478441501036286, "learning_rate": 7.3032e-06, "loss": 0.001, "num_tokens": 80528579.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1743 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1338.0, "completions/max_terminated_length": 1338.0, "completions/mean_length": 317.625, "completions/mean_terminated_length": 317.625, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.18500053039142889, "frac_reward_zero_std": 0.0, "grad_norm": 1.671875, "kl": 0.023378330282866955, "learning_rate": 7.3028e-06, "loss": -0.0361, "num_tokens": 80576375.0, "reward": 3.9293417930603027, "reward_std": 0.27804508805274963, "rewards/reward_fn/mean": 3.9293417930603027, "rewards/reward_fn/std": 0.278045117855072, "step": 1744 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1164.0, "completions/max_terminated_length": 1164.0, "completions/mean_length": 316.71875, "completions/mean_terminated_length": 316.71875, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "epoch": 0.18510660867720377, "frac_reward_zero_std": 0.0, "grad_norm": 1.265625, "kl": 0.015432185959070921, "learning_rate": 7.302399999999999e-06, "loss": 0.0778, "num_tokens": 80629230.0, "reward": 2.648890972137451, "reward_std": 0.05023077875375748, "rewards/reward_fn/mean": 2.648890972137451, "rewards/reward_fn/std": 0.050230756402015686, "step": 1745 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 426.0, "completions/max_terminated_length": 426.0, "completions/mean_length": 192.3125, "completions/mean_terminated_length": 192.3125, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 0.18521268696297868, "frac_reward_zero_std": 1.0, "grad_norm": 0.099609375, "kl": 0.028556083096191287, "learning_rate": 7.301999999999999e-06, "loss": 0.0011, "num_tokens": 80658936.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1746 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1043.0, "completions/max_terminated_length": 1043.0, "completions/mean_length": 510.15625, "completions/mean_terminated_length": 510.15625, "completions/min_length": 259.0, "completions/min_terminated_length": 259.0, "epoch": 0.18531876524875357, "frac_reward_zero_std": 0.0, "grad_norm": 1.046875, "kl": 0.02161628007888794, "learning_rate": 7.3016e-06, "loss": 0.0176, "num_tokens": 80715997.0, "reward": 2.8921079635620117, "reward_std": 0.3677787184715271, "rewards/reward_fn/mean": 2.8921079635620117, "rewards/reward_fn/std": 0.3677787482738495, "step": 1747 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 476.0, "completions/max_terminated_length": 476.0, "completions/mean_length": 191.84375, "completions/mean_terminated_length": 191.84375, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 0.18542484353452848, "frac_reward_zero_std": 0.0, "grad_norm": 1.984375, "kl": 0.029479774879291654, "learning_rate": 7.3012e-06, "loss": -0.0478, "num_tokens": 80761304.0, "reward": 3.9663615226745605, "reward_std": 0.19028803706169128, "rewards/reward_fn/mean": 3.9663615226745605, "rewards/reward_fn/std": 0.19028803706169128, "step": 1748 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1029.0, "completions/max_terminated_length": 1029.0, "completions/mean_length": 340.59375, "completions/mean_terminated_length": 340.59375, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 0.1855309218203034, "frac_reward_zero_std": 1.0, "grad_norm": 0.0791015625, "kl": 0.025199016323313117, "learning_rate": 7.3008e-06, "loss": 0.001, "num_tokens": 80808139.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1749 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 754.0, "completions/max_terminated_length": 754.0, "completions/mean_length": 221.4375, "completions/mean_terminated_length": 221.4375, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.18563700010607828, "frac_reward_zero_std": 0.0, "grad_norm": 2.546875, "kl": 0.02979375934228301, "learning_rate": 7.3004e-06, "loss": 0.1565, "num_tokens": 80865401.0, "reward": 3.743729591369629, "reward_std": 0.6451296210289001, "rewards/reward_fn/mean": 3.743729591369629, "rewards/reward_fn/std": 0.6451296210289001, "step": 1750 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 900.0, "completions/max_terminated_length": 900.0, "completions/mean_length": 196.0, "completions/mean_terminated_length": 196.0, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 0.1857430783918532, "frac_reward_zero_std": 0.0, "grad_norm": 1.953125, "kl": 0.04355457844212651, "learning_rate": 7.2999999999999996e-06, "loss": -0.0124, "num_tokens": 80906489.0, "reward": 3.9675936698913574, "reward_std": 0.18331791460514069, "rewards/reward_fn/mean": 3.9675936698913574, "rewards/reward_fn/std": 0.18331791460514069, "step": 1751 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 762.0, "completions/max_terminated_length": 762.0, "completions/mean_length": 537.4375, "completions/mean_terminated_length": 537.4375, "completions/min_length": 335.0, "completions/min_terminated_length": 335.0, "epoch": 0.18584915667762808, "frac_reward_zero_std": 0.0, "grad_norm": 1.1953125, "kl": 0.01735016261227429, "learning_rate": 7.2995999999999995e-06, "loss": 0.0577, "num_tokens": 80971911.0, "reward": 2.747972249984741, "reward_std": 0.04789024218916893, "rewards/reward_fn/mean": 2.747972249984741, "rewards/reward_fn/std": 0.04789023473858833, "step": 1752 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1241.0, "completions/max_terminated_length": 1241.0, "completions/mean_length": 360.625, "completions/mean_terminated_length": 360.625, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "epoch": 0.185955234963403, "frac_reward_zero_std": 0.0, "grad_norm": 1.6640625, "kl": 0.016449951217509806, "learning_rate": 7.2991999999999995e-06, "loss": 0.0972, "num_tokens": 81028475.0, "reward": 3.906682252883911, "reward_std": 0.29498347640037537, "rewards/reward_fn/mean": 3.906682252883911, "rewards/reward_fn/std": 0.29498350620269775, "step": 1753 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 316.0, "completions/max_terminated_length": 316.0, "completions/mean_length": 197.6875, "completions/mean_terminated_length": 197.6875, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 0.1860613132491779, "frac_reward_zero_std": 0.0, "grad_norm": 2.0, "kl": 0.024210747331380844, "learning_rate": 7.2987999999999995e-06, "loss": 0.044, "num_tokens": 81070289.0, "reward": 2.903296947479248, "reward_std": 0.4796822965145111, "rewards/reward_fn/mean": 2.903296947479248, "rewards/reward_fn/std": 0.4796823263168335, "step": 1754 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1133.0, "completions/max_terminated_length": 1133.0, "completions/mean_length": 335.09375, "completions/mean_terminated_length": 335.09375, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 0.1861673915349528, "frac_reward_zero_std": 0.0, "grad_norm": 1.703125, "kl": 0.023606545757502317, "learning_rate": 7.2983999999999994e-06, "loss": 0.0337, "num_tokens": 81119316.0, "reward": 3.511998176574707, "reward_std": 0.6018990278244019, "rewards/reward_fn/mean": 3.511998176574707, "rewards/reward_fn/std": 0.6018990278244019, "step": 1755 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1570.0, "completions/mean_length": 1084.625, "completions/mean_terminated_length": 1020.4000244140625, "completions/min_length": 552.0, "completions/min_terminated_length": 552.0, "epoch": 0.1862734698207277, "frac_reward_zero_std": 0.0, "grad_norm": 0.953125, "kl": 0.01310090278275311, "learning_rate": 7.297999999999999e-06, "loss": 0.1005, "num_tokens": 81194376.0, "reward": 3.3202013969421387, "reward_std": 1.109673261642456, "rewards/reward_fn/mean": 3.3202013969421387, "rewards/reward_fn/std": 1.109673261642456, "step": 1756 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 863.0, "completions/max_terminated_length": 863.0, "completions/mean_length": 269.96875, "completions/mean_terminated_length": 269.96875, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 0.1863795481065026, "frac_reward_zero_std": 0.0, "grad_norm": 1.75, "kl": 0.02232472668401897, "learning_rate": 7.297599999999999e-06, "loss": -0.0044, "num_tokens": 81246823.0, "reward": 2.9757986068725586, "reward_std": 0.21454162895679474, "rewards/reward_fn/mean": 2.9757986068725586, "rewards/reward_fn/std": 0.21454162895679474, "step": 1757 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 703.0, "completions/max_terminated_length": 703.0, "completions/mean_length": 238.03125, "completions/mean_terminated_length": 238.03125, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 0.1864856263922775, "frac_reward_zero_std": 0.0, "grad_norm": 1.7578125, "kl": 0.02410236350260675, "learning_rate": 7.2972e-06, "loss": 0.0092, "num_tokens": 81283624.0, "reward": 3.3134825229644775, "reward_std": 0.43799471855163574, "rewards/reward_fn/mean": 3.3134825229644775, "rewards/reward_fn/std": 0.43799474835395813, "step": 1758 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 278.0, "completions/max_terminated_length": 278.0, "completions/mean_length": 126.21875, "completions/mean_terminated_length": 126.21875, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 0.1865917046780524, "frac_reward_zero_std": 0.0, "grad_norm": 2.359375, "kl": 0.019084823317825794, "learning_rate": 7.2968e-06, "loss": 0.1121, "num_tokens": 81319631.0, "reward": 2.9211864471435547, "reward_std": 0.029545731842517853, "rewards/reward_fn/mean": 2.9211864471435547, "rewards/reward_fn/std": 0.0295457411557436, "step": 1759 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1060.0, "completions/max_terminated_length": 1060.0, "completions/mean_length": 270.9375, "completions/mean_terminated_length": 270.9375, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 0.1866977829638273, "frac_reward_zero_std": 0.0, "grad_norm": 1.7109375, "kl": 0.02463018544949591, "learning_rate": 7.2964e-06, "loss": 0.1919, "num_tokens": 81365325.0, "reward": 3.0176355838775635, "reward_std": 0.3355969190597534, "rewards/reward_fn/mean": 3.0176355838775635, "rewards/reward_fn/std": 0.3355969786643982, "step": 1760 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 417.0, "completions/max_terminated_length": 417.0, "completions/mean_length": 180.40625, "completions/mean_terminated_length": 180.40625, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 0.18680386124960222, "frac_reward_zero_std": 0.0, "grad_norm": 1.96875, "kl": 0.020807773573324084, "learning_rate": 7.296e-06, "loss": -0.0111, "num_tokens": 81410778.0, "reward": 3.9669175148010254, "reward_std": 0.18714292347431183, "rewards/reward_fn/mean": 3.9669175148010254, "rewards/reward_fn/std": 0.18714292347431183, "step": 1761 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 710.0, "completions/max_terminated_length": 710.0, "completions/mean_length": 344.90625, "completions/mean_terminated_length": 344.90625, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 0.1869099395353771, "frac_reward_zero_std": 0.0, "grad_norm": 1.7109375, "kl": 0.024345112266018987, "learning_rate": 7.2956e-06, "loss": 0.0946, "num_tokens": 81475991.0, "reward": 3.33730149269104, "reward_std": 0.9492329955101013, "rewards/reward_fn/mean": 3.33730149269104, "rewards/reward_fn/std": 0.9492329955101013, "step": 1762 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 711.0, "completions/max_terminated_length": 711.0, "completions/mean_length": 295.0, "completions/mean_terminated_length": 295.0, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "epoch": 0.18701601782115201, "frac_reward_zero_std": 1.0, "grad_norm": 0.13671875, "kl": 0.023923526634462178, "learning_rate": 7.2952e-06, "loss": 0.001, "num_tokens": 81534167.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1763 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 214.0, "completions/max_terminated_length": 214.0, "completions/mean_length": 159.0, "completions/mean_terminated_length": 159.0, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.1871220961069269, "frac_reward_zero_std": 1.0, "grad_norm": 0.1376953125, "kl": 0.03396447142586112, "learning_rate": 7.2948e-06, "loss": 0.0014, "num_tokens": 81580951.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1764 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1609.0, "completions/max_terminated_length": 1609.0, "completions/mean_length": 445.4375, "completions/mean_terminated_length": 445.4375, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 0.1872281743927018, "frac_reward_zero_std": 0.0, "grad_norm": 1.4921875, "kl": 0.021805549738928676, "learning_rate": 7.2944e-06, "loss": 0.0994, "num_tokens": 81633669.0, "reward": 2.8578386306762695, "reward_std": 0.36610347032546997, "rewards/reward_fn/mean": 2.8578386306762695, "rewards/reward_fn/std": 0.36610347032546997, "step": 1765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 287.0, "completions/max_terminated_length": 287.0, "completions/mean_length": 206.96875, "completions/mean_terminated_length": 206.96875, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 0.18733425267847673, "frac_reward_zero_std": 0.0, "grad_norm": 1.9296875, "kl": 0.0180403123376891, "learning_rate": 7.293999999999999e-06, "loss": 0.0031, "num_tokens": 81672036.0, "reward": 2.8211755752563477, "reward_std": 0.21864524483680725, "rewards/reward_fn/mean": 2.8211755752563477, "rewards/reward_fn/std": 0.21864525973796844, "step": 1766 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1365.0, "completions/max_terminated_length": 1365.0, "completions/mean_length": 330.3125, "completions/mean_terminated_length": 330.3125, "completions/min_length": 237.0, "completions/min_terminated_length": 237.0, "epoch": 0.1874403309642516, "frac_reward_zero_std": 0.0, "grad_norm": 1.671875, "kl": 0.019215510925278068, "learning_rate": 7.293599999999999e-06, "loss": 0.0027, "num_tokens": 81722926.0, "reward": 3.3021492958068848, "reward_std": 0.44588205218315125, "rewards/reward_fn/mean": 3.3021492958068848, "rewards/reward_fn/std": 0.44588202238082886, "step": 1767 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1151.0, "completions/max_terminated_length": 1151.0, "completions/mean_length": 301.84375, "completions/mean_terminated_length": 301.84375, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.18754640925002652, "frac_reward_zero_std": 0.0, "grad_norm": 1.84375, "kl": 0.02523637586273253, "learning_rate": 7.293199999999999e-06, "loss": -0.0158, "num_tokens": 81764425.0, "reward": 2.5516433715820312, "reward_std": 0.436400830745697, "rewards/reward_fn/mean": 2.5516433715820312, "rewards/reward_fn/std": 0.43640080094337463, "step": 1768 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 525.0, "completions/max_terminated_length": 525.0, "completions/mean_length": 237.59375, "completions/mean_terminated_length": 237.59375, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 0.1876524875358014, "frac_reward_zero_std": 1.0, "grad_norm": 0.12353515625, "kl": 0.032501579727977514, "learning_rate": 7.292799999999999e-06, "loss": 0.0013, "num_tokens": 81807548.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1769 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1532.0, "completions/mean_length": 513.40625, "completions/mean_terminated_length": 463.9031982421875, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "epoch": 0.18775856582157632, "frac_reward_zero_std": 0.0, "grad_norm": 1.953125, "kl": 0.026550387497991323, "learning_rate": 7.2924e-06, "loss": 0.1681, "num_tokens": 81863881.0, "reward": 2.7122297286987305, "reward_std": 0.6148126721382141, "rewards/reward_fn/mean": 2.7122297286987305, "rewards/reward_fn/std": 0.6148126721382141, "step": 1770 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1070.0, "completions/max_terminated_length": 1070.0, "completions/mean_length": 288.84375, "completions/mean_terminated_length": 288.84375, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.18786464410735124, "frac_reward_zero_std": 1.0, "grad_norm": 0.1015625, "kl": 0.026848802575841546, "learning_rate": 7.292e-06, "loss": 0.0011, "num_tokens": 81909060.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1771 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 564.0, "completions/max_terminated_length": 564.0, "completions/mean_length": 228.34375, "completions/mean_terminated_length": 228.34375, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 0.18797072239312612, "frac_reward_zero_std": 0.0, "grad_norm": 2.25, "kl": 0.033349854638800025, "learning_rate": 7.2916e-06, "loss": 0.0913, "num_tokens": 81959279.0, "reward": 3.149571180343628, "reward_std": 0.5094886422157288, "rewards/reward_fn/mean": 3.149571180343628, "rewards/reward_fn/std": 0.509488582611084, "step": 1772 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 743.0, "completions/max_terminated_length": 743.0, "completions/mean_length": 191.5, "completions/mean_terminated_length": 191.5, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "epoch": 0.18807680067890103, "frac_reward_zero_std": 1.0, "grad_norm": 0.11328125, "kl": 0.02406395087018609, "learning_rate": 7.2912e-06, "loss": 0.001, "num_tokens": 81996223.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1773 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1446.0, "completions/mean_length": 579.75, "completions/mean_terminated_length": 481.86669921875, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "epoch": 0.18818287896467592, "frac_reward_zero_std": 0.0, "grad_norm": 1.8046875, "kl": 0.03457389399409294, "learning_rate": 7.2908e-06, "loss": 0.274, "num_tokens": 82045591.0, "reward": 2.5302791595458984, "reward_std": 0.7885620594024658, "rewards/reward_fn/mean": 2.5302791595458984, "rewards/reward_fn/std": 0.7885620594024658, "step": 1774 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1887.0, "completions/max_terminated_length": 1887.0, "completions/mean_length": 469.25, "completions/mean_terminated_length": 469.25, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "epoch": 0.18828895725045083, "frac_reward_zero_std": 0.0, "grad_norm": 1.5234375, "kl": 0.029545899014919996, "learning_rate": 7.2904e-06, "loss": 0.1866, "num_tokens": 82123359.0, "reward": 3.565403699874878, "reward_std": 0.9191962480545044, "rewards/reward_fn/mean": 3.565403699874878, "rewards/reward_fn/std": 0.9191962480545044, "step": 1775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 706.0, "completions/max_terminated_length": 706.0, "completions/mean_length": 164.96875, "completions/mean_terminated_length": 164.96875, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.18839503553622575, "frac_reward_zero_std": 0.0, "grad_norm": 2.5625, "kl": 0.02256469742860645, "learning_rate": 7.29e-06, "loss": 0.0788, "num_tokens": 82165246.0, "reward": 3.9026288986206055, "reward_std": 0.3076043426990509, "rewards/reward_fn/mean": 3.9026288986206055, "rewards/reward_fn/std": 0.3076043725013733, "step": 1776 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 790.0, "completions/max_terminated_length": 790.0, "completions/mean_length": 214.96875, "completions/mean_terminated_length": 214.96875, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 0.18850111382200063, "frac_reward_zero_std": 1.0, "grad_norm": 0.11767578125, "kl": 0.02400966896675527, "learning_rate": 7.2896e-06, "loss": 0.001, "num_tokens": 82206333.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1777 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 774.0, "completions/max_terminated_length": 774.0, "completions/mean_length": 290.875, "completions/mean_terminated_length": 290.875, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "epoch": 0.18860719210777555, "frac_reward_zero_std": 1.0, "grad_norm": 0.146484375, "kl": 0.021961679798550904, "learning_rate": 7.2892e-06, "loss": 0.0009, "num_tokens": 82255705.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1778 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 878.0, "completions/max_terminated_length": 878.0, "completions/mean_length": 321.21875, "completions/mean_terminated_length": 321.21875, "completions/min_length": 218.0, "completions/min_terminated_length": 218.0, "epoch": 0.18871327039355043, "frac_reward_zero_std": 1.0, "grad_norm": 0.09033203125, "kl": 0.02163318544626236, "learning_rate": 7.2887999999999996e-06, "loss": 0.0009, "num_tokens": 82303200.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1779 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1254.0, "completions/max_terminated_length": 1254.0, "completions/mean_length": 205.6875, "completions/mean_terminated_length": 205.6875, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "epoch": 0.18881934867932534, "frac_reward_zero_std": 1.0, "grad_norm": 0.09375, "kl": 0.020103381713852286, "learning_rate": 7.2883999999999995e-06, "loss": 0.0008, "num_tokens": 82342710.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1780 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 447.0, "completions/max_terminated_length": 447.0, "completions/mean_length": 294.5625, "completions/mean_terminated_length": 294.5625, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.18892542696510026, "frac_reward_zero_std": 0.0, "grad_norm": 1.4453125, "kl": 0.023567187832668424, "learning_rate": 7.2879999999999995e-06, "loss": 0.0174, "num_tokens": 82395080.0, "reward": 3.1176083087921143, "reward_std": 0.5186969041824341, "rewards/reward_fn/mean": 3.1176083087921143, "rewards/reward_fn/std": 0.5186969041824341, "step": 1781 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 973.0, "completions/max_terminated_length": 973.0, "completions/mean_length": 316.5625, "completions/mean_terminated_length": 316.5625, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 0.18903150525087514, "frac_reward_zero_std": 0.0, "grad_norm": 1.71875, "kl": 0.024137710686773062, "learning_rate": 7.2876e-06, "loss": -0.0694, "num_tokens": 82450394.0, "reward": 2.7653896808624268, "reward_std": 0.0928201898932457, "rewards/reward_fn/mean": 2.7653896808624268, "rewards/reward_fn/std": 0.09282021969556808, "step": 1782 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1543.0, "completions/max_terminated_length": 1543.0, "completions/mean_length": 410.03125, "completions/mean_terminated_length": 410.03125, "completions/min_length": 245.0, "completions/min_terminated_length": 245.0, "epoch": 0.18913758353665006, "frac_reward_zero_std": 0.0, "grad_norm": 1.578125, "kl": 0.037990636890754104, "learning_rate": 7.2872e-06, "loss": 0.023, "num_tokens": 82499099.0, "reward": 2.971921443939209, "reward_std": 0.3378731310367584, "rewards/reward_fn/mean": 2.971921443939209, "rewards/reward_fn/std": 0.3378731608390808, "step": 1783 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 303.0, "completions/max_terminated_length": 303.0, "completions/mean_length": 211.96875, "completions/mean_terminated_length": 211.96875, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 0.18924366182242494, "frac_reward_zero_std": 0.0, "grad_norm": 1.9765625, "kl": 0.025865831412374973, "learning_rate": 7.2868e-06, "loss": -0.0036, "num_tokens": 82542042.0, "reward": 3.0052905082702637, "reward_std": 0.05251338332891464, "rewards/reward_fn/mean": 3.0052905082702637, "rewards/reward_fn/std": 0.05251337215304375, "step": 1784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1905.0, "completions/max_terminated_length": 1905.0, "completions/mean_length": 417.40625, "completions/mean_terminated_length": 417.40625, "completions/min_length": 261.0, "completions/min_terminated_length": 261.0, "epoch": 0.18934974010819985, "frac_reward_zero_std": 0.0, "grad_norm": 1.3203125, "kl": 0.021670245798304677, "learning_rate": 7.2864e-06, "loss": 0.141, "num_tokens": 82600455.0, "reward": 2.866835594177246, "reward_std": 0.07305392622947693, "rewards/reward_fn/mean": 2.866835594177246, "rewards/reward_fn/std": 0.07305389642715454, "step": 1785 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 215.0, "completions/max_terminated_length": 215.0, "completions/mean_length": 151.4375, "completions/mean_terminated_length": 151.4375, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.18945581839397474, "frac_reward_zero_std": 1.0, "grad_norm": 0.087890625, "kl": 0.020182526553981006, "learning_rate": 7.285999999999999e-06, "loss": 0.0008, "num_tokens": 82641301.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1786 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2020.0, "completions/mean_length": 857.53125, "completions/mean_terminated_length": 734.3793334960938, "completions/min_length": 282.0, "completions/min_terminated_length": 282.0, "epoch": 0.18956189667974965, "frac_reward_zero_std": 0.0, "grad_norm": 1.1640625, "kl": 0.018903368851169944, "learning_rate": 7.285599999999999e-06, "loss": 0.1813, "num_tokens": 82711942.0, "reward": 2.3174896240234375, "reward_std": 0.9160798192024231, "rewards/reward_fn/mean": 2.3174896240234375, "rewards/reward_fn/std": 0.9160798192024231, "step": 1787 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1586.0, "completions/mean_length": 443.40625, "completions/mean_terminated_length": 391.6451416015625, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "epoch": 0.18966797496552457, "frac_reward_zero_std": 0.0, "grad_norm": 1.4296875, "kl": 0.029990455135703087, "learning_rate": 7.285199999999999e-06, "loss": 0.2298, "num_tokens": 82789395.0, "reward": 2.748314380645752, "reward_std": 0.512526273727417, "rewards/reward_fn/mean": 2.748314380645752, "rewards/reward_fn/std": 0.512526273727417, "step": 1788 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 859.0, "completions/max_terminated_length": 859.0, "completions/mean_length": 315.90625, "completions/mean_terminated_length": 315.90625, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.18977405325129945, "frac_reward_zero_std": 0.0, "grad_norm": 1.40625, "kl": 0.01661941665224731, "learning_rate": 7.284799999999999e-06, "loss": -0.0135, "num_tokens": 82847696.0, "reward": 3.9684324264526367, "reward_std": 0.1785728633403778, "rewards/reward_fn/mean": 3.9684324264526367, "rewards/reward_fn/std": 0.1785728633403778, "step": 1789 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 523.0, "completions/max_terminated_length": 523.0, "completions/mean_length": 163.0625, "completions/mean_terminated_length": 163.0625, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.18988013153707436, "frac_reward_zero_std": 1.0, "grad_norm": 0.076171875, "kl": 0.016163927502930164, "learning_rate": 7.284399999999999e-06, "loss": 0.0006, "num_tokens": 82904786.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1790 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 601.0, "completions/max_terminated_length": 601.0, "completions/mean_length": 196.90625, "completions/mean_terminated_length": 196.90625, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.18998620982284925, "frac_reward_zero_std": 0.0, "grad_norm": 1.625, "kl": 0.021043762797489762, "learning_rate": 7.283999999999999e-06, "loss": 0.0344, "num_tokens": 82945903.0, "reward": 2.891822338104248, "reward_std": 0.2067325860261917, "rewards/reward_fn/mean": 2.891822338104248, "rewards/reward_fn/std": 0.20673255622386932, "step": 1791 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 726.0, "completions/max_terminated_length": 726.0, "completions/mean_length": 138.78125, "completions/mean_terminated_length": 138.78125, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "epoch": 0.19009228810862416, "frac_reward_zero_std": 0.0, "grad_norm": 2.53125, "kl": 0.042682474479079247, "learning_rate": 7.283599999999999e-06, "loss": 0.0077, "num_tokens": 82982600.0, "reward": 3.008596181869507, "reward_std": 0.0607355572283268, "rewards/reward_fn/mean": 3.008596181869507, "rewards/reward_fn/std": 0.060735564678907394, "step": 1792 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1132.0, "completions/max_terminated_length": 1132.0, "completions/mean_length": 429.90625, "completions/mean_terminated_length": 429.90625, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 0.19019836639439908, "frac_reward_zero_std": 0.0, "grad_norm": 1.5703125, "kl": 0.021921277744695544, "learning_rate": 7.283199999999999e-06, "loss": 0.1219, "num_tokens": 83020037.0, "reward": 2.7947468757629395, "reward_std": 0.028822239488363266, "rewards/reward_fn/mean": 2.7947468757629395, "rewards/reward_fn/std": 0.02882222831249237, "step": 1793 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1693.0, "completions/max_terminated_length": 1693.0, "completions/mean_length": 565.5, "completions/mean_terminated_length": 565.5, "completions/min_length": 260.0, "completions/min_terminated_length": 260.0, "epoch": 0.19030444468017396, "frac_reward_zero_std": 0.0, "grad_norm": 1.390625, "kl": 0.021824662806466222, "learning_rate": 7.2828e-06, "loss": 0.208, "num_tokens": 83060469.0, "reward": 3.3677079677581787, "reward_std": 0.8257449865341187, "rewards/reward_fn/mean": 3.3677079677581787, "rewards/reward_fn/std": 0.8257449269294739, "step": 1794 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 895.0, "completions/max_terminated_length": 895.0, "completions/mean_length": 269.1875, "completions/mean_terminated_length": 269.1875, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 0.19041052296594888, "frac_reward_zero_std": 0.0, "grad_norm": 1.3515625, "kl": 0.02564499992877245, "learning_rate": 7.2824e-06, "loss": -0.0528, "num_tokens": 83100443.0, "reward": 3.9307758808135986, "reward_std": 0.39159107208251953, "rewards/reward_fn/mean": 3.9307758808135986, "rewards/reward_fn/std": 0.39159104228019714, "step": 1795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1098.0, "completions/max_terminated_length": 1098.0, "completions/mean_length": 286.53125, "completions/mean_terminated_length": 286.53125, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 0.19051660125172376, "frac_reward_zero_std": 0.0, "grad_norm": 1.53125, "kl": 0.038170166313648224, "learning_rate": 7.282e-06, "loss": 0.0912, "num_tokens": 83139020.0, "reward": 3.124636650085449, "reward_std": 0.257290244102478, "rewards/reward_fn/mean": 3.124636650085449, "rewards/reward_fn/std": 0.25729018449783325, "step": 1796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1346.0, "completions/max_terminated_length": 1346.0, "completions/mean_length": 335.84375, "completions/mean_terminated_length": 335.84375, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "epoch": 0.19062267953749867, "frac_reward_zero_std": 1.0, "grad_norm": 0.10009765625, "kl": 0.026702363276854157, "learning_rate": 7.2816e-06, "loss": 0.0011, "num_tokens": 83180263.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1797 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 240.0, "completions/max_terminated_length": 240.0, "completions/mean_length": 174.8125, "completions/mean_terminated_length": 174.8125, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 0.1907287578232736, "frac_reward_zero_std": 0.0, "grad_norm": 1.9140625, "kl": 0.021874910918995738, "learning_rate": 7.2812e-06, "loss": 0.0139, "num_tokens": 83227905.0, "reward": 3.0870790481567383, "reward_std": 0.4484405815601349, "rewards/reward_fn/mean": 3.0870790481567383, "rewards/reward_fn/std": 0.4484405815601349, "step": 1798 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 757.0, "completions/max_terminated_length": 757.0, "completions/mean_length": 280.53125, "completions/mean_terminated_length": 280.53125, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 0.19083483610904847, "frac_reward_zero_std": 0.0, "grad_norm": 1.8203125, "kl": 0.022913979832082987, "learning_rate": 7.2808e-06, "loss": 0.0525, "num_tokens": 83266994.0, "reward": 3.892354965209961, "reward_std": 0.444100558757782, "rewards/reward_fn/mean": 3.892354965209961, "rewards/reward_fn/std": 0.4441005289554596, "step": 1799 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 929.0, "completions/max_terminated_length": 929.0, "completions/mean_length": 296.0625, "completions/mean_terminated_length": 296.0625, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 0.19094091439482339, "frac_reward_zero_std": 0.0, "grad_norm": 1.6953125, "kl": 0.021208511432632804, "learning_rate": 7.2804e-06, "loss": -0.0788, "num_tokens": 83294420.0, "reward": 3.3324835300445557, "reward_std": 0.3951815366744995, "rewards/reward_fn/mean": 3.3324835300445557, "rewards/reward_fn/std": 0.3951815366744995, "step": 1800 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 498.0, "completions/max_terminated_length": 498.0, "completions/mean_length": 343.90625, "completions/mean_terminated_length": 343.90625, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "epoch": 0.19104699268059827, "frac_reward_zero_std": 0.0, "grad_norm": 1.21875, "kl": 0.020625728298909962, "learning_rate": 7.28e-06, "loss": -0.0446, "num_tokens": 83343409.0, "reward": 3.7953171730041504, "reward_std": 0.5966523885726929, "rewards/reward_fn/mean": 3.7953171730041504, "rewards/reward_fn/std": 0.5966523289680481, "step": 1801 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 194.0, "completions/max_terminated_length": 194.0, "completions/mean_length": 147.21875, "completions/mean_terminated_length": 147.21875, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.19115307096637318, "frac_reward_zero_std": 1.0, "grad_norm": 0.08984375, "kl": 0.01696223858743906, "learning_rate": 7.2796e-06, "loss": 0.0007, "num_tokens": 83383480.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1802 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 834.0, "completions/max_terminated_length": 834.0, "completions/mean_length": 211.21875, "completions/mean_terminated_length": 211.21875, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 0.1912591492521481, "frac_reward_zero_std": 1.0, "grad_norm": 0.103515625, "kl": 0.02501624054275453, "learning_rate": 7.2792e-06, "loss": 0.001, "num_tokens": 83426655.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1803 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 858.0, "completions/max_terminated_length": 858.0, "completions/mean_length": 241.34375, "completions/mean_terminated_length": 241.34375, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 0.19136522753792298, "frac_reward_zero_std": 1.0, "grad_norm": 0.1171875, "kl": 0.024438521591946483, "learning_rate": 7.2788e-06, "loss": 0.001, "num_tokens": 83454922.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1804 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 355.0, "completions/max_terminated_length": 355.0, "completions/mean_length": 107.0625, "completions/mean_terminated_length": 107.0625, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "epoch": 0.1914713058236979, "frac_reward_zero_std": 1.0, "grad_norm": 0.1240234375, "kl": 0.01948395639192313, "learning_rate": 7.2784000000000005e-06, "loss": 0.0008, "num_tokens": 83491020.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1266.0, "completions/max_terminated_length": 1266.0, "completions/mean_length": 381.8125, "completions/mean_terminated_length": 381.8125, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "epoch": 0.19157738410947278, "frac_reward_zero_std": 0.0, "grad_norm": 0.96875, "kl": 0.025701815960928798, "learning_rate": 7.278e-06, "loss": -0.0646, "num_tokens": 83524998.0, "reward": 3.9303698539733887, "reward_std": 0.2739916145801544, "rewards/reward_fn/mean": 3.9303698539733887, "rewards/reward_fn/std": 0.2739916443824768, "step": 1806 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 860.0, "completions/max_terminated_length": 860.0, "completions/mean_length": 215.03125, "completions/mean_terminated_length": 215.03125, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.1916834623952477, "frac_reward_zero_std": 1.0, "grad_norm": 0.08642578125, "kl": 0.0199962422484532, "learning_rate": 7.2775999999999996e-06, "loss": 0.0008, "num_tokens": 83564807.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1807 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 394.0, "completions/max_terminated_length": 394.0, "completions/mean_length": 207.875, "completions/mean_terminated_length": 207.875, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 0.1917895406810226, "frac_reward_zero_std": 0.0, "grad_norm": 2.109375, "kl": 0.02291672769933939, "learning_rate": 7.2771999999999995e-06, "loss": -0.0238, "num_tokens": 83606563.0, "reward": 3.8847744464874268, "reward_std": 0.31120461225509644, "rewards/reward_fn/mean": 3.8847744464874268, "rewards/reward_fn/std": 0.31120461225509644, "step": 1808 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 95.0, "completions/max_terminated_length": 95.0, "completions/mean_length": 68.5, "completions/mean_terminated_length": 68.5, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.1918956189667975, "frac_reward_zero_std": 1.0, "grad_norm": 0.193359375, "kl": 0.029097398975864053, "learning_rate": 7.2767999999999995e-06, "loss": 0.0012, "num_tokens": 83639059.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1809 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 353.0, "completions/max_terminated_length": 353.0, "completions/mean_length": 261.84375, "completions/mean_terminated_length": 261.84375, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "epoch": 0.1920016972525724, "frac_reward_zero_std": 0.0, "grad_norm": 1.6484375, "kl": 0.02024500584229827, "learning_rate": 7.2763999999999995e-06, "loss": 0.0566, "num_tokens": 83690190.0, "reward": 3.8510398864746094, "reward_std": 0.40058434009552, "rewards/reward_fn/mean": 3.8510398864746094, "rewards/reward_fn/std": 0.4005843698978424, "step": 1810 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 936.0, "completions/max_terminated_length": 936.0, "completions/mean_length": 261.5, "completions/mean_terminated_length": 261.5, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 0.1921077755383473, "frac_reward_zero_std": 0.0, "grad_norm": 1.9296875, "kl": 0.02715472411364317, "learning_rate": 7.2759999999999995e-06, "loss": -0.0709, "num_tokens": 83734110.0, "reward": 3.7524542808532715, "reward_std": 0.5632277131080627, "rewards/reward_fn/mean": 3.7524542808532715, "rewards/reward_fn/std": 0.5632277131080627, "step": 1811 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 720.0, "completions/max_terminated_length": 720.0, "completions/mean_length": 153.375, "completions/mean_terminated_length": 153.375, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "epoch": 0.1922138538241222, "frac_reward_zero_std": 1.0, "grad_norm": 0.0693359375, "kl": 0.014898709952831268, "learning_rate": 7.2755999999999994e-06, "loss": 0.0006, "num_tokens": 83774794.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1812 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 2048.0, "completions/max_terminated_length": 606.0, "completions/mean_length": 536.3125, "completions/mean_terminated_length": 435.5333557128906, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "epoch": 0.1923199321098971, "frac_reward_zero_std": 0.0, "grad_norm": 1.4921875, "kl": 0.02957181097008288, "learning_rate": 7.275199999999999e-06, "loss": 0.2706, "num_tokens": 83851540.0, "reward": 3.250636100769043, "reward_std": 1.182026982307434, "rewards/reward_fn/mean": 3.250636100769043, "rewards/reward_fn/std": 1.182026982307434, "step": 1813 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1330.0, "completions/max_terminated_length": 1330.0, "completions/mean_length": 369.59375, "completions/mean_terminated_length": 369.59375, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 0.192426010395672, "frac_reward_zero_std": 0.0, "grad_norm": 1.4140625, "kl": 0.02410293696448207, "learning_rate": 7.274799999999999e-06, "loss": -0.0188, "num_tokens": 83894567.0, "reward": 3.713106870651245, "reward_std": 0.7775752544403076, "rewards/reward_fn/mean": 3.713106870651245, "rewards/reward_fn/std": 0.7775752544403076, "step": 1814 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1097.0, "completions/max_terminated_length": 1097.0, "completions/mean_length": 406.15625, "completions/mean_terminated_length": 406.15625, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "epoch": 0.19253208868144692, "frac_reward_zero_std": 0.0, "grad_norm": 1.8359375, "kl": 0.02121006278321147, "learning_rate": 7.274399999999999e-06, "loss": 0.1567, "num_tokens": 83928780.0, "reward": 2.624467611312866, "reward_std": 0.3308447301387787, "rewards/reward_fn/mean": 2.624467611312866, "rewards/reward_fn/std": 0.3308447301387787, "step": 1815 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 889.0, "completions/max_terminated_length": 889.0, "completions/mean_length": 209.875, "completions/mean_terminated_length": 209.875, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.1926381669672218, "frac_reward_zero_std": 0.0, "grad_norm": 2.25, "kl": 0.02947010798379779, "learning_rate": 7.273999999999999e-06, "loss": 0.0666, "num_tokens": 83981352.0, "reward": 3.722154140472412, "reward_std": 0.3915632367134094, "rewards/reward_fn/mean": 3.722154140472412, "rewards/reward_fn/std": 0.3915632665157318, "step": 1816 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 865.0, "completions/max_terminated_length": 865.0, "completions/mean_length": 360.75, "completions/mean_terminated_length": 360.75, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "epoch": 0.19274424525299672, "frac_reward_zero_std": 0.0, "grad_norm": 1.5625, "kl": 0.022145352559164166, "learning_rate": 7.2736e-06, "loss": 0.0137, "num_tokens": 84012608.0, "reward": 3.703070640563965, "reward_std": 0.5241351127624512, "rewards/reward_fn/mean": 3.703070640563965, "rewards/reward_fn/std": 0.5241351127624512, "step": 1817 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1276.0, "completions/max_terminated_length": 1276.0, "completions/mean_length": 393.71875, "completions/mean_terminated_length": 393.71875, "completions/min_length": 239.0, "completions/min_terminated_length": 239.0, "epoch": 0.1928503235387716, "frac_reward_zero_std": 0.0, "grad_norm": 1.515625, "kl": 0.0181597942719236, "learning_rate": 7.2732e-06, "loss": -0.049, "num_tokens": 84062103.0, "reward": 3.3613734245300293, "reward_std": 0.6499727368354797, "rewards/reward_fn/mean": 3.3613734245300293, "rewards/reward_fn/std": 0.6499727964401245, "step": 1818 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 928.0, "completions/max_terminated_length": 928.0, "completions/mean_length": 246.6875, "completions/mean_terminated_length": 246.6875, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 0.19295640182454651, "frac_reward_zero_std": 1.0, "grad_norm": 0.1171875, "kl": 0.027381795225664973, "learning_rate": 7.2728e-06, "loss": 0.0011, "num_tokens": 84111693.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1819 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 941.0, "completions/max_terminated_length": 941.0, "completions/mean_length": 305.625, "completions/mean_terminated_length": 305.625, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "epoch": 0.19306248011032143, "frac_reward_zero_std": 1.0, "grad_norm": 0.0712890625, "kl": 0.021608802489936352, "learning_rate": 7.2724e-06, "loss": 0.0009, "num_tokens": 84154113.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1820 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 925.0, "completions/mean_length": 409.59375, "completions/mean_terminated_length": 356.7419128417969, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "epoch": 0.1931685583960963, "frac_reward_zero_std": 0.0, "grad_norm": 1.828125, "kl": 0.025535868713632226, "learning_rate": 7.272e-06, "loss": 0.294, "num_tokens": 84195860.0, "reward": 2.834456443786621, "reward_std": 0.5542919635772705, "rewards/reward_fn/mean": 2.834456443786621, "rewards/reward_fn/std": 0.5542919635772705, "step": 1821 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1094.0, "completions/max_terminated_length": 1094.0, "completions/mean_length": 208.875, "completions/mean_terminated_length": 208.875, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 0.19327463668187123, "frac_reward_zero_std": 0.0, "grad_norm": 2.46875, "kl": 0.015965044614858925, "learning_rate": 7.2716e-06, "loss": 0.0429, "num_tokens": 84235664.0, "reward": 3.87819242477417, "reward_std": 0.4338008165359497, "rewards/reward_fn/mean": 3.87819242477417, "rewards/reward_fn/std": 0.4338007867336273, "step": 1822 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 672.0, "completions/max_terminated_length": 672.0, "completions/mean_length": 215.90625, "completions/mean_terminated_length": 215.90625, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.1933807149676461, "frac_reward_zero_std": 1.0, "grad_norm": 0.07666015625, "kl": 0.020084471092559397, "learning_rate": 7.2712e-06, "loss": 0.0008, "num_tokens": 84294797.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1823 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1409.0, "completions/max_terminated_length": 1409.0, "completions/mean_length": 350.53125, "completions/mean_terminated_length": 350.53125, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "epoch": 0.19348679325342102, "frac_reward_zero_std": 0.0, "grad_norm": 1.5703125, "kl": 0.022387655219063163, "learning_rate": 7.2708e-06, "loss": 0.1434, "num_tokens": 84348126.0, "reward": 3.8110899925231934, "reward_std": 0.44719943404197693, "rewards/reward_fn/mean": 3.8110899925231934, "rewards/reward_fn/std": 0.44719937443733215, "step": 1824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1687.0, "completions/mean_length": 395.15625, "completions/mean_terminated_length": 341.8387145996094, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 0.19359287153919594, "frac_reward_zero_std": 0.0, "grad_norm": 2.03125, "kl": 0.02443688898347318, "learning_rate": 7.2704e-06, "loss": 0.3221, "num_tokens": 84409091.0, "reward": 2.8401429653167725, "reward_std": 0.5243302583694458, "rewards/reward_fn/mean": 2.8401429653167725, "rewards/reward_fn/std": 0.5243302583694458, "step": 1825 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1016.0, "completions/max_terminated_length": 1016.0, "completions/mean_length": 218.15625, "completions/mean_terminated_length": 218.15625, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.19369894982497082, "frac_reward_zero_std": 0.0, "grad_norm": 1.8984375, "kl": 0.024313464295119047, "learning_rate": 7.269999999999999e-06, "loss": -0.1331, "num_tokens": 84437768.0, "reward": 3.7873826026916504, "reward_std": 0.6716558337211609, "rewards/reward_fn/mean": 3.7873826026916504, "rewards/reward_fn/std": 0.6716558337211609, "step": 1826 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1220.0, "completions/max_terminated_length": 1220.0, "completions/mean_length": 493.40625, "completions/mean_terminated_length": 493.40625, "completions/min_length": 270.0, "completions/min_terminated_length": 270.0, "epoch": 0.19380502811074574, "frac_reward_zero_std": 0.0, "grad_norm": 1.25, "kl": 0.020324668614193797, "learning_rate": 7.269599999999999e-06, "loss": -0.0033, "num_tokens": 84492021.0, "reward": 3.0298476219177246, "reward_std": 0.3211779296398163, "rewards/reward_fn/mean": 3.0298476219177246, "rewards/reward_fn/std": 0.3211778700351715, "step": 1827 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1439.0, "completions/mean_length": 569.59375, "completions/mean_terminated_length": 471.0333557128906, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 0.19391110639652062, "frac_reward_zero_std": 0.0, "grad_norm": 1.2109375, "kl": 0.022751206997781992, "learning_rate": 7.269199999999999e-06, "loss": 0.3061, "num_tokens": 84543784.0, "reward": 3.6765613555908203, "reward_std": 1.0499624013900757, "rewards/reward_fn/mean": 3.6765613555908203, "rewards/reward_fn/std": 1.0499624013900757, "step": 1828 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 447.0, "completions/max_terminated_length": 447.0, "completions/mean_length": 322.9375, "completions/mean_terminated_length": 322.9375, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "epoch": 0.19401718468229553, "frac_reward_zero_std": 1.0, "grad_norm": 0.083984375, "kl": 0.028194759273901582, "learning_rate": 7.2688e-06, "loss": 0.0011, "num_tokens": 84589094.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1829 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 222.0, "completions/max_terminated_length": 222.0, "completions/mean_length": 136.96875, "completions/mean_terminated_length": 136.96875, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 0.19412326296807045, "frac_reward_zero_std": 1.0, "grad_norm": 0.09228515625, "kl": 0.022485103458166122, "learning_rate": 7.2684e-06, "loss": 0.0009, "num_tokens": 84616997.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1830 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 680.0, "completions/max_terminated_length": 680.0, "completions/mean_length": 452.59375, "completions/mean_terminated_length": 452.59375, "completions/min_length": 240.0, "completions/min_terminated_length": 240.0, "epoch": 0.19422934125384533, "frac_reward_zero_std": 0.0, "grad_norm": 1.375, "kl": 0.024805004009976983, "learning_rate": 7.268e-06, "loss": 0.0591, "num_tokens": 84664056.0, "reward": 2.991624355316162, "reward_std": 0.3909732401371002, "rewards/reward_fn/mean": 2.991624355316162, "rewards/reward_fn/std": 0.3909732699394226, "step": 1831 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 398.0, "completions/max_terminated_length": 398.0, "completions/mean_length": 270.65625, "completions/mean_terminated_length": 270.65625, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "epoch": 0.19433541953962025, "frac_reward_zero_std": 0.0, "grad_norm": 1.65625, "kl": 0.030655317706987262, "learning_rate": 7.2676e-06, "loss": 0.0186, "num_tokens": 84705997.0, "reward": 3.4060113430023193, "reward_std": 0.6430683135986328, "rewards/reward_fn/mean": 3.4060113430023193, "rewards/reward_fn/std": 0.643068253993988, "step": 1832 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 382.0, "completions/max_terminated_length": 382.0, "completions/mean_length": 215.375, "completions/mean_terminated_length": 215.375, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 0.19444149782539513, "frac_reward_zero_std": 0.0, "grad_norm": 1.9609375, "kl": 0.027888069627806544, "learning_rate": 7.2672e-06, "loss": -0.0243, "num_tokens": 84754777.0, "reward": 3.6079330444335938, "reward_std": 0.5912787914276123, "rewards/reward_fn/mean": 3.6079330444335938, "rewards/reward_fn/std": 0.5912788510322571, "step": 1833 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1301.0, "completions/max_terminated_length": 1301.0, "completions/mean_length": 343.625, "completions/mean_terminated_length": 343.625, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 0.19454757611117005, "frac_reward_zero_std": 0.0, "grad_norm": 1.7734375, "kl": 0.027522264514118433, "learning_rate": 7.2668e-06, "loss": 0.2485, "num_tokens": 84806573.0, "reward": 3.0881223678588867, "reward_std": 0.046560484915971756, "rewards/reward_fn/mean": 3.0881223678588867, "rewards/reward_fn/std": 0.04656045511364937, "step": 1834 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1224.0, "completions/max_terminated_length": 1224.0, "completions/mean_length": 256.75, "completions/mean_terminated_length": 256.75, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 0.19465365439694496, "frac_reward_zero_std": 1.0, "grad_norm": 0.10302734375, "kl": 0.0238818796351552, "learning_rate": 7.2664e-06, "loss": 0.001, "num_tokens": 84847717.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1835 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1020.0, "completions/max_terminated_length": 1020.0, "completions/mean_length": 346.59375, "completions/mean_terminated_length": 346.59375, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "epoch": 0.19475973268271984, "frac_reward_zero_std": 0.0, "grad_norm": 1.5078125, "kl": 0.0209172077011317, "learning_rate": 7.2659999999999996e-06, "loss": 0.2055, "num_tokens": 84896376.0, "reward": 2.8978328704833984, "reward_std": 0.07092181593179703, "rewards/reward_fn/mean": 2.8978328704833984, "rewards/reward_fn/std": 0.07092180103063583, "step": 1836 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1739.0, "completions/max_terminated_length": 1739.0, "completions/mean_length": 639.03125, "completions/mean_terminated_length": 639.03125, "completions/min_length": 328.0, "completions/min_terminated_length": 328.0, "epoch": 0.19486581096849476, "frac_reward_zero_std": 0.0, "grad_norm": 1.125, "kl": 0.02236326946876943, "learning_rate": 7.2655999999999995e-06, "loss": 0.0086, "num_tokens": 84934841.0, "reward": 2.4415290355682373, "reward_std": 0.6754915714263916, "rewards/reward_fn/mean": 2.4415290355682373, "rewards/reward_fn/std": 0.6754916310310364, "step": 1837 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 628.0, "completions/max_terminated_length": 628.0, "completions/mean_length": 352.0, "completions/mean_terminated_length": 352.0, "completions/min_length": 237.0, "completions/min_terminated_length": 237.0, "epoch": 0.19497188925426964, "frac_reward_zero_std": 0.0, "grad_norm": 1.828125, "kl": 0.022676357068121433, "learning_rate": 7.2651999999999995e-06, "loss": 0.0294, "num_tokens": 84990361.0, "reward": 3.776266574859619, "reward_std": 0.7067487239837646, "rewards/reward_fn/mean": 3.776266574859619, "rewards/reward_fn/std": 0.7067488431930542, "step": 1838 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 696.0, "completions/max_terminated_length": 696.0, "completions/mean_length": 185.21875, "completions/mean_terminated_length": 185.21875, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 0.19507796754004456, "frac_reward_zero_std": 0.0, "grad_norm": 2.0625, "kl": 0.018021252821199596, "learning_rate": 7.2647999999999995e-06, "loss": 0.0134, "num_tokens": 85050912.0, "reward": 3.879213571548462, "reward_std": 0.3250885605812073, "rewards/reward_fn/mean": 3.879213571548462, "rewards/reward_fn/std": 0.3250885307788849, "step": 1839 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1688.0, "completions/max_terminated_length": 1688.0, "completions/mean_length": 507.25, "completions/mean_terminated_length": 507.25, "completions/min_length": 273.0, "completions/min_terminated_length": 273.0, "epoch": 0.19518404582581944, "frac_reward_zero_std": 0.0, "grad_norm": 1.1171875, "kl": 0.026730315992608666, "learning_rate": 7.2644e-06, "loss": 0.1458, "num_tokens": 85098440.0, "reward": 2.9435012340545654, "reward_std": 0.4082188308238983, "rewards/reward_fn/mean": 2.9435012340545654, "rewards/reward_fn/std": 0.4082188010215759, "step": 1840 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1029.0, "completions/max_terminated_length": 1029.0, "completions/mean_length": 227.65625, "completions/mean_terminated_length": 227.65625, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.19529012411159435, "frac_reward_zero_std": 0.0, "grad_norm": 1.8984375, "kl": 0.025480588898062706, "learning_rate": 7.264e-06, "loss": 0.1826, "num_tokens": 85142173.0, "reward": 2.987659454345703, "reward_std": 0.13073208928108215, "rewards/reward_fn/mean": 2.987659454345703, "rewards/reward_fn/std": 0.13073207437992096, "step": 1841 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 761.0, "completions/max_terminated_length": 761.0, "completions/mean_length": 194.46875, "completions/mean_terminated_length": 194.46875, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.19539620239736927, "frac_reward_zero_std": 1.0, "grad_norm": 0.1181640625, "kl": 0.027264825999736786, "learning_rate": 7.2636e-06, "loss": 0.0011, "num_tokens": 85177772.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1842 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1085.0, "completions/max_terminated_length": 1085.0, "completions/mean_length": 367.625, "completions/mean_terminated_length": 367.625, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "epoch": 0.19550228068314415, "frac_reward_zero_std": 0.0, "grad_norm": 1.5234375, "kl": 0.027547943871468306, "learning_rate": 7.2632e-06, "loss": 0.0731, "num_tokens": 85233824.0, "reward": 3.5169026851654053, "reward_std": 0.6616706848144531, "rewards/reward_fn/mean": 3.5169026851654053, "rewards/reward_fn/std": 0.6616706848144531, "step": 1843 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1470.0, "completions/max_terminated_length": 1470.0, "completions/mean_length": 304.5625, "completions/mean_terminated_length": 304.5625, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.19560835896891907, "frac_reward_zero_std": 0.0, "grad_norm": 1.9375, "kl": 0.02738300757482648, "learning_rate": 7.2628e-06, "loss": 0.1332, "num_tokens": 85276146.0, "reward": 3.798513889312744, "reward_std": 0.4765705466270447, "rewards/reward_fn/mean": 3.798513889312744, "rewards/reward_fn/std": 0.47657057642936707, "step": 1844 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1129.0, "completions/max_terminated_length": 1129.0, "completions/mean_length": 320.40625, "completions/mean_terminated_length": 320.40625, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "epoch": 0.19571443725469395, "frac_reward_zero_std": 0.0, "grad_norm": 1.6953125, "kl": 0.02806469239294529, "learning_rate": 7.2624e-06, "loss": 0.0161, "num_tokens": 85323135.0, "reward": 3.9619522094726562, "reward_std": 0.21523013710975647, "rewards/reward_fn/mean": 3.9619522094726562, "rewards/reward_fn/std": 0.21523013710975647, "step": 1845 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 848.0, "completions/max_terminated_length": 848.0, "completions/mean_length": 312.78125, "completions/mean_terminated_length": 312.78125, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "epoch": 0.19582051554046886, "frac_reward_zero_std": 0.0, "grad_norm": 1.4921875, "kl": 0.02015020337421447, "learning_rate": 7.261999999999999e-06, "loss": 0.0248, "num_tokens": 85371896.0, "reward": 3.5994532108306885, "reward_std": 0.672566294670105, "rewards/reward_fn/mean": 3.5994532108306885, "rewards/reward_fn/std": 0.672566294670105, "step": 1846 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 808.0, "completions/max_terminated_length": 808.0, "completions/mean_length": 294.375, "completions/mean_terminated_length": 294.375, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.19592659382624378, "frac_reward_zero_std": 0.0, "grad_norm": 1.5625, "kl": 0.02237825153861195, "learning_rate": 7.261599999999999e-06, "loss": 0.0013, "num_tokens": 85417892.0, "reward": 3.958686351776123, "reward_std": 0.23370474576950073, "rewards/reward_fn/mean": 3.958686351776123, "rewards/reward_fn/std": 0.23370479047298431, "step": 1847 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1311.0, "completions/max_terminated_length": 1311.0, "completions/mean_length": 368.375, "completions/mean_terminated_length": 368.375, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "epoch": 0.19603267211201866, "frac_reward_zero_std": 0.0, "grad_norm": 1.4375, "kl": 0.025957881240174174, "learning_rate": 7.261199999999999e-06, "loss": -0.0124, "num_tokens": 85469008.0, "reward": 2.98433256149292, "reward_std": 0.6673187017440796, "rewards/reward_fn/mean": 2.98433256149292, "rewards/reward_fn/std": 0.6673187017440796, "step": 1848 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 872.0, "completions/max_terminated_length": 872.0, "completions/mean_length": 221.5, "completions/mean_terminated_length": 221.5, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 0.19613875039779358, "frac_reward_zero_std": 0.0, "grad_norm": 1.9765625, "kl": 0.03066243208013475, "learning_rate": 7.260799999999999e-06, "loss": 0.0189, "num_tokens": 85520032.0, "reward": 3.3312742710113525, "reward_std": 0.5270886421203613, "rewards/reward_fn/mean": 3.3312742710113525, "rewards/reward_fn/std": 0.5270887017250061, "step": 1849 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 983.0, "completions/max_terminated_length": 983.0, "completions/mean_length": 292.875, "completions/mean_terminated_length": 292.875, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "epoch": 0.19624482868356846, "frac_reward_zero_std": 0.0, "grad_norm": 2.1875, "kl": 0.02144060772843659, "learning_rate": 7.260399999999999e-06, "loss": 0.0602, "num_tokens": 85556412.0, "reward": 3.9658844470977783, "reward_std": 0.19298657774925232, "rewards/reward_fn/mean": 3.9658844470977783, "rewards/reward_fn/std": 0.19298657774925232, "step": 1850 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 564.0, "completions/max_terminated_length": 564.0, "completions/mean_length": 161.28125, "completions/mean_terminated_length": 161.28125, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "epoch": 0.19635090696934338, "frac_reward_zero_std": 0.0, "grad_norm": 1.1171875, "kl": 0.021163585828617215, "learning_rate": 7.259999999999999e-06, "loss": -0.0845, "num_tokens": 85610405.0, "reward": 2.866489887237549, "reward_std": 0.5268675684928894, "rewards/reward_fn/mean": 2.866489887237549, "rewards/reward_fn/std": 0.5268676280975342, "step": 1851 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1304.0, "completions/max_terminated_length": 1304.0, "completions/mean_length": 469.71875, "completions/mean_terminated_length": 469.71875, "completions/min_length": 233.0, "completions/min_terminated_length": 233.0, "epoch": 0.1964569852551183, "frac_reward_zero_std": 0.0, "grad_norm": 1.15625, "kl": 0.017452375264838338, "learning_rate": 7.2596e-06, "loss": 0.0485, "num_tokens": 85657788.0, "reward": 3.1726508140563965, "reward_std": 0.7422636151313782, "rewards/reward_fn/mean": 3.1726508140563965, "rewards/reward_fn/std": 0.7422636151313782, "step": 1852 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1580.0, "completions/max_terminated_length": 1580.0, "completions/mean_length": 435.03125, "completions/mean_terminated_length": 435.03125, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "epoch": 0.19656306354089317, "frac_reward_zero_std": 0.0, "grad_norm": 1.5078125, "kl": 0.023819874972105026, "learning_rate": 7.2592e-06, "loss": -0.0031, "num_tokens": 85716285.0, "reward": 3.243497848510742, "reward_std": 0.48213231563568115, "rewards/reward_fn/mean": 3.243497848510742, "rewards/reward_fn/std": 0.48213231563568115, "step": 1853 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1349.0, "completions/max_terminated_length": 1349.0, "completions/mean_length": 244.375, "completions/mean_terminated_length": 244.375, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "epoch": 0.1966691418266681, "frac_reward_zero_std": 1.0, "grad_norm": 0.0966796875, "kl": 0.02474185894243419, "learning_rate": 7.2588e-06, "loss": 0.001, "num_tokens": 85761929.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1854 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1524.0, "completions/max_terminated_length": 1524.0, "completions/mean_length": 437.03125, "completions/mean_terminated_length": 437.03125, "completions/min_length": 225.0, "completions/min_terminated_length": 225.0, "epoch": 0.19677522011244297, "frac_reward_zero_std": 0.0, "grad_norm": 1.453125, "kl": 0.024578659562394023, "learning_rate": 7.2584e-06, "loss": -0.0169, "num_tokens": 85827050.0, "reward": 3.555436134338379, "reward_std": 0.7116384506225586, "rewards/reward_fn/mean": 3.555436134338379, "rewards/reward_fn/std": 0.7116385102272034, "step": 1855 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 849.0, "completions/max_terminated_length": 849.0, "completions/mean_length": 308.8125, "completions/mean_terminated_length": 308.8125, "completions/min_length": 224.0, "completions/min_terminated_length": 224.0, "epoch": 0.19688129839821789, "frac_reward_zero_std": 0.0, "grad_norm": 1.609375, "kl": 0.02469022199511528, "learning_rate": 7.258e-06, "loss": 0.0686, "num_tokens": 85877028.0, "reward": 2.7643117904663086, "reward_std": 0.04094107821583748, "rewards/reward_fn/mean": 2.7643117904663086, "rewards/reward_fn/std": 0.0409410260617733, "step": 1856 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 950.0, "completions/max_terminated_length": 950.0, "completions/mean_length": 129.3125, "completions/mean_terminated_length": 129.3125, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "epoch": 0.1969873766839928, "frac_reward_zero_std": 1.0, "grad_norm": 0.115234375, "kl": 0.02429241011850536, "learning_rate": 7.2576e-06, "loss": 0.001, "num_tokens": 85896526.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1857 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 809.0, "completions/max_terminated_length": 809.0, "completions/mean_length": 231.03125, "completions/mean_terminated_length": 231.03125, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 0.19709345496976768, "frac_reward_zero_std": 1.0, "grad_norm": 0.08740234375, "kl": 0.02231363148894161, "learning_rate": 7.2572e-06, "loss": 0.0009, "num_tokens": 85946607.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1858 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1097.0, "completions/max_terminated_length": 1097.0, "completions/mean_length": 171.71875, "completions/mean_terminated_length": 171.71875, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.1971995332555426, "frac_reward_zero_std": 0.0, "grad_norm": 4.34375, "kl": 0.02453322766814381, "learning_rate": 7.2568e-06, "loss": 0.2765, "num_tokens": 85989382.0, "reward": 3.923966646194458, "reward_std": 0.2992479205131531, "rewards/reward_fn/mean": 3.923966646194458, "rewards/reward_fn/std": 0.2992479205131531, "step": 1859 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 457.0, "completions/max_terminated_length": 457.0, "completions/mean_length": 283.9375, "completions/mean_terminated_length": 283.9375, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "epoch": 0.19730561154131748, "frac_reward_zero_std": 0.0, "grad_norm": 1.546875, "kl": 0.025952878408133984, "learning_rate": 7.2564e-06, "loss": 0.0263, "num_tokens": 86037284.0, "reward": 3.002805233001709, "reward_std": 0.48838910460472107, "rewards/reward_fn/mean": 3.002805233001709, "rewards/reward_fn/std": 0.4883890450000763, "step": 1860 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 683.0, "completions/max_terminated_length": 683.0, "completions/mean_length": 319.28125, "completions/mean_terminated_length": 319.28125, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "epoch": 0.1974116898270924, "frac_reward_zero_std": 0.0, "grad_norm": 1.609375, "kl": 0.02484931843355298, "learning_rate": 7.256e-06, "loss": 0.0074, "num_tokens": 86085517.0, "reward": 3.792180061340332, "reward_std": 0.5296205878257751, "rewards/reward_fn/mean": 3.792180061340332, "rewards/reward_fn/std": 0.5296205878257751, "step": 1861 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1432.0, "completions/mean_length": 391.53125, "completions/mean_terminated_length": 338.0967712402344, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 0.1975177681128673, "frac_reward_zero_std": 0.0, "grad_norm": 1.0625, "kl": 0.023601802764460444, "learning_rate": 7.2556e-06, "loss": 0.3086, "num_tokens": 86129406.0, "reward": 3.803504228591919, "reward_std": 0.8031951189041138, "rewards/reward_fn/mean": 3.803504228591919, "rewards/reward_fn/std": 0.8031951785087585, "step": 1862 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 565.0, "completions/max_terminated_length": 565.0, "completions/mean_length": 215.875, "completions/mean_terminated_length": 215.875, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 0.1976238463986422, "frac_reward_zero_std": 1.0, "grad_norm": 0.12060546875, "kl": 0.028308047214522958, "learning_rate": 7.2552e-06, "loss": 0.0011, "num_tokens": 86176090.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1863 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1234.0, "completions/max_terminated_length": 1234.0, "completions/mean_length": 425.875, "completions/mean_terminated_length": 425.875, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "epoch": 0.1977299246844171, "frac_reward_zero_std": 0.0, "grad_norm": 1.4296875, "kl": 0.022070118226110935, "learning_rate": 7.2548e-06, "loss": 0.0846, "num_tokens": 86221430.0, "reward": 3.93414306640625, "reward_std": 0.25952455401420593, "rewards/reward_fn/mean": 3.93414306640625, "rewards/reward_fn/std": 0.25952455401420593, "step": 1864 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1249.0, "completions/max_terminated_length": 1249.0, "completions/mean_length": 464.375, "completions/mean_terminated_length": 464.375, "completions/min_length": 259.0, "completions/min_terminated_length": 259.0, "epoch": 0.197836002970192, "frac_reward_zero_std": 0.0, "grad_norm": 1.546875, "kl": 0.026702645933255553, "learning_rate": 7.2544e-06, "loss": 0.0719, "num_tokens": 86280162.0, "reward": 3.4960038661956787, "reward_std": 0.583372175693512, "rewards/reward_fn/mean": 3.4960038661956787, "rewards/reward_fn/std": 0.583372175693512, "step": 1865 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 789.0, "completions/max_terminated_length": 789.0, "completions/mean_length": 268.34375, "completions/mean_terminated_length": 268.34375, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 0.1979420812559669, "frac_reward_zero_std": 0.0, "grad_norm": 1.7421875, "kl": 0.03491222928278148, "learning_rate": 7.2539999999999995e-06, "loss": -0.0919, "num_tokens": 86320909.0, "reward": 3.5736351013183594, "reward_std": 0.4795069098472595, "rewards/reward_fn/mean": 3.5736351013183594, "rewards/reward_fn/std": 0.4795069098472595, "step": 1866 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 832.0, "completions/max_terminated_length": 832.0, "completions/mean_length": 301.21875, "completions/mean_terminated_length": 301.21875, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "epoch": 0.1980481595417418, "frac_reward_zero_std": 0.0, "grad_norm": 1.59375, "kl": 0.023877075407654047, "learning_rate": 7.2535999999999995e-06, "loss": -0.022, "num_tokens": 86362292.0, "reward": 3.648834228515625, "reward_std": 0.529570996761322, "rewards/reward_fn/mean": 3.648834228515625, "rewards/reward_fn/std": 0.5295709371566772, "step": 1867 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1631.0, "completions/max_terminated_length": 1631.0, "completions/mean_length": 284.46875, "completions/mean_terminated_length": 284.46875, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.1981542378275167, "frac_reward_zero_std": 0.0, "grad_norm": 2.015625, "kl": 0.03326743561774492, "learning_rate": 7.2531999999999994e-06, "loss": 0.0016, "num_tokens": 86419203.0, "reward": 2.758133888244629, "reward_std": 0.042066995054483414, "rewards/reward_fn/mean": 2.758133888244629, "rewards/reward_fn/std": 0.04206700250506401, "step": 1868 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 645.0, "completions/max_terminated_length": 645.0, "completions/mean_length": 166.84375, "completions/mean_terminated_length": 166.84375, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.19826031611329162, "frac_reward_zero_std": 1.0, "grad_norm": 0.1259765625, "kl": 0.023756607668474317, "learning_rate": 7.252799999999999e-06, "loss": 0.001, "num_tokens": 86455902.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1869 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 551.0, "completions/max_terminated_length": 551.0, "completions/mean_length": 290.03125, "completions/mean_terminated_length": 290.03125, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "epoch": 0.1983663943990665, "frac_reward_zero_std": 0.0, "grad_norm": 1.7890625, "kl": 0.02618178352713585, "learning_rate": 7.252399999999999e-06, "loss": 0.0031, "num_tokens": 86495359.0, "reward": 2.5693161487579346, "reward_std": 0.526289165019989, "rewards/reward_fn/mean": 2.5693161487579346, "rewards/reward_fn/std": 0.526289165019989, "step": 1870 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1406.0, "completions/max_terminated_length": 1406.0, "completions/mean_length": 240.4375, "completions/mean_terminated_length": 240.4375, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 0.19847247268484142, "frac_reward_zero_std": 0.0, "grad_norm": 1.9296875, "kl": 0.02834569150581956, "learning_rate": 7.251999999999999e-06, "loss": -0.0177, "num_tokens": 86530893.0, "reward": 3.391087055206299, "reward_std": 0.46350401639938354, "rewards/reward_fn/mean": 3.391087055206299, "rewards/reward_fn/std": 0.46350395679473877, "step": 1871 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 791.0, "completions/max_terminated_length": 791.0, "completions/mean_length": 240.71875, "completions/mean_terminated_length": 240.71875, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.1985785509706163, "frac_reward_zero_std": 1.0, "grad_norm": 0.09228515625, "kl": 0.024594660149887204, "learning_rate": 7.251599999999999e-06, "loss": 0.001, "num_tokens": 86558980.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1872 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1009.0, "completions/max_terminated_length": 1009.0, "completions/mean_length": 278.96875, "completions/mean_terminated_length": 278.96875, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.19868462925639122, "frac_reward_zero_std": 1.0, "grad_norm": 0.08544921875, "kl": 0.02399196708574891, "learning_rate": 7.251199999999999e-06, "loss": 0.001, "num_tokens": 86602883.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1873 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 518.0, "completions/max_terminated_length": 518.0, "completions/mean_length": 296.53125, "completions/mean_terminated_length": 296.53125, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 0.19879070754216613, "frac_reward_zero_std": 0.0, "grad_norm": 1.828125, "kl": 0.029965325025841594, "learning_rate": 7.250799999999999e-06, "loss": 0.0653, "num_tokens": 86649364.0, "reward": 3.2196264266967773, "reward_std": 0.19695112109184265, "rewards/reward_fn/mean": 3.2196264266967773, "rewards/reward_fn/std": 0.19695109128952026, "step": 1874 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1055.0, "completions/max_terminated_length": 1055.0, "completions/mean_length": 279.03125, "completions/mean_terminated_length": 279.03125, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 0.19889678582794101, "frac_reward_zero_std": 0.0, "grad_norm": 1.015625, "kl": 0.02199955377727747, "learning_rate": 7.250399999999999e-06, "loss": -0.065, "num_tokens": 86696021.0, "reward": 2.7779619693756104, "reward_std": 0.1980944573879242, "rewards/reward_fn/mean": 2.7779619693756104, "rewards/reward_fn/std": 0.19809450209140778, "step": 1875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 878.0, "completions/mean_length": 601.0, "completions/mean_terminated_length": 554.3225708007812, "completions/min_length": 334.0, "completions/min_terminated_length": 334.0, "epoch": 0.19900286411371593, "frac_reward_zero_std": 0.0, "grad_norm": 1.5234375, "kl": 0.026174088707193732, "learning_rate": 7.25e-06, "loss": 0.1704, "num_tokens": 86767861.0, "reward": 2.664578914642334, "reward_std": 0.6315154433250427, "rewards/reward_fn/mean": 2.664578914642334, "rewards/reward_fn/std": 0.6315154433250427, "step": 1876 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 292.0, "completions/max_terminated_length": 292.0, "completions/mean_length": 190.21875, "completions/mean_terminated_length": 190.21875, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "epoch": 0.1991089423994908, "frac_reward_zero_std": 1.0, "grad_norm": 0.10009765625, "kl": 0.02615090156905353, "learning_rate": 7.2496e-06, "loss": 0.001, "num_tokens": 86796092.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1877 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 410.0, "completions/max_terminated_length": 410.0, "completions/mean_length": 94.09375, "completions/mean_terminated_length": 94.09375, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "epoch": 0.19921502068526573, "frac_reward_zero_std": 1.0, "grad_norm": 0.10791015625, "kl": 0.017111393972299993, "learning_rate": 7.2492e-06, "loss": 0.0007, "num_tokens": 86830815.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1878 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 822.0, "completions/max_terminated_length": 822.0, "completions/mean_length": 305.34375, "completions/mean_terminated_length": 305.34375, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.19932109897104064, "frac_reward_zero_std": 0.0, "grad_norm": 1.6015625, "kl": 0.024205820402130485, "learning_rate": 7.2488e-06, "loss": 0.002, "num_tokens": 86879946.0, "reward": 3.9707565307617188, "reward_std": 0.165426567196846, "rewards/reward_fn/mean": 3.9707565307617188, "rewards/reward_fn/std": 0.16542655229568481, "step": 1879 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1539.0, "completions/max_terminated_length": 1539.0, "completions/mean_length": 450.59375, "completions/mean_terminated_length": 450.59375, "completions/min_length": 239.0, "completions/min_terminated_length": 239.0, "epoch": 0.19942717725681552, "frac_reward_zero_std": 0.0, "grad_norm": 1.5078125, "kl": 0.021435604197904468, "learning_rate": 7.2484e-06, "loss": 0.0263, "num_tokens": 86929725.0, "reward": 3.7882189750671387, "reward_std": 0.4483034312725067, "rewards/reward_fn/mean": 3.7882189750671387, "rewards/reward_fn/std": 0.4483034312725067, "step": 1880 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 486.0, "completions/max_terminated_length": 486.0, "completions/mean_length": 262.28125, "completions/mean_terminated_length": 262.28125, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 0.19953325554259044, "frac_reward_zero_std": 0.0, "grad_norm": 1.984375, "kl": 0.023225291399285197, "learning_rate": 7.248e-06, "loss": 0.0906, "num_tokens": 86988614.0, "reward": 3.082364559173584, "reward_std": 0.5728388428688049, "rewards/reward_fn/mean": 3.082364559173584, "rewards/reward_fn/std": 0.5728388428688049, "step": 1881 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1385.0, "completions/max_terminated_length": 1385.0, "completions/mean_length": 569.875, "completions/mean_terminated_length": 569.875, "completions/min_length": 367.0, "completions/min_terminated_length": 367.0, "epoch": 0.19963933382836532, "frac_reward_zero_std": 0.0, "grad_norm": 1.265625, "kl": 0.021053237840533257, "learning_rate": 7.2476e-06, "loss": 0.1207, "num_tokens": 87042786.0, "reward": 2.95101261138916, "reward_std": 0.061616383492946625, "rewards/reward_fn/mean": 2.95101261138916, "rewards/reward_fn/std": 0.061616357415914536, "step": 1882 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1802.0, "completions/mean_length": 692.28125, "completions/mean_terminated_length": 648.54833984375, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "epoch": 0.19974541211414024, "frac_reward_zero_std": 0.0, "grad_norm": 1.234375, "kl": 0.014040675945580006, "learning_rate": 7.2472e-06, "loss": 0.2063, "num_tokens": 87110667.0, "reward": 3.5004072189331055, "reward_std": 1.0367085933685303, "rewards/reward_fn/mean": 3.5004072189331055, "rewards/reward_fn/std": 1.0367085933685303, "step": 1883 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 732.0, "completions/max_terminated_length": 732.0, "completions/mean_length": 233.75, "completions/mean_terminated_length": 233.75, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 0.19985149039991515, "frac_reward_zero_std": 0.0, "grad_norm": 1.9453125, "kl": 0.03067116648890078, "learning_rate": 7.2468e-06, "loss": 0.0167, "num_tokens": 87171843.0, "reward": 3.351804733276367, "reward_std": 0.448964923620224, "rewards/reward_fn/mean": 3.351804733276367, "rewards/reward_fn/std": 0.448964923620224, "step": 1884 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1385.0, "completions/mean_length": 567.3125, "completions/mean_terminated_length": 519.5484008789062, "completions/min_length": 273.0, "completions/min_terminated_length": 273.0, "epoch": 0.19995756868569003, "frac_reward_zero_std": 0.0, "grad_norm": 1.203125, "kl": 0.0241457661613822, "learning_rate": 7.2464e-06, "loss": 0.0926, "num_tokens": 87223469.0, "reward": 2.813939094543457, "reward_std": 0.8173084855079651, "rewards/reward_fn/mean": 2.813939094543457, "rewards/reward_fn/std": 0.8173085451126099, "step": 1885 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1567.0, "completions/max_terminated_length": 1567.0, "completions/mean_length": 442.90625, "completions/mean_terminated_length": 442.90625, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "epoch": 0.20006364697146495, "frac_reward_zero_std": 0.0, "grad_norm": 1.7890625, "kl": 0.024024329613894224, "learning_rate": 7.246e-06, "loss": 0.2072, "num_tokens": 87270762.0, "reward": 2.7583117485046387, "reward_std": 0.26737165451049805, "rewards/reward_fn/mean": 2.7583117485046387, "rewards/reward_fn/std": 0.26737165451049805, "step": 1886 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1191.0, "completions/max_terminated_length": 1191.0, "completions/mean_length": 361.5625, "completions/mean_terminated_length": 361.5625, "completions/min_length": 37.0, "completions/min_terminated_length": 37.0, "epoch": 0.20016972525723983, "frac_reward_zero_std": 0.0, "grad_norm": 1.5546875, "kl": 0.026753748068585992, "learning_rate": 7.245599999999999e-06, "loss": -0.1456, "num_tokens": 87316988.0, "reward": 2.71528959274292, "reward_std": 0.49664467573165894, "rewards/reward_fn/mean": 2.71528959274292, "rewards/reward_fn/std": 0.49664464592933655, "step": 1887 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 334.0, "completions/max_terminated_length": 334.0, "completions/mean_length": 196.3125, "completions/mean_terminated_length": 196.3125, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 0.20027580354301475, "frac_reward_zero_std": 1.0, "grad_norm": 0.1162109375, "kl": 0.02518186066299677, "learning_rate": 7.2452e-06, "loss": 0.001, "num_tokens": 87361126.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1888 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 482.0, "completions/max_terminated_length": 482.0, "completions/mean_length": 223.0, "completions/mean_terminated_length": 223.0, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 0.20038188182878966, "frac_reward_zero_std": 0.0, "grad_norm": 1.8671875, "kl": 0.03339630598202348, "learning_rate": 7.2448e-06, "loss": 0.0036, "num_tokens": 87409702.0, "reward": 3.3883249759674072, "reward_std": 0.5844486355781555, "rewards/reward_fn/mean": 3.3883249759674072, "rewards/reward_fn/std": 0.5844485759735107, "step": 1889 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1150.0, "completions/max_terminated_length": 1150.0, "completions/mean_length": 362.40625, "completions/mean_terminated_length": 362.40625, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "epoch": 0.20048796011456455, "frac_reward_zero_std": 0.0, "grad_norm": 1.578125, "kl": 0.03504353458993137, "learning_rate": 7.2444e-06, "loss": 0.0972, "num_tokens": 87458163.0, "reward": 3.851999282836914, "reward_std": 0.351296991109848, "rewards/reward_fn/mean": 3.851999282836914, "rewards/reward_fn/std": 0.35129696130752563, "step": 1890 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1037.0, "completions/max_terminated_length": 1037.0, "completions/mean_length": 273.3125, "completions/mean_terminated_length": 273.3125, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "epoch": 0.20059403840033946, "frac_reward_zero_std": 0.0, "grad_norm": 1.5703125, "kl": 0.028511138632893562, "learning_rate": 7.244e-06, "loss": 0.0735, "num_tokens": 87498813.0, "reward": 3.1870431900024414, "reward_std": 0.5579171776771545, "rewards/reward_fn/mean": 3.1870431900024414, "rewards/reward_fn/std": 0.5579171776771545, "step": 1891 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1480.0, "completions/max_terminated_length": 1480.0, "completions/mean_length": 343.09375, "completions/mean_terminated_length": 343.09375, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.20070011668611434, "frac_reward_zero_std": 0.0, "grad_norm": 1.9609375, "kl": 0.024244441650807858, "learning_rate": 7.2435999999999996e-06, "loss": 0.0012, "num_tokens": 87523360.0, "reward": 3.2999448776245117, "reward_std": 0.7095201015472412, "rewards/reward_fn/mean": 3.2999448776245117, "rewards/reward_fn/std": 0.7095201015472412, "step": 1892 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 760.0, "completions/max_terminated_length": 760.0, "completions/mean_length": 260.5, "completions/mean_terminated_length": 260.5, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 0.20080619497188926, "frac_reward_zero_std": 0.0, "grad_norm": 1.96875, "kl": 0.027942111948505044, "learning_rate": 7.2431999999999995e-06, "loss": 0.0428, "num_tokens": 87573488.0, "reward": 2.9267218112945557, "reward_std": 0.04453163594007492, "rewards/reward_fn/mean": 2.9267218112945557, "rewards/reward_fn/std": 0.04453163221478462, "step": 1893 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 461.0, "completions/max_terminated_length": 461.0, "completions/mean_length": 259.25, "completions/mean_terminated_length": 259.25, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 0.20091227325766414, "frac_reward_zero_std": 0.0, "grad_norm": 1.625, "kl": 0.02555936831049621, "learning_rate": 7.2427999999999995e-06, "loss": 0.0033, "num_tokens": 87616728.0, "reward": 3.8891286849975586, "reward_std": 0.2994978129863739, "rewards/reward_fn/mean": 3.8891286849975586, "rewards/reward_fn/std": 0.2994977831840515, "step": 1894 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 133.0, "completions/max_terminated_length": 133.0, "completions/mean_length": 87.6875, "completions/mean_terminated_length": 87.6875, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "epoch": 0.20101835154343906, "frac_reward_zero_std": 1.0, "grad_norm": 0.1533203125, "kl": 0.026342453667894006, "learning_rate": 7.2423999999999995e-06, "loss": 0.0011, "num_tokens": 87654158.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1895 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 767.0, "completions/mean_length": 492.625, "completions/mean_terminated_length": 442.45159912109375, "completions/min_length": 239.0, "completions/min_terminated_length": 239.0, "epoch": 0.20112442982921397, "frac_reward_zero_std": 0.0, "grad_norm": 1.765625, "kl": 0.03021831950172782, "learning_rate": 7.2419999999999994e-06, "loss": 0.2815, "num_tokens": 87704802.0, "reward": 2.7129340171813965, "reward_std": 0.6130638718605042, "rewards/reward_fn/mean": 2.7129340171813965, "rewards/reward_fn/std": 0.6130638122558594, "step": 1896 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 329.0, "completions/max_terminated_length": 329.0, "completions/mean_length": 184.53125, "completions/mean_terminated_length": 184.53125, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 0.20123050811498885, "frac_reward_zero_std": 0.0, "grad_norm": 2.28125, "kl": 0.03523435001261532, "learning_rate": 7.241599999999999e-06, "loss": -0.0579, "num_tokens": 87740019.0, "reward": 2.7257957458496094, "reward_std": 0.22360388934612274, "rewards/reward_fn/mean": 2.7257957458496094, "rewards/reward_fn/std": 0.22360387444496155, "step": 1897 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 176.0, "completions/max_terminated_length": 176.0, "completions/mean_length": 96.75, "completions/mean_terminated_length": 96.75, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "epoch": 0.20133658640076377, "frac_reward_zero_std": 1.0, "grad_norm": 0.150390625, "kl": 0.02294772327877581, "learning_rate": 7.241199999999999e-06, "loss": 0.0009, "num_tokens": 87785803.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1898 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 681.0, "completions/max_terminated_length": 681.0, "completions/mean_length": 138.9375, "completions/mean_terminated_length": 138.9375, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "epoch": 0.20144266468653865, "frac_reward_zero_std": 0.0, "grad_norm": 2.515625, "kl": 0.01800387806724757, "learning_rate": 7.2408e-06, "loss": 0.0354, "num_tokens": 87826473.0, "reward": 3.935486316680908, "reward_std": 0.25387680530548096, "rewards/reward_fn/mean": 3.935486316680908, "rewards/reward_fn/std": 0.25387680530548096, "step": 1899 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1057.0, "completions/max_terminated_length": 1057.0, "completions/mean_length": 369.71875, "completions/mean_terminated_length": 369.71875, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "epoch": 0.20154874297231357, "frac_reward_zero_std": 0.0, "grad_norm": 1.578125, "kl": 0.020978798624128103, "learning_rate": 7.2404e-06, "loss": -0.0053, "num_tokens": 87872256.0, "reward": 2.899190664291382, "reward_std": 0.29119160771369934, "rewards/reward_fn/mean": 2.899190664291382, "rewards/reward_fn/std": 0.29119154810905457, "step": 1900 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 553.0, "completions/max_terminated_length": 553.0, "completions/mean_length": 101.96875, "completions/mean_terminated_length": 101.96875, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "epoch": 0.20165482125808848, "frac_reward_zero_std": 1.0, "grad_norm": 0.11181640625, "kl": 0.02071715716738254, "learning_rate": 7.24e-06, "loss": 0.0008, "num_tokens": 87902847.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1901 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 616.0, "completions/max_terminated_length": 616.0, "completions/mean_length": 192.15625, "completions/mean_terminated_length": 192.15625, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 0.20176089954386336, "frac_reward_zero_std": 0.0, "grad_norm": 1.9765625, "kl": 0.02114151930436492, "learning_rate": 7.2396e-06, "loss": 0.0038, "num_tokens": 87947396.0, "reward": 3.9619932174682617, "reward_std": 0.2149982750415802, "rewards/reward_fn/mean": 3.9619932174682617, "rewards/reward_fn/std": 0.2149982899427414, "step": 1902 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 858.0, "completions/max_terminated_length": 858.0, "completions/mean_length": 135.125, "completions/mean_terminated_length": 135.125, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "epoch": 0.20186697782963828, "frac_reward_zero_std": 0.0, "grad_norm": 1.90625, "kl": 0.020640159607864916, "learning_rate": 7.2392e-06, "loss": 0.1303, "num_tokens": 87990408.0, "reward": 2.981785774230957, "reward_std": 0.03771773725748062, "rewards/reward_fn/mean": 2.981785774230957, "rewards/reward_fn/std": 0.037717726081609726, "step": 1903 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1015.0, "completions/max_terminated_length": 1015.0, "completions/mean_length": 382.84375, "completions/mean_terminated_length": 382.84375, "completions/min_length": 258.0, "completions/min_terminated_length": 258.0, "epoch": 0.20197305611541316, "frac_reward_zero_std": 0.0, "grad_norm": 1.484375, "kl": 0.022791220573708415, "learning_rate": 7.2388e-06, "loss": 0.0422, "num_tokens": 88035939.0, "reward": 3.544314384460449, "reward_std": 0.5268944501876831, "rewards/reward_fn/mean": 3.544314384460449, "rewards/reward_fn/std": 0.5268945097923279, "step": 1904 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1388.0, "completions/max_terminated_length": 1388.0, "completions/mean_length": 274.625, "completions/mean_terminated_length": 274.625, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 0.20207913440118808, "frac_reward_zero_std": 1.0, "grad_norm": 0.07763671875, "kl": 0.021082836901769042, "learning_rate": 7.2384e-06, "loss": 0.0008, "num_tokens": 88080631.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1905 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1928.0, "completions/max_terminated_length": 1928.0, "completions/mean_length": 536.375, "completions/mean_terminated_length": 536.375, "completions/min_length": 245.0, "completions/min_terminated_length": 245.0, "epoch": 0.202185212686963, "frac_reward_zero_std": 0.0, "grad_norm": 1.6953125, "kl": 0.023393918527290225, "learning_rate": 7.238e-06, "loss": -0.0174, "num_tokens": 88133123.0, "reward": 2.6614913940429688, "reward_std": 0.3483014404773712, "rewards/reward_fn/mean": 2.6614913940429688, "rewards/reward_fn/std": 0.34830138087272644, "step": 1906 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1456.0, "completions/mean_length": 763.9375, "completions/mean_terminated_length": 722.51611328125, "completions/min_length": 391.0, "completions/min_terminated_length": 391.0, "epoch": 0.20229129097273787, "frac_reward_zero_std": 0.0, "grad_norm": 1.171875, "kl": 0.016707264934666455, "learning_rate": 7.237599999999999e-06, "loss": 0.0774, "num_tokens": 88201729.0, "reward": 2.7706775665283203, "reward_std": 0.7481005787849426, "rewards/reward_fn/mean": 2.7706775665283203, "rewards/reward_fn/std": 0.7481005787849426, "step": 1907 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 378.0, "completions/max_terminated_length": 378.0, "completions/mean_length": 193.75, "completions/mean_terminated_length": 193.75, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "epoch": 0.2023973692585128, "frac_reward_zero_std": 0.0, "grad_norm": 2.5625, "kl": 0.026929725194349885, "learning_rate": 7.237199999999999e-06, "loss": 0.0431, "num_tokens": 88235385.0, "reward": 3.965723991394043, "reward_std": 0.19389450550079346, "rewards/reward_fn/mean": 3.965723991394043, "rewards/reward_fn/std": 0.19389450550079346, "step": 1908 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 242.0, "completions/max_terminated_length": 242.0, "completions/mean_length": 181.1875, "completions/mean_terminated_length": 181.1875, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 0.20250344754428767, "frac_reward_zero_std": 0.0, "grad_norm": 2.4375, "kl": 0.026674387976527214, "learning_rate": 7.236799999999999e-06, "loss": 0.02, "num_tokens": 88278111.0, "reward": 3.9655656814575195, "reward_std": 0.19478978216648102, "rewards/reward_fn/mean": 3.9655656814575195, "rewards/reward_fn/std": 0.19478978216648102, "step": 1909 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 834.0, "completions/max_terminated_length": 834.0, "completions/mean_length": 374.9375, "completions/mean_terminated_length": 374.9375, "completions/min_length": 291.0, "completions/min_terminated_length": 291.0, "epoch": 0.2026095258300626, "frac_reward_zero_std": 0.0, "grad_norm": 1.1484375, "kl": 0.01570185914169997, "learning_rate": 7.236399999999999e-06, "loss": 0.0236, "num_tokens": 88323005.0, "reward": 2.7527647018432617, "reward_std": 0.04668011888861656, "rewards/reward_fn/mean": 2.7527647018432617, "rewards/reward_fn/std": 0.04668007418513298, "step": 1910 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 450.0, "completions/max_terminated_length": 450.0, "completions/mean_length": 85.625, "completions/mean_terminated_length": 85.625, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "epoch": 0.2027156041158375, "frac_reward_zero_std": 1.0, "grad_norm": 0.1025390625, "kl": 0.01357071875827387, "learning_rate": 7.236e-06, "loss": 0.0005, "num_tokens": 88352881.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1911 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1229.0, "completions/max_terminated_length": 1229.0, "completions/mean_length": 265.125, "completions/mean_terminated_length": 265.125, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 0.20282168240161239, "frac_reward_zero_std": 0.0, "grad_norm": 2.328125, "kl": 0.02683391165919602, "learning_rate": 7.2356e-06, "loss": -0.0249, "num_tokens": 88376053.0, "reward": 2.970895290374756, "reward_std": 0.605861485004425, "rewards/reward_fn/mean": 2.970895290374756, "rewards/reward_fn/std": 0.605861485004425, "step": 1912 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 557.0, "completions/max_terminated_length": 557.0, "completions/mean_length": 226.9375, "completions/mean_terminated_length": 226.9375, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 0.2029277606873873, "frac_reward_zero_std": 1.0, "grad_norm": 0.10009765625, "kl": 0.018975378945469856, "learning_rate": 7.2352e-06, "loss": 0.0008, "num_tokens": 88404915.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1913 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 336.0, "completions/max_terminated_length": 336.0, "completions/mean_length": 235.1875, "completions/mean_terminated_length": 235.1875, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "epoch": 0.20303383897316218, "frac_reward_zero_std": 1.0, "grad_norm": 0.07763671875, "kl": 0.021373262396082282, "learning_rate": 7.2348e-06, "loss": 0.0009, "num_tokens": 88457657.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1914 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 913.0, "completions/max_terminated_length": 913.0, "completions/mean_length": 377.4375, "completions/mean_terminated_length": 377.4375, "completions/min_length": 253.0, "completions/min_terminated_length": 253.0, "epoch": 0.2031399172589371, "frac_reward_zero_std": 0.0, "grad_norm": 1.4765625, "kl": 0.019720564829185605, "learning_rate": 7.2344e-06, "loss": -0.0058, "num_tokens": 88506119.0, "reward": 3.7221968173980713, "reward_std": 0.7467592358589172, "rewards/reward_fn/mean": 3.7221968173980713, "rewards/reward_fn/std": 0.7467593550682068, "step": 1915 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1589.0, "completions/max_terminated_length": 1589.0, "completions/mean_length": 450.9375, "completions/mean_terminated_length": 450.9375, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "epoch": 0.203245995544712, "frac_reward_zero_std": 0.0, "grad_norm": 1.1875, "kl": 0.01790036354213953, "learning_rate": 7.234e-06, "loss": -0.0531, "num_tokens": 88541797.0, "reward": 2.7362747192382812, "reward_std": 0.4555523097515106, "rewards/reward_fn/mean": 2.7362747192382812, "rewards/reward_fn/std": 0.4555523693561554, "step": 1916 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1013.0, "completions/max_terminated_length": 1013.0, "completions/mean_length": 329.84375, "completions/mean_terminated_length": 329.84375, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "epoch": 0.2033520738304869, "frac_reward_zero_std": 0.0, "grad_norm": 1.3984375, "kl": 0.017196921980939806, "learning_rate": 7.2336e-06, "loss": -0.0422, "num_tokens": 88603456.0, "reward": 3.38254451751709, "reward_std": 0.9729898571968079, "rewards/reward_fn/mean": 3.38254451751709, "rewards/reward_fn/std": 0.9729898571968079, "step": 1917 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 717.0, "completions/max_terminated_length": 717.0, "completions/mean_length": 220.875, "completions/mean_terminated_length": 220.875, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "epoch": 0.2034581521162618, "frac_reward_zero_std": 0.0, "grad_norm": 1.7890625, "kl": 0.023815185064449906, "learning_rate": 7.2332e-06, "loss": -0.014, "num_tokens": 88651740.0, "reward": 3.157181739807129, "reward_std": 0.4949103593826294, "rewards/reward_fn/mean": 3.157181739807129, "rewards/reward_fn/std": 0.4949103593826294, "step": 1918 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 605.0, "completions/max_terminated_length": 605.0, "completions/mean_length": 202.375, "completions/mean_terminated_length": 202.375, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 0.2035642304020367, "frac_reward_zero_std": 0.0, "grad_norm": 2.015625, "kl": 0.027365016983821988, "learning_rate": 7.2328e-06, "loss": 0.0196, "num_tokens": 88678088.0, "reward": 3.5471441745758057, "reward_std": 0.5224708914756775, "rewards/reward_fn/mean": 3.5471441745758057, "rewards/reward_fn/std": 0.5224708914756775, "step": 1919 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 2048.0, "completions/max_terminated_length": 885.0, "completions/mean_length": 690.8125, "completions/mean_terminated_length": 550.413818359375, "completions/min_length": 306.0, "completions/min_terminated_length": 306.0, "epoch": 0.2036703086878116, "frac_reward_zero_std": 0.0, "grad_norm": 1.28125, "kl": 0.02398996870033443, "learning_rate": 7.2323999999999996e-06, "loss": 0.1143, "num_tokens": 88753698.0, "reward": 1.9489907026290894, "reward_std": 0.5881763696670532, "rewards/reward_fn/mean": 1.9489907026290894, "rewards/reward_fn/std": 0.5881763696670532, "step": 1920 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 931.0, "completions/max_terminated_length": 931.0, "completions/mean_length": 315.375, "completions/mean_terminated_length": 315.375, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "epoch": 0.2037763869735865, "frac_reward_zero_std": 0.0, "grad_norm": 1.5625, "kl": 0.024634240893647075, "learning_rate": 7.2319999999999995e-06, "loss": 0.1392, "num_tokens": 88810734.0, "reward": 3.686619281768799, "reward_std": 0.5119209289550781, "rewards/reward_fn/mean": 3.686619281768799, "rewards/reward_fn/std": 0.5119208693504333, "step": 1921 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1063.0, "completions/max_terminated_length": 1063.0, "completions/mean_length": 240.15625, "completions/mean_terminated_length": 240.15625, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 0.2038824652593614, "frac_reward_zero_std": 0.0, "grad_norm": 1.859375, "kl": 0.02241891936864704, "learning_rate": 7.2315999999999995e-06, "loss": 0.0191, "num_tokens": 88852499.0, "reward": 3.011032819747925, "reward_std": 0.3259943425655365, "rewards/reward_fn/mean": 3.011032819747925, "rewards/reward_fn/std": 0.3259943425655365, "step": 1922 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 485.0, "completions/max_terminated_length": 485.0, "completions/mean_length": 310.5, "completions/mean_terminated_length": 310.5, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "epoch": 0.20398854354513632, "frac_reward_zero_std": 0.0, "grad_norm": 1.671875, "kl": 0.022662291070446372, "learning_rate": 7.2312e-06, "loss": 0.0401, "num_tokens": 88901475.0, "reward": 3.343940258026123, "reward_std": 0.7196161150932312, "rewards/reward_fn/mean": 3.343940258026123, "rewards/reward_fn/std": 0.7196161150932312, "step": 1923 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 704.0, "completions/max_terminated_length": 704.0, "completions/mean_length": 229.6875, "completions/mean_terminated_length": 229.6875, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 0.2040946218309112, "frac_reward_zero_std": 0.0, "grad_norm": 1.734375, "kl": 0.027691754046827555, "learning_rate": 7.2308e-06, "loss": 0.0728, "num_tokens": 88939737.0, "reward": 3.8904521465301514, "reward_std": 0.34605592489242554, "rewards/reward_fn/mean": 3.8904521465301514, "rewards/reward_fn/std": 0.34605586528778076, "step": 1924 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 129.96875, "completions/mean_terminated_length": 129.96875, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "epoch": 0.20420070011668612, "frac_reward_zero_std": 1.0, "grad_norm": 0.11865234375, "kl": 0.022279798751696944, "learning_rate": 7.2304e-06, "loss": 0.0009, "num_tokens": 88987640.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1925 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 517.0, "completions/max_terminated_length": 517.0, "completions/mean_length": 299.75, "completions/mean_terminated_length": 299.75, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "epoch": 0.204306778402461, "frac_reward_zero_std": 0.0, "grad_norm": 1.640625, "kl": 0.026795195881277323, "learning_rate": 7.23e-06, "loss": 0.0406, "num_tokens": 89030320.0, "reward": 2.822610378265381, "reward_std": 0.061767082661390305, "rewards/reward_fn/mean": 2.822610378265381, "rewards/reward_fn/std": 0.06176706776022911, "step": 1926 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 898.0, "completions/max_terminated_length": 898.0, "completions/mean_length": 260.1875, "completions/mean_terminated_length": 260.1875, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "epoch": 0.20441285668823592, "frac_reward_zero_std": 0.0, "grad_norm": 1.7109375, "kl": 0.02193640125915408, "learning_rate": 7.229599999999999e-06, "loss": 0.0855, "num_tokens": 89073910.0, "reward": 3.5822935104370117, "reward_std": 0.5866038799285889, "rewards/reward_fn/mean": 3.5822935104370117, "rewards/reward_fn/std": 0.5866038799285889, "step": 1927 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1132.0, "completions/max_terminated_length": 1132.0, "completions/mean_length": 391.15625, "completions/mean_terminated_length": 391.15625, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "epoch": 0.20451893497401083, "frac_reward_zero_std": 0.0, "grad_norm": 1.8203125, "kl": 0.02066147024743259, "learning_rate": 7.229199999999999e-06, "loss": 0.0274, "num_tokens": 89120123.0, "reward": 3.036647319793701, "reward_std": 0.44018828868865967, "rewards/reward_fn/mean": 3.036647319793701, "rewards/reward_fn/std": 0.4401882588863373, "step": 1928 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 718.0, "completions/max_terminated_length": 718.0, "completions/mean_length": 418.71875, "completions/mean_terminated_length": 418.71875, "completions/min_length": 269.0, "completions/min_terminated_length": 269.0, "epoch": 0.20462501325978572, "frac_reward_zero_std": 0.0, "grad_norm": 1.140625, "kl": 0.018091494566760957, "learning_rate": 7.228799999999999e-06, "loss": 0.0708, "num_tokens": 89180338.0, "reward": 3.9663846492767334, "reward_std": 0.19015701115131378, "rewards/reward_fn/mean": 3.9663846492767334, "rewards/reward_fn/std": 0.19015701115131378, "step": 1929 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 228.0, "completions/max_terminated_length": 228.0, "completions/mean_length": 166.59375, "completions/mean_terminated_length": 166.59375, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 0.20473109154556063, "frac_reward_zero_std": 1.0, "grad_norm": 0.09033203125, "kl": 0.020806429674848914, "learning_rate": 7.228399999999999e-06, "loss": 0.0008, "num_tokens": 89217669.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1930 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1150.0, "completions/max_terminated_length": 1150.0, "completions/mean_length": 289.3125, "completions/mean_terminated_length": 289.3125, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "epoch": 0.2048371698313355, "frac_reward_zero_std": 0.0, "grad_norm": 2.109375, "kl": 0.02973605995066464, "learning_rate": 7.227999999999999e-06, "loss": 0.0377, "num_tokens": 89254255.0, "reward": 2.971668243408203, "reward_std": 0.39661845564842224, "rewards/reward_fn/mean": 2.971668243408203, "rewards/reward_fn/std": 0.39661842584609985, "step": 1931 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 134.0, "completions/max_terminated_length": 134.0, "completions/mean_length": 73.0625, "completions/mean_terminated_length": 73.0625, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.20494324811711043, "frac_reward_zero_std": 1.0, "grad_norm": 0.123046875, "kl": 0.01642139005707577, "learning_rate": 7.227599999999999e-06, "loss": 0.0007, "num_tokens": 89277937.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1932 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 309.0, "completions/max_terminated_length": 309.0, "completions/mean_length": 182.625, "completions/mean_terminated_length": 182.625, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "epoch": 0.20504932640288534, "frac_reward_zero_std": 1.0, "grad_norm": 0.10791015625, "kl": 0.022830116329714656, "learning_rate": 7.227199999999999e-06, "loss": 0.0009, "num_tokens": 89323109.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1933 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1055.0, "completions/max_terminated_length": 1055.0, "completions/mean_length": 358.59375, "completions/mean_terminated_length": 358.59375, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.20515540468866023, "frac_reward_zero_std": 0.0, "grad_norm": 1.515625, "kl": 0.025236302288249135, "learning_rate": 7.226799999999999e-06, "loss": 0.0009, "num_tokens": 89368024.0, "reward": 3.8563036918640137, "reward_std": 0.48152342438697815, "rewards/reward_fn/mean": 3.8563036918640137, "rewards/reward_fn/std": 0.48152339458465576, "step": 1934 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 597.0, "completions/max_terminated_length": 597.0, "completions/mean_length": 113.4375, "completions/mean_terminated_length": 113.4375, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "epoch": 0.20526148297443514, "frac_reward_zero_std": 0.0, "grad_norm": 2.90625, "kl": 0.025299014407210052, "learning_rate": 7.2264e-06, "loss": -0.0658, "num_tokens": 89409446.0, "reward": 3.0979630947113037, "reward_std": 1.1084299087524414, "rewards/reward_fn/mean": 3.0979630947113037, "rewards/reward_fn/std": 1.1084297895431519, "step": 1935 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 647.0, "completions/max_terminated_length": 647.0, "completions/mean_length": 205.46875, "completions/mean_terminated_length": 205.46875, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 0.20536756126021002, "frac_reward_zero_std": 0.0, "grad_norm": 2.03125, "kl": 0.022546561784110963, "learning_rate": 7.226e-06, "loss": 0.0005, "num_tokens": 89460885.0, "reward": 3.048456907272339, "reward_std": 0.3682403266429901, "rewards/reward_fn/mean": 3.048456907272339, "rewards/reward_fn/std": 0.3682402968406677, "step": 1936 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1415.0, "completions/max_terminated_length": 1415.0, "completions/mean_length": 530.9375, "completions/mean_terminated_length": 530.9375, "completions/min_length": 290.0, "completions/min_terminated_length": 290.0, "epoch": 0.20547363954598494, "frac_reward_zero_std": 0.0, "grad_norm": 1.2578125, "kl": 0.021520751295611262, "learning_rate": 7.2256e-06, "loss": -0.0159, "num_tokens": 89514739.0, "reward": 3.869535207748413, "reward_std": 0.31333673000335693, "rewards/reward_fn/mean": 3.869535207748413, "rewards/reward_fn/std": 0.31333670020103455, "step": 1937 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 342.0, "completions/max_terminated_length": 342.0, "completions/mean_length": 213.75, "completions/mean_terminated_length": 213.75, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "epoch": 0.20557971783175985, "frac_reward_zero_std": 1.0, "grad_norm": 0.0859375, "kl": 0.01827068265993148, "learning_rate": 7.2252e-06, "loss": 0.0007, "num_tokens": 89556011.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1938 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 124.0, "completions/max_terminated_length": 124.0, "completions/mean_length": 70.53125, "completions/mean_terminated_length": 70.53125, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.20568579611753474, "frac_reward_zero_std": 0.0, "grad_norm": 4.8125, "kl": 0.02340351662132889, "learning_rate": 7.2248e-06, "loss": -0.096, "num_tokens": 89607260.0, "reward": 3.724715232849121, "reward_std": 0.4149995446205139, "rewards/reward_fn/mean": 3.724715232849121, "rewards/reward_fn/std": 0.41499951481819153, "step": 1939 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 387.0, "completions/max_terminated_length": 387.0, "completions/mean_length": 94.125, "completions/mean_terminated_length": 94.125, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.20579187440330965, "frac_reward_zero_std": 1.0, "grad_norm": 0.1220703125, "kl": 0.017246187082491815, "learning_rate": 7.2244e-06, "loss": 0.0007, "num_tokens": 89646368.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1940 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 972.0, "completions/max_terminated_length": 972.0, "completions/mean_length": 359.59375, "completions/mean_terminated_length": 359.59375, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 0.20589795268908453, "frac_reward_zero_std": 0.0, "grad_norm": 1.8125, "kl": 0.022477731574326754, "learning_rate": 7.224e-06, "loss": 0.0377, "num_tokens": 89695955.0, "reward": 3.152523994445801, "reward_std": 0.6828972697257996, "rewards/reward_fn/mean": 3.152523994445801, "rewards/reward_fn/std": 0.6828973293304443, "step": 1941 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 915.0, "completions/max_terminated_length": 915.0, "completions/mean_length": 238.0, "completions/mean_terminated_length": 238.0, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "epoch": 0.20600403097485945, "frac_reward_zero_std": 0.0, "grad_norm": 2.328125, "kl": 0.025746502447873354, "learning_rate": 7.2236e-06, "loss": -0.1531, "num_tokens": 89738867.0, "reward": 3.314828872680664, "reward_std": 0.5770388841629028, "rewards/reward_fn/mean": 3.314828872680664, "rewards/reward_fn/std": 0.5770388841629028, "step": 1942 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 764.0, "completions/max_terminated_length": 764.0, "completions/mean_length": 340.0625, "completions/mean_terminated_length": 340.0625, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "epoch": 0.20611010926063436, "frac_reward_zero_std": 0.0, "grad_norm": 1.671875, "kl": 0.017551672644913197, "learning_rate": 7.2232e-06, "loss": 0.1222, "num_tokens": 89787157.0, "reward": 3.962864875793457, "reward_std": 0.21006862819194794, "rewards/reward_fn/mean": 3.962864875793457, "rewards/reward_fn/std": 0.21006861329078674, "step": 1943 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1634.0, "completions/mean_length": 458.9375, "completions/mean_terminated_length": 407.6773986816406, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "epoch": 0.20621618754640925, "frac_reward_zero_std": 0.0, "grad_norm": 1.6953125, "kl": 0.018971068551763892, "learning_rate": 7.2228e-06, "loss": 0.2706, "num_tokens": 89837971.0, "reward": 3.80379319190979, "reward_std": 0.8024195432662964, "rewards/reward_fn/mean": 3.80379319190979, "rewards/reward_fn/std": 0.8024195432662964, "step": 1944 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 769.0, "completions/max_terminated_length": 769.0, "completions/mean_length": 272.4375, "completions/mean_terminated_length": 272.4375, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 0.20632226583218416, "frac_reward_zero_std": 0.0, "grad_norm": 1.609375, "kl": 0.026358928764238954, "learning_rate": 7.2224e-06, "loss": -0.0807, "num_tokens": 89877921.0, "reward": 3.8525662422180176, "reward_std": 0.39788663387298584, "rewards/reward_fn/mean": 3.8525662422180176, "rewards/reward_fn/std": 0.39788660407066345, "step": 1945 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1134.0, "completions/max_terminated_length": 1134.0, "completions/mean_length": 303.8125, "completions/mean_terminated_length": 303.8125, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 0.20642834411795905, "frac_reward_zero_std": 0.0, "grad_norm": 1.3828125, "kl": 0.02223000884987414, "learning_rate": 7.2220000000000005e-06, "loss": -0.1545, "num_tokens": 89947163.0, "reward": 3.1230030059814453, "reward_std": 0.41615304350852966, "rewards/reward_fn/mean": 3.1230030059814453, "rewards/reward_fn/std": 0.4161530137062073, "step": 1946 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 889.0, "completions/max_terminated_length": 889.0, "completions/mean_length": 252.90625, "completions/mean_terminated_length": 252.90625, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.20653442240373396, "frac_reward_zero_std": 0.0, "grad_norm": 2.515625, "kl": 0.026710605947300792, "learning_rate": 7.2216e-06, "loss": 0.0586, "num_tokens": 89984600.0, "reward": 3.95160174369812, "reward_std": 0.1915348619222641, "rewards/reward_fn/mean": 3.95160174369812, "rewards/reward_fn/std": 0.1915348470211029, "step": 1947 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 440.0, "completions/max_terminated_length": 440.0, "completions/mean_length": 299.71875, "completions/mean_terminated_length": 299.71875, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "epoch": 0.20664050068950887, "frac_reward_zero_std": 0.0, "grad_norm": 1.5703125, "kl": 0.02253810246475041, "learning_rate": 7.2211999999999996e-06, "loss": 0.0526, "num_tokens": 90031375.0, "reward": 3.298675537109375, "reward_std": 0.590697169303894, "rewards/reward_fn/mean": 3.298675537109375, "rewards/reward_fn/std": 0.590697169303894, "step": 1948 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 946.0, "completions/max_terminated_length": 946.0, "completions/mean_length": 257.4375, "completions/mean_terminated_length": 257.4375, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 0.20674657897528376, "frac_reward_zero_std": 0.0, "grad_norm": 1.7734375, "kl": 0.02902632998302579, "learning_rate": 7.2207999999999995e-06, "loss": 0.0865, "num_tokens": 90083869.0, "reward": 3.1455907821655273, "reward_std": 0.08136258274316788, "rewards/reward_fn/mean": 3.1455907821655273, "rewards/reward_fn/std": 0.08136259019374847, "step": 1949 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 755.0, "completions/max_terminated_length": 755.0, "completions/mean_length": 172.09375, "completions/mean_terminated_length": 172.09375, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "epoch": 0.20685265726105867, "frac_reward_zero_std": 1.0, "grad_norm": 0.07470703125, "kl": 0.018278813688084483, "learning_rate": 7.2203999999999995e-06, "loss": 0.0007, "num_tokens": 90104640.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1950 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1162.0, "completions/max_terminated_length": 1162.0, "completions/mean_length": 359.125, "completions/mean_terminated_length": 359.125, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "epoch": 0.20695873554683356, "frac_reward_zero_std": 0.0, "grad_norm": 1.796875, "kl": 0.02860428229905665, "learning_rate": 7.2199999999999995e-06, "loss": 0.048, "num_tokens": 90178148.0, "reward": 3.1251840591430664, "reward_std": 0.5875481367111206, "rewards/reward_fn/mean": 3.1251840591430664, "rewards/reward_fn/std": 0.5875481367111206, "step": 1951 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 338.0, "completions/max_terminated_length": 338.0, "completions/mean_length": 159.5625, "completions/mean_terminated_length": 159.5625, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.20706481383260847, "frac_reward_zero_std": 0.0, "grad_norm": 2.34375, "kl": 0.025235574459657073, "learning_rate": 7.2195999999999995e-06, "loss": -0.0463, "num_tokens": 90223062.0, "reward": 3.6346235275268555, "reward_std": 0.5507158041000366, "rewards/reward_fn/mean": 3.6346235275268555, "rewards/reward_fn/std": 0.5507158041000366, "step": 1952 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 759.0, "completions/max_terminated_length": 759.0, "completions/mean_length": 256.8125, "completions/mean_terminated_length": 256.8125, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 0.20717089211838335, "frac_reward_zero_std": 1.0, "grad_norm": 0.07958984375, "kl": 0.020091851707547903, "learning_rate": 7.219199999999999e-06, "loss": 0.0008, "num_tokens": 90267120.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1953 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 217.0, "completions/max_terminated_length": 217.0, "completions/mean_length": 177.78125, "completions/mean_terminated_length": 177.78125, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.20727697040415827, "frac_reward_zero_std": 1.0, "grad_norm": 0.16015625, "kl": 0.02827597805298865, "learning_rate": 7.218799999999999e-06, "loss": 0.0011, "num_tokens": 90325865.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1954 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1240.0, "completions/max_terminated_length": 1240.0, "completions/mean_length": 378.53125, "completions/mean_terminated_length": 378.53125, "completions/min_length": 240.0, "completions/min_terminated_length": 240.0, "epoch": 0.20738304868993318, "frac_reward_zero_std": 0.0, "grad_norm": 1.3515625, "kl": 0.02135020843707025, "learning_rate": 7.218399999999999e-06, "loss": 0.1212, "num_tokens": 90372186.0, "reward": 3.716845750808716, "reward_std": 0.5746200084686279, "rewards/reward_fn/mean": 3.716845750808716, "rewards/reward_fn/std": 0.5746200084686279, "step": 1955 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 989.0, "completions/max_terminated_length": 989.0, "completions/mean_length": 290.375, "completions/mean_terminated_length": 290.375, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.20748912697570807, "frac_reward_zero_std": 0.0, "grad_norm": 1.625, "kl": 0.022205424727872014, "learning_rate": 7.217999999999999e-06, "loss": 0.0699, "num_tokens": 90414950.0, "reward": 3.3851747512817383, "reward_std": 0.5178138017654419, "rewards/reward_fn/mean": 3.3851747512817383, "rewards/reward_fn/std": 0.5178138613700867, "step": 1956 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 616.0, "completions/max_terminated_length": 616.0, "completions/mean_length": 263.96875, "completions/mean_terminated_length": 263.96875, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 0.20759520526148298, "frac_reward_zero_std": 0.0, "grad_norm": 1.8203125, "kl": 0.024157197680324316, "learning_rate": 7.217599999999999e-06, "loss": 0.0051, "num_tokens": 90440677.0, "reward": 3.5730175971984863, "reward_std": 0.606890082359314, "rewards/reward_fn/mean": 3.5730175971984863, "rewards/reward_fn/std": 0.6068900227546692, "step": 1957 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 222.0, "completions/max_terminated_length": 222.0, "completions/mean_length": 158.15625, "completions/mean_terminated_length": 158.15625, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 0.20770128354725786, "frac_reward_zero_std": 1.0, "grad_norm": 0.1318359375, "kl": 0.028684925520792603, "learning_rate": 7.2172e-06, "loss": 0.0011, "num_tokens": 90494474.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1958 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1158.0, "completions/max_terminated_length": 1158.0, "completions/mean_length": 412.21875, "completions/mean_terminated_length": 412.21875, "completions/min_length": 228.0, "completions/min_terminated_length": 228.0, "epoch": 0.20780736183303278, "frac_reward_zero_std": 0.0, "grad_norm": 1.515625, "kl": 0.01932144328020513, "learning_rate": 7.2168e-06, "loss": 0.0304, "num_tokens": 90545521.0, "reward": 3.859956741333008, "reward_std": 0.5511422157287598, "rewards/reward_fn/mean": 3.859956741333008, "rewards/reward_fn/std": 0.5511422157287598, "step": 1959 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 958.0, "completions/max_terminated_length": 958.0, "completions/mean_length": 437.4375, "completions/mean_terminated_length": 437.4375, "completions/min_length": 242.0, "completions/min_terminated_length": 242.0, "epoch": 0.2079134401188077, "frac_reward_zero_std": 0.0, "grad_norm": 1.2734375, "kl": 0.01740968832746148, "learning_rate": 7.2164e-06, "loss": -0.0076, "num_tokens": 90596639.0, "reward": 2.462031841278076, "reward_std": 0.5236942172050476, "rewards/reward_fn/mean": 2.462031841278076, "rewards/reward_fn/std": 0.5236942172050476, "step": 1960 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 766.0, "completions/max_terminated_length": 766.0, "completions/mean_length": 245.875, "completions/mean_terminated_length": 245.875, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.20801951840458258, "frac_reward_zero_std": 0.0, "grad_norm": 2.328125, "kl": 0.021013896446675062, "learning_rate": 7.216e-06, "loss": 0.2187, "num_tokens": 90631643.0, "reward": 3.968956470489502, "reward_std": 0.17560802400112152, "rewards/reward_fn/mean": 3.968956470489502, "rewards/reward_fn/std": 0.17560799419879913, "step": 1961 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 372.0, "completions/max_terminated_length": 372.0, "completions/mean_length": 127.25, "completions/mean_terminated_length": 127.25, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "epoch": 0.2081255966903575, "frac_reward_zero_std": 1.0, "grad_norm": 0.1240234375, "kl": 0.02872321312315762, "learning_rate": 7.2156e-06, "loss": 0.0011, "num_tokens": 90663715.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1962 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 454.0, "completions/max_terminated_length": 454.0, "completions/mean_length": 162.03125, "completions/mean_terminated_length": 162.03125, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 0.20823167497613237, "frac_reward_zero_std": 0.0, "grad_norm": 1.890625, "kl": 0.022123705130070448, "learning_rate": 7.2152e-06, "loss": 0.0121, "num_tokens": 90700004.0, "reward": 3.959567070007324, "reward_std": 0.22872252762317657, "rewards/reward_fn/mean": 3.959567070007324, "rewards/reward_fn/std": 0.22872251272201538, "step": 1963 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1812.0, "completions/max_terminated_length": 1812.0, "completions/mean_length": 307.46875, "completions/mean_terminated_length": 307.46875, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 0.2083377532619073, "frac_reward_zero_std": 0.0, "grad_norm": 1.8515625, "kl": 0.029133395524695516, "learning_rate": 7.2148e-06, "loss": 0.1083, "num_tokens": 90749555.0, "reward": 3.5827994346618652, "reward_std": 0.5858627557754517, "rewards/reward_fn/mean": 3.5827994346618652, "rewards/reward_fn/std": 0.5858627557754517, "step": 1964 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1016.0, "completions/max_terminated_length": 1016.0, "completions/mean_length": 389.84375, "completions/mean_terminated_length": 389.84375, "completions/min_length": 238.0, "completions/min_terminated_length": 238.0, "epoch": 0.2084438315476822, "frac_reward_zero_std": 0.0, "grad_norm": 1.1484375, "kl": 0.018395462189801037, "learning_rate": 7.2144e-06, "loss": 0.0121, "num_tokens": 90804846.0, "reward": 2.98877215385437, "reward_std": 0.03333742171525955, "rewards/reward_fn/mean": 2.98877215385437, "rewards/reward_fn/std": 0.033337417989969254, "step": 1965 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 714.0, "completions/max_terminated_length": 714.0, "completions/mean_length": 160.375, "completions/mean_terminated_length": 160.375, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "epoch": 0.2085499098334571, "frac_reward_zero_std": 1.0, "grad_norm": 0.15234375, "kl": 0.01872719032689929, "learning_rate": 7.214e-06, "loss": 0.0007, "num_tokens": 90850298.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1966 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 206.0, "completions/max_terminated_length": 206.0, "completions/mean_length": 73.65625, "completions/mean_terminated_length": 73.65625, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 0.208655988119232, "frac_reward_zero_std": 1.0, "grad_norm": 0.11767578125, "kl": 0.015950615401379764, "learning_rate": 7.213599999999999e-06, "loss": 0.0006, "num_tokens": 90886063.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1967 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 653.0, "completions/max_terminated_length": 653.0, "completions/mean_length": 285.78125, "completions/mean_terminated_length": 285.78125, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "epoch": 0.20876206640500689, "frac_reward_zero_std": 0.0, "grad_norm": 1.1171875, "kl": 0.020902880001813173, "learning_rate": 7.213199999999999e-06, "loss": 0.0817, "num_tokens": 90931080.0, "reward": 3.021613597869873, "reward_std": 0.18543782830238342, "rewards/reward_fn/mean": 3.021613597869873, "rewards/reward_fn/std": 0.18543781340122223, "step": 1968 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1107.0, "completions/max_terminated_length": 1107.0, "completions/mean_length": 282.34375, "completions/mean_terminated_length": 282.34375, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "epoch": 0.2088681446907818, "frac_reward_zero_std": 0.0, "grad_norm": 1.515625, "kl": 0.01968630903866142, "learning_rate": 7.212799999999999e-06, "loss": 0.0353, "num_tokens": 90989203.0, "reward": 2.8288135528564453, "reward_std": 0.04306629300117493, "rewards/reward_fn/mean": 2.8288135528564453, "rewards/reward_fn/std": 0.04306626692414284, "step": 1969 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1025.0, "completions/max_terminated_length": 1025.0, "completions/mean_length": 227.25, "completions/mean_terminated_length": 227.25, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "epoch": 0.2089742229765567, "frac_reward_zero_std": 0.0, "grad_norm": 1.6875, "kl": 0.018828594125807285, "learning_rate": 7.2124e-06, "loss": 0.0222, "num_tokens": 91047163.0, "reward": 3.745974063873291, "reward_std": 0.4881126582622528, "rewards/reward_fn/mean": 3.745974063873291, "rewards/reward_fn/std": 0.4881126284599304, "step": 1970 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1225.0, "completions/max_terminated_length": 1225.0, "completions/mean_length": 414.4375, "completions/mean_terminated_length": 414.4375, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "epoch": 0.2090803012623316, "frac_reward_zero_std": 0.0, "grad_norm": 1.5, "kl": 0.021128013264387846, "learning_rate": 7.212e-06, "loss": 0.0099, "num_tokens": 91107625.0, "reward": 3.6306121349334717, "reward_std": 0.7473952174186707, "rewards/reward_fn/mean": 3.6306121349334717, "rewards/reward_fn/std": 0.7473952770233154, "step": 1971 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1020.0, "completions/max_terminated_length": 1020.0, "completions/mean_length": 283.71875, "completions/mean_terminated_length": 283.71875, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 0.2091863795481065, "frac_reward_zero_std": 1.0, "grad_norm": 0.0654296875, "kl": 0.014698807732202113, "learning_rate": 7.2116e-06, "loss": 0.0006, "num_tokens": 91172608.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1972 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 832.0, "completions/max_terminated_length": 832.0, "completions/mean_length": 321.625, "completions/mean_terminated_length": 321.625, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "epoch": 0.2092924578338814, "frac_reward_zero_std": 0.0, "grad_norm": 2.03125, "kl": 0.021146057173609734, "learning_rate": 7.2112e-06, "loss": -0.0184, "num_tokens": 91230484.0, "reward": 3.395284414291382, "reward_std": 0.5098738074302673, "rewards/reward_fn/mean": 3.395284414291382, "rewards/reward_fn/std": 0.5098738074302673, "step": 1973 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 532.0, "completions/max_terminated_length": 532.0, "completions/mean_length": 100.03125, "completions/mean_terminated_length": 100.03125, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "epoch": 0.2093985361196563, "frac_reward_zero_std": 1.0, "grad_norm": 0.1376953125, "kl": 0.0216621074359864, "learning_rate": 7.2108e-06, "loss": 0.0009, "num_tokens": 91264469.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1974 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1422.0, "completions/mean_length": 542.25, "completions/mean_terminated_length": 493.6773986816406, "completions/min_length": 333.0, "completions/min_terminated_length": 333.0, "epoch": 0.20950461440543122, "frac_reward_zero_std": 0.0, "grad_norm": 1.4765625, "kl": 0.01485793653409928, "learning_rate": 7.2104e-06, "loss": 0.1539, "num_tokens": 91328349.0, "reward": 3.7629165649414062, "reward_std": 0.7722904682159424, "rewards/reward_fn/mean": 3.7629165649414062, "rewards/reward_fn/std": 0.7722904682159424, "step": 1975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 342.0, "completions/max_terminated_length": 342.0, "completions/mean_length": 108.21875, "completions/mean_terminated_length": 108.21875, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "epoch": 0.2096106926912061, "frac_reward_zero_std": 0.0, "grad_norm": 2.828125, "kl": 0.018597336602397263, "learning_rate": 7.21e-06, "loss": -0.0098, "num_tokens": 91363300.0, "reward": 3.804640293121338, "reward_std": 0.34462565183639526, "rewards/reward_fn/mean": 3.804640293121338, "rewards/reward_fn/std": 0.3446256220340729, "step": 1976 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 859.0, "completions/max_terminated_length": 859.0, "completions/mean_length": 279.78125, "completions/mean_terminated_length": 279.78125, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.20971677097698102, "frac_reward_zero_std": 0.0, "grad_norm": 1.625, "kl": 0.01667685038410127, "learning_rate": 7.2095999999999995e-06, "loss": 0.1173, "num_tokens": 91388605.0, "reward": 3.961299180984497, "reward_std": 0.2189248949289322, "rewards/reward_fn/mean": 3.961299180984497, "rewards/reward_fn/std": 0.2189248949289322, "step": 1977 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 900.0, "completions/max_terminated_length": 900.0, "completions/mean_length": 229.0, "completions/mean_terminated_length": 229.0, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "epoch": 0.2098228492627559, "frac_reward_zero_std": 1.0, "grad_norm": 0.08203125, "kl": 0.020580741576850414, "learning_rate": 7.2091999999999995e-06, "loss": 0.0008, "num_tokens": 91426109.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1978 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 952.0, "completions/max_terminated_length": 952.0, "completions/mean_length": 310.21875, "completions/mean_terminated_length": 310.21875, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 0.20992892754853082, "frac_reward_zero_std": 0.0, "grad_norm": 1.5625, "kl": 0.02507057785987854, "learning_rate": 7.2087999999999995e-06, "loss": 0.1101, "num_tokens": 91482500.0, "reward": 3.840827465057373, "reward_std": 0.37968137860298157, "rewards/reward_fn/mean": 3.840827465057373, "rewards/reward_fn/std": 0.37968140840530396, "step": 1979 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1106.0, "completions/max_terminated_length": 1106.0, "completions/mean_length": 195.875, "completions/mean_terminated_length": 195.875, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "epoch": 0.2100350058343057, "frac_reward_zero_std": 0.0, "grad_norm": 2.4375, "kl": 0.02671630633994937, "learning_rate": 7.2083999999999995e-06, "loss": 0.0589, "num_tokens": 91522784.0, "reward": 2.8789358139038086, "reward_std": 0.2997475862503052, "rewards/reward_fn/mean": 2.8789358139038086, "rewards/reward_fn/std": 0.2997475862503052, "step": 1980 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 708.0, "completions/max_terminated_length": 708.0, "completions/mean_length": 149.78125, "completions/mean_terminated_length": 149.78125, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "epoch": 0.21014108412008062, "frac_reward_zero_std": 0.0, "grad_norm": 3.65625, "kl": 0.025148641783744097, "learning_rate": 7.208e-06, "loss": -0.0394, "num_tokens": 91563673.0, "reward": 3.7303476333618164, "reward_std": 0.26785701513290405, "rewards/reward_fn/mean": 3.7303476333618164, "rewards/reward_fn/std": 0.26785698533058167, "step": 1981 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 700.0, "completions/max_terminated_length": 700.0, "completions/mean_length": 207.4375, "completions/mean_terminated_length": 207.4375, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.21024716240585553, "frac_reward_zero_std": 0.0, "grad_norm": 2.234375, "kl": 0.025888599455356598, "learning_rate": 7.2076e-06, "loss": 0.1438, "num_tokens": 91597991.0, "reward": 3.4232587814331055, "reward_std": 0.546558678150177, "rewards/reward_fn/mean": 3.4232587814331055, "rewards/reward_fn/std": 0.546558678150177, "step": 1982 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1835.0, "completions/max_terminated_length": 1835.0, "completions/mean_length": 374.125, "completions/mean_terminated_length": 374.125, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "epoch": 0.21035324069163042, "frac_reward_zero_std": 0.0, "grad_norm": 1.984375, "kl": 0.02524533332325518, "learning_rate": 7.2072e-06, "loss": 0.051, "num_tokens": 91647627.0, "reward": 2.730729103088379, "reward_std": 0.4013659358024597, "rewards/reward_fn/mean": 2.730729103088379, "rewards/reward_fn/std": 0.4013659656047821, "step": 1983 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1921.0, "completions/mean_length": 715.15625, "completions/mean_terminated_length": 577.27587890625, "completions/min_length": 320.0, "completions/min_terminated_length": 320.0, "epoch": 0.21045931897740533, "frac_reward_zero_std": 0.0, "grad_norm": 1.328125, "kl": 0.02549780602566898, "learning_rate": 7.2068e-06, "loss": 0.1905, "num_tokens": 91701296.0, "reward": 2.3230574131011963, "reward_std": 0.7773017883300781, "rewards/reward_fn/mean": 2.3230574131011963, "rewards/reward_fn/std": 0.7773017883300781, "step": 1984 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1430.0, "completions/max_terminated_length": 1430.0, "completions/mean_length": 395.34375, "completions/mean_terminated_length": 395.34375, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "epoch": 0.21056539726318022, "frac_reward_zero_std": 0.0, "grad_norm": 1.75, "kl": 0.021748070372268558, "learning_rate": 7.2064e-06, "loss": 0.0392, "num_tokens": 91755099.0, "reward": 3.6780052185058594, "reward_std": 0.551239013671875, "rewards/reward_fn/mean": 3.6780052185058594, "rewards/reward_fn/std": 0.551239013671875, "step": 1985 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1114.0, "completions/max_terminated_length": 1114.0, "completions/mean_length": 344.25, "completions/mean_terminated_length": 344.25, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "epoch": 0.21067147554895513, "frac_reward_zero_std": 0.0, "grad_norm": 1.71875, "kl": 0.021804936230182648, "learning_rate": 7.206e-06, "loss": 0.0594, "num_tokens": 91805603.0, "reward": 2.9031484127044678, "reward_std": 0.046194564551115036, "rewards/reward_fn/mean": 2.9031484127044678, "rewards/reward_fn/std": 0.04619458317756653, "step": 1986 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 526.0, "completions/max_terminated_length": 526.0, "completions/mean_length": 183.90625, "completions/mean_terminated_length": 183.90625, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "epoch": 0.21077755383473004, "frac_reward_zero_std": 1.0, "grad_norm": 0.1259765625, "kl": 0.028120714705437422, "learning_rate": 7.205599999999999e-06, "loss": 0.0011, "num_tokens": 91849088.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1987 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 945.0, "completions/max_terminated_length": 945.0, "completions/mean_length": 273.84375, "completions/mean_terminated_length": 273.84375, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.21088363212050493, "frac_reward_zero_std": 0.0, "grad_norm": 2.15625, "kl": 0.02706411969847977, "learning_rate": 7.205199999999999e-06, "loss": 0.0943, "num_tokens": 91893915.0, "reward": 3.962301254272461, "reward_std": 0.21325629949569702, "rewards/reward_fn/mean": 3.962301254272461, "rewards/reward_fn/std": 0.21325626969337463, "step": 1988 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 301.0, "completions/max_terminated_length": 301.0, "completions/mean_length": 201.0, "completions/mean_terminated_length": 201.0, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 0.21098971040627984, "frac_reward_zero_std": 0.0, "grad_norm": 1.7578125, "kl": 0.020172378746792674, "learning_rate": 7.204799999999999e-06, "loss": 0.0691, "num_tokens": 91930811.0, "reward": 2.7958617210388184, "reward_std": 0.03389512747526169, "rewards/reward_fn/mean": 2.7958617210388184, "rewards/reward_fn/std": 0.033895138651132584, "step": 1989 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 258.0, "completions/max_terminated_length": 258.0, "completions/mean_length": 153.4375, "completions/mean_terminated_length": 153.4375, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "epoch": 0.21109578869205473, "frac_reward_zero_std": 0.0, "grad_norm": 3.875, "kl": 0.03439820581115782, "learning_rate": 7.204399999999999e-06, "loss": 0.1005, "num_tokens": 91970377.0, "reward": 3.9778530597686768, "reward_std": 0.12528198957443237, "rewards/reward_fn/mean": 3.9778530597686768, "rewards/reward_fn/std": 0.12528197467327118, "step": 1990 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1437.0, "completions/max_terminated_length": 1437.0, "completions/mean_length": 259.09375, "completions/mean_terminated_length": 259.09375, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.21120186697782964, "frac_reward_zero_std": 0.0, "grad_norm": 2.140625, "kl": 0.022125726332888007, "learning_rate": 7.203999999999999e-06, "loss": -0.1815, "num_tokens": 92008268.0, "reward": 3.5986576080322266, "reward_std": 0.49440956115722656, "rewards/reward_fn/mean": 3.5986576080322266, "rewards/reward_fn/std": 0.4944095313549042, "step": 1991 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 590.0, "completions/max_terminated_length": 590.0, "completions/mean_length": 241.875, "completions/mean_terminated_length": 241.875, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "epoch": 0.21130794526360455, "frac_reward_zero_std": 0.0, "grad_norm": 2.25, "kl": 0.022150580305606127, "learning_rate": 7.203599999999999e-06, "loss": 0.3028, "num_tokens": 92046408.0, "reward": 2.9095382690429688, "reward_std": 0.6566404104232788, "rewards/reward_fn/mean": 2.9095382690429688, "rewards/reward_fn/std": 0.6566404104232788, "step": 1992 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 536.0, "completions/max_terminated_length": 536.0, "completions/mean_length": 261.25, "completions/mean_terminated_length": 261.25, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 0.21141402354937944, "frac_reward_zero_std": 0.0, "grad_norm": 2.140625, "kl": 0.02650432544760406, "learning_rate": 7.2032e-06, "loss": -0.0708, "num_tokens": 92098832.0, "reward": 3.028238296508789, "reward_std": 0.47540026903152466, "rewards/reward_fn/mean": 3.028238296508789, "rewards/reward_fn/std": 0.4754002094268799, "step": 1993 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 684.0, "completions/max_terminated_length": 684.0, "completions/mean_length": 246.09375, "completions/mean_terminated_length": 246.09375, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 0.21152010183515435, "frac_reward_zero_std": 0.0, "grad_norm": 2.0, "kl": 0.025201337644830346, "learning_rate": 7.2028e-06, "loss": 0.1415, "num_tokens": 92144115.0, "reward": 3.697462558746338, "reward_std": 0.6422297954559326, "rewards/reward_fn/mean": 3.697462558746338, "rewards/reward_fn/std": 0.6422297954559326, "step": 1994 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 711.0, "completions/max_terminated_length": 711.0, "completions/mean_length": 216.5, "completions/mean_terminated_length": 216.5, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 0.21162618012092924, "frac_reward_zero_std": 1.0, "grad_norm": 0.09228515625, "kl": 0.022127235773950815, "learning_rate": 7.2024e-06, "loss": 0.0009, "num_tokens": 92197027.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 1995 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1788.0, "completions/max_terminated_length": 1788.0, "completions/mean_length": 335.375, "completions/mean_terminated_length": 335.375, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.21173225840670415, "frac_reward_zero_std": 0.0, "grad_norm": 1.84375, "kl": 0.027125931112095714, "learning_rate": 7.202e-06, "loss": 0.1132, "num_tokens": 92261455.0, "reward": 3.7107300758361816, "reward_std": 0.6581941246986389, "rewards/reward_fn/mean": 3.7107300758361816, "rewards/reward_fn/std": 0.6581941246986389, "step": 1996 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 956.0, "completions/max_terminated_length": 956.0, "completions/mean_length": 324.40625, "completions/mean_terminated_length": 324.40625, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "epoch": 0.21183833669247906, "frac_reward_zero_std": 0.0, "grad_norm": 1.5390625, "kl": 0.018547830171883106, "learning_rate": 7.2016e-06, "loss": -0.0218, "num_tokens": 92303356.0, "reward": 3.929755687713623, "reward_std": 0.2764107584953308, "rewards/reward_fn/mean": 3.929755687713623, "rewards/reward_fn/std": 0.2764107286930084, "step": 1997 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 761.0, "completions/max_terminated_length": 761.0, "completions/mean_length": 219.5625, "completions/mean_terminated_length": 219.5625, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 0.21194441497825395, "frac_reward_zero_std": 0.0, "grad_norm": 1.609375, "kl": 0.016332231694832444, "learning_rate": 7.2012e-06, "loss": 0.0117, "num_tokens": 92341646.0, "reward": 2.746156692504883, "reward_std": 0.042948655784130096, "rewards/reward_fn/mean": 2.746156692504883, "rewards/reward_fn/std": 0.0429486408829689, "step": 1998 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1248.0, "completions/max_terminated_length": 1248.0, "completions/mean_length": 303.78125, "completions/mean_terminated_length": 303.78125, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 0.21205049326402886, "frac_reward_zero_std": 0.0, "grad_norm": 2.0625, "kl": 0.024832285940647125, "learning_rate": 7.2008e-06, "loss": 0.1743, "num_tokens": 92407335.0, "reward": 3.5415878295898438, "reward_std": 0.8036985993385315, "rewards/reward_fn/mean": 3.5415878295898438, "rewards/reward_fn/std": 0.8036985397338867, "step": 1999 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 619.0, "completions/max_terminated_length": 619.0, "completions/mean_length": 152.71875, "completions/mean_terminated_length": 152.71875, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 0.21215657154980375, "frac_reward_zero_std": 0.0, "grad_norm": 2.65625, "kl": 0.023261455935426056, "learning_rate": 7.2004e-06, "loss": 0.0307, "num_tokens": 92451390.0, "reward": 3.0803160667419434, "reward_std": 0.3557732403278351, "rewards/reward_fn/mean": 3.0803160667419434, "rewards/reward_fn/std": 0.3557732403278351, "step": 2000 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 757.0, "completions/max_terminated_length": 757.0, "completions/mean_length": 200.5, "completions/mean_terminated_length": 200.5, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.21226264983557866, "frac_reward_zero_std": 1.0, "grad_norm": 0.0693359375, "kl": 0.01706216251477599, "learning_rate": 7.2e-06, "loss": 0.0007, "num_tokens": 92487726.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 2001 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 178.0, "completions/max_terminated_length": 178.0, "completions/mean_length": 110.90625, "completions/mean_terminated_length": 110.90625, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "epoch": 0.21236872812135357, "frac_reward_zero_std": 0.0, "grad_norm": 3.015625, "kl": 0.04754210542887449, "learning_rate": 7.1996e-06, "loss": 0.1082, "num_tokens": 92525099.0, "reward": 3.045430898666382, "reward_std": 0.03022829256951809, "rewards/reward_fn/mean": 3.045430898666382, "rewards/reward_fn/std": 0.030228327959775925, "step": 2002 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1771.0, "completions/mean_length": 633.5625, "completions/mean_terminated_length": 587.9354858398438, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "epoch": 0.21247480640712846, "frac_reward_zero_std": 0.0, "grad_norm": 1.3671875, "kl": 0.02199251647107303, "learning_rate": 7.1992e-06, "loss": 0.0533, "num_tokens": 92581405.0, "reward": 2.586845874786377, "reward_std": 0.8533264994621277, "rewards/reward_fn/mean": 2.586845874786377, "rewards/reward_fn/std": 0.8533264994621277, "step": 2003 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1235.0, "completions/max_terminated_length": 1235.0, "completions/mean_length": 310.8125, "completions/mean_terminated_length": 310.8125, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 0.21258088469290337, "frac_reward_zero_std": 0.0, "grad_norm": 1.6328125, "kl": 0.02182422927580774, "learning_rate": 7.1988e-06, "loss": -0.0255, "num_tokens": 92610615.0, "reward": 2.3631041049957275, "reward_std": 0.5505119562149048, "rewards/reward_fn/mean": 2.3631041049957275, "rewards/reward_fn/std": 0.5505119562149048, "step": 2004 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 203.15625, "completions/mean_terminated_length": 203.15625, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 0.21268696297867826, "frac_reward_zero_std": 1.0, "grad_norm": 0.064453125, "kl": 0.01677056518383324, "learning_rate": 7.1984e-06, "loss": 0.0007, "num_tokens": 92649212.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 2005 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 868.0, "completions/max_terminated_length": 868.0, "completions/mean_length": 271.21875, "completions/mean_terminated_length": 271.21875, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 0.21279304126445317, "frac_reward_zero_std": 0.0, "grad_norm": 1.6953125, "kl": 0.015512895653955638, "learning_rate": 7.198e-06, "loss": -0.0671, "num_tokens": 92696067.0, "reward": 3.7533555030822754, "reward_std": 0.6800351738929749, "rewards/reward_fn/mean": 3.7533555030822754, "rewards/reward_fn/std": 0.6800351142883301, "step": 2006 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 791.0, "completions/max_terminated_length": 791.0, "completions/mean_length": 485.8125, "completions/mean_terminated_length": 485.8125, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "epoch": 0.21289911955022806, "frac_reward_zero_std": 0.0, "grad_norm": 1.28125, "kl": 0.028621545527130365, "learning_rate": 7.1976e-06, "loss": 0.0727, "num_tokens": 92746141.0, "reward": 2.80375337600708, "reward_std": 0.4395216107368469, "rewards/reward_fn/mean": 2.80375337600708, "rewards/reward_fn/std": 0.43952158093452454, "step": 2007 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 972.0, "completions/max_terminated_length": 972.0, "completions/mean_length": 524.5, "completions/mean_terminated_length": 524.5, "completions/min_length": 245.0, "completions/min_terminated_length": 245.0, "epoch": 0.21300519783600297, "frac_reward_zero_std": 0.0, "grad_norm": 1.21875, "kl": 0.02100277761928737, "learning_rate": 7.1971999999999995e-06, "loss": -0.0492, "num_tokens": 92799437.0, "reward": 2.993116855621338, "reward_std": 0.6083241701126099, "rewards/reward_fn/mean": 2.993116855621338, "rewards/reward_fn/std": 0.6083241701126099, "step": 2008 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1339.0, "completions/max_terminated_length": 1339.0, "completions/mean_length": 332.84375, "completions/mean_terminated_length": 332.84375, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.21311127612177788, "frac_reward_zero_std": 0.0, "grad_norm": 1.9609375, "kl": 0.028305693296715617, "learning_rate": 7.1967999999999994e-06, "loss": 0.0428, "num_tokens": 92822056.0, "reward": 3.8866019248962402, "reward_std": 0.35826677083969116, "rewards/reward_fn/mean": 3.8866019248962402, "rewards/reward_fn/std": 0.35826677083969116, "step": 2009 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1088.0, "completions/max_terminated_length": 1088.0, "completions/mean_length": 354.46875, "completions/mean_terminated_length": 354.46875, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "epoch": 0.21321735440755277, "frac_reward_zero_std": 0.0, "grad_norm": 1.484375, "kl": 0.019336213706992567, "learning_rate": 7.196399999999999e-06, "loss": 0.0278, "num_tokens": 92879767.0, "reward": 3.6398813724517822, "reward_std": 0.5077115297317505, "rewards/reward_fn/mean": 3.6398813724517822, "rewards/reward_fn/std": 0.5077115297317505, "step": 2010 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2033.0, "completions/max_terminated_length": 2033.0, "completions/mean_length": 612.59375, "completions/mean_terminated_length": 612.59375, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "epoch": 0.21332343269332768, "frac_reward_zero_std": 0.0, "grad_norm": 1.1796875, "kl": 0.015353482798673213, "learning_rate": 7.195999999999999e-06, "loss": -0.0099, "num_tokens": 92932458.0, "reward": 3.1320223808288574, "reward_std": 0.7826876640319824, "rewards/reward_fn/mean": 3.1320223808288574, "rewards/reward_fn/std": 0.7826876044273376, "step": 2011 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 215.0, "completions/max_terminated_length": 215.0, "completions/mean_length": 153.5625, "completions/mean_terminated_length": 153.5625, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.21342951097910257, "frac_reward_zero_std": 0.0, "grad_norm": 2.3125, "kl": 0.021226009470410645, "learning_rate": 7.195599999999999e-06, "loss": -0.0176, "num_tokens": 92990620.0, "reward": 3.7923405170440674, "reward_std": 0.6559759974479675, "rewards/reward_fn/mean": 3.7923405170440674, "rewards/reward_fn/std": 0.6559760570526123, "step": 2012 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1211.0, "completions/max_terminated_length": 1211.0, "completions/mean_length": 419.5, "completions/mean_terminated_length": 419.5, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "epoch": 0.21353558926487748, "frac_reward_zero_std": 0.0, "grad_norm": 1.28125, "kl": 0.02267545904032886, "learning_rate": 7.195199999999999e-06, "loss": 0.1904, "num_tokens": 93033100.0, "reward": 3.1862945556640625, "reward_std": 0.47886648774147034, "rewards/reward_fn/mean": 3.1862945556640625, "rewards/reward_fn/std": 0.47886648774147034, "step": 2013 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 230.0, "completions/max_terminated_length": 230.0, "completions/mean_length": 165.3125, "completions/mean_terminated_length": 165.3125, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.2136416675506524, "frac_reward_zero_std": 0.0, "grad_norm": 1.8515625, "kl": 0.018245216575451195, "learning_rate": 7.194799999999999e-06, "loss": -0.0072, "num_tokens": 93074614.0, "reward": 3.9680566787719727, "reward_std": 0.18069864809513092, "rewards/reward_fn/mean": 3.9680566787719727, "rewards/reward_fn/std": 0.18069863319396973, "step": 2014 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 324.0, "completions/max_terminated_length": 324.0, "completions/mean_length": 83.25, "completions/mean_terminated_length": 83.25, "completions/min_length": 54.0, "completions/min_terminated_length": 54.0, "epoch": 0.21374774583642728, "frac_reward_zero_std": 1.0, "grad_norm": 0.11474609375, "kl": 0.01922210631892085, "learning_rate": 7.194399999999999e-06, "loss": 0.0008, "num_tokens": 93115646.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 2015 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 273.0, "completions/max_terminated_length": 273.0, "completions/mean_length": 195.34375, "completions/mean_terminated_length": 195.34375, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.2138538241222022, "frac_reward_zero_std": 1.0, "grad_norm": 0.09423828125, "kl": 0.0225376442540437, "learning_rate": 7.193999999999999e-06, "loss": 0.0009, "num_tokens": 93176233.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 2016 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1295.0, "completions/max_terminated_length": 1295.0, "completions/mean_length": 311.3125, "completions/mean_terminated_length": 311.3125, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 0.21395990240797708, "frac_reward_zero_std": 0.0, "grad_norm": 2.046875, "kl": 0.023441240657120943, "learning_rate": 7.1936e-06, "loss": 0.1198, "num_tokens": 93224051.0, "reward": 3.871459484100342, "reward_std": 0.3460574746131897, "rewards/reward_fn/mean": 3.871459484100342, "rewards/reward_fn/std": 0.3460574448108673, "step": 2017 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2015.0, "completions/max_terminated_length": 2015.0, "completions/mean_length": 281.15625, "completions/mean_terminated_length": 281.15625, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "epoch": 0.214065980693752, "frac_reward_zero_std": 0.0, "grad_norm": 2.015625, "kl": 0.023096083430573344, "learning_rate": 7.1932e-06, "loss": 0.0427, "num_tokens": 93295032.0, "reward": 3.9682388305664062, "reward_std": 0.17966748774051666, "rewards/reward_fn/mean": 3.9682388305664062, "rewards/reward_fn/std": 0.17966745793819427, "step": 2018 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 894.0, "completions/max_terminated_length": 894.0, "completions/mean_length": 342.625, "completions/mean_terminated_length": 342.625, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "epoch": 0.2141720589795269, "frac_reward_zero_std": 1.0, "grad_norm": 0.095703125, "kl": 0.02021972427610308, "learning_rate": 7.1928e-06, "loss": 0.0008, "num_tokens": 93340396.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 2019 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1085.0, "completions/mean_length": 532.6875, "completions/mean_terminated_length": 483.8064270019531, "completions/min_length": 306.0, "completions/min_terminated_length": 306.0, "epoch": 0.2142781372653018, "frac_reward_zero_std": 0.0, "grad_norm": 1.625, "kl": 0.019356919452548027, "learning_rate": 7.1924e-06, "loss": 0.2521, "num_tokens": 93410210.0, "reward": 2.8118836879730225, "reward_std": 0.5149768590927124, "rewards/reward_fn/mean": 2.8118836879730225, "rewards/reward_fn/std": 0.5149767994880676, "step": 2020 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1524.0, "completions/max_terminated_length": 1524.0, "completions/mean_length": 535.75, "completions/mean_terminated_length": 535.75, "completions/min_length": 300.0, "completions/min_terminated_length": 300.0, "epoch": 0.2143842155510767, "frac_reward_zero_std": 0.0, "grad_norm": 1.4765625, "kl": 0.02323057595640421, "learning_rate": 7.192e-06, "loss": -0.1021, "num_tokens": 93456890.0, "reward": 2.6880881786346436, "reward_std": 0.6508941054344177, "rewards/reward_fn/mean": 2.6880881786346436, "rewards/reward_fn/std": 0.650894045829773, "step": 2021 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 346.0, "completions/max_terminated_length": 346.0, "completions/mean_length": 197.0625, "completions/mean_terminated_length": 197.0625, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 0.2144902938368516, "frac_reward_zero_std": 1.0, "grad_norm": 0.109375, "kl": 0.02640698431059718, "learning_rate": 7.1916e-06, "loss": 0.0011, "num_tokens": 93504572.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 2022 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 895.0, "completions/max_terminated_length": 895.0, "completions/mean_length": 274.625, "completions/mean_terminated_length": 274.625, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "epoch": 0.2145963721226265, "frac_reward_zero_std": 0.0, "grad_norm": 2.109375, "kl": 0.023409150540828705, "learning_rate": 7.1912e-06, "loss": 0.05, "num_tokens": 93531216.0, "reward": 3.9796085357666016, "reward_std": 0.11535120010375977, "rewards/reward_fn/mean": 3.9796085357666016, "rewards/reward_fn/std": 0.11535115540027618, "step": 2023 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1389.0, "completions/max_terminated_length": 1389.0, "completions/mean_length": 379.875, "completions/mean_terminated_length": 379.875, "completions/min_length": 243.0, "completions/min_terminated_length": 243.0, "epoch": 0.2147024504084014, "frac_reward_zero_std": 0.0, "grad_norm": 1.6015625, "kl": 0.018924297066405416, "learning_rate": 7.1908e-06, "loss": 0.0261, "num_tokens": 93581708.0, "reward": 3.8608009815216064, "reward_std": 0.5477797389030457, "rewards/reward_fn/mean": 3.8608009815216064, "rewards/reward_fn/std": 0.5477797389030457, "step": 2024 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 798.0, "completions/max_terminated_length": 798.0, "completions/mean_length": 237.0625, "completions/mean_terminated_length": 237.0625, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 0.2148085286941763, "frac_reward_zero_std": 0.0, "grad_norm": 1.78125, "kl": 0.019628084031865, "learning_rate": 7.1904e-06, "loss": 0.0088, "num_tokens": 93624078.0, "reward": 3.351236581802368, "reward_std": 0.6228786110877991, "rewards/reward_fn/mean": 3.351236581802368, "rewards/reward_fn/std": 0.6228786110877991, "step": 2025 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1163.0, "completions/max_terminated_length": 1163.0, "completions/mean_length": 382.375, "completions/mean_terminated_length": 382.375, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "epoch": 0.2149146069799512, "frac_reward_zero_std": 0.0, "grad_norm": 1.375, "kl": 0.022442698711529374, "learning_rate": 7.19e-06, "loss": 0.0868, "num_tokens": 93690490.0, "reward": 3.7622103691101074, "reward_std": 0.5334495902061462, "rewards/reward_fn/mean": 3.7622103691101074, "rewards/reward_fn/std": 0.5334495902061462, "step": 2026 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 908.0, "completions/max_terminated_length": 908.0, "completions/mean_length": 258.0, "completions/mean_terminated_length": 258.0, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 0.2150206852657261, "frac_reward_zero_std": 0.0, "grad_norm": 1.9765625, "kl": 0.027412381023168564, "learning_rate": 7.1896e-06, "loss": -0.0866, "num_tokens": 93729818.0, "reward": 2.7926363945007324, "reward_std": 0.02799339033663273, "rewards/reward_fn/mean": 2.7926363945007324, "rewards/reward_fn/std": 0.027993371710181236, "step": 2027 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 990.0, "completions/max_terminated_length": 990.0, "completions/mean_length": 241.59375, "completions/mean_terminated_length": 241.59375, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 0.215126763551501, "frac_reward_zero_std": 0.0, "grad_norm": 2.078125, "kl": 0.02329367445781827, "learning_rate": 7.189199999999999e-06, "loss": -0.0132, "num_tokens": 93773933.0, "reward": 3.294184923171997, "reward_std": 0.3811061680316925, "rewards/reward_fn/mean": 3.294184923171997, "rewards/reward_fn/std": 0.3811061978340149, "step": 2028 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 788.0, "completions/max_terminated_length": 788.0, "completions/mean_length": 280.8125, "completions/mean_terminated_length": 280.8125, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "epoch": 0.21523284183727592, "frac_reward_zero_std": 0.0, "grad_norm": 2.5, "kl": 0.030496369348838925, "learning_rate": 7.1888e-06, "loss": 0.0776, "num_tokens": 93813511.0, "reward": 3.894092082977295, "reward_std": 0.4451395869255066, "rewards/reward_fn/mean": 3.894092082977295, "rewards/reward_fn/std": 0.4451395571231842, "step": 2029 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 656.0, "completions/max_terminated_length": 656.0, "completions/mean_length": 210.0, "completions/mean_terminated_length": 210.0, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 0.2153389201230508, "frac_reward_zero_std": 1.0, "grad_norm": 0.10888671875, "kl": 0.026559143094345927, "learning_rate": 7.1884e-06, "loss": 0.0011, "num_tokens": 93862471.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 2030 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1131.0, "completions/max_terminated_length": 1131.0, "completions/mean_length": 317.6875, "completions/mean_terminated_length": 317.6875, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.21544499840882572, "frac_reward_zero_std": 0.0, "grad_norm": 1.5078125, "kl": 0.02768648392520845, "learning_rate": 7.188e-06, "loss": -0.0168, "num_tokens": 93903325.0, "reward": 3.881580352783203, "reward_std": 0.4127897024154663, "rewards/reward_fn/mean": 3.881580352783203, "rewards/reward_fn/std": 0.4127897024154663, "step": 2031 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 284.0, "completions/max_terminated_length": 284.0, "completions/mean_length": 175.40625, "completions/mean_terminated_length": 175.40625, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.2155510766946006, "frac_reward_zero_std": 0.0, "grad_norm": 2.09375, "kl": 0.019018501159735024, "learning_rate": 7.1876e-06, "loss": 0.0395, "num_tokens": 93927658.0, "reward": 3.96970534324646, "reward_std": 0.17137275636196136, "rewards/reward_fn/mean": 3.96970534324646, "rewards/reward_fn/std": 0.17137275636196136, "step": 2032 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 559.0, "completions/max_terminated_length": 559.0, "completions/mean_length": 173.28125, "completions/mean_terminated_length": 173.28125, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "epoch": 0.21565715498037552, "frac_reward_zero_std": 0.0, "grad_norm": 1.671875, "kl": 0.026485492940992117, "learning_rate": 7.1871999999999996e-06, "loss": -0.0462, "num_tokens": 93965459.0, "reward": 3.959859848022461, "reward_std": 0.227066308259964, "rewards/reward_fn/mean": 3.959859848022461, "rewards/reward_fn/std": 0.22706632316112518, "step": 2033 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1117.0, "completions/max_terminated_length": 1117.0, "completions/mean_length": 440.5625, "completions/mean_terminated_length": 440.5625, "completions/min_length": 225.0, "completions/min_terminated_length": 225.0, "epoch": 0.2157632332661504, "frac_reward_zero_std": 0.0, "grad_norm": 1.703125, "kl": 0.020580148906446993, "learning_rate": 7.1867999999999995e-06, "loss": 0.0985, "num_tokens": 94001541.0, "reward": 3.9648282527923584, "reward_std": 0.1989613175392151, "rewards/reward_fn/mean": 3.9648282527923584, "rewards/reward_fn/std": 0.1989613175392151, "step": 2034 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 565.0, "completions/max_terminated_length": 565.0, "completions/mean_length": 377.375, "completions/mean_terminated_length": 377.375, "completions/min_length": 277.0, "completions/min_terminated_length": 277.0, "epoch": 0.21586931155192532, "frac_reward_zero_std": 0.0, "grad_norm": 1.296875, "kl": 0.020776059944182634, "learning_rate": 7.1863999999999995e-06, "loss": -0.0073, "num_tokens": 94051185.0, "reward": 3.9320802688598633, "reward_std": 0.3842128813266754, "rewards/reward_fn/mean": 3.9320802688598633, "rewards/reward_fn/std": 0.3842128813266754, "step": 2035 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1297.0, "completions/max_terminated_length": 1297.0, "completions/mean_length": 383.0625, "completions/mean_terminated_length": 383.0625, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.21597538983770023, "frac_reward_zero_std": 0.0, "grad_norm": 1.96875, "kl": 0.03056110069155693, "learning_rate": 7.1859999999999995e-06, "loss": -0.0284, "num_tokens": 94094355.0, "reward": 2.9841325283050537, "reward_std": 0.7235788106918335, "rewards/reward_fn/mean": 2.9841325283050537, "rewards/reward_fn/std": 0.7235787510871887, "step": 2036 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1229.0, "completions/max_terminated_length": 1229.0, "completions/mean_length": 234.84375, "completions/mean_terminated_length": 234.84375, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.21608146812347512, "frac_reward_zero_std": 0.0, "grad_norm": 2.234375, "kl": 0.02781497361138463, "learning_rate": 7.1855999999999994e-06, "loss": 0.2453, "num_tokens": 94125326.0, "reward": 2.7703404426574707, "reward_std": 0.03106667473912239, "rewards/reward_fn/mean": 2.7703404426574707, "rewards/reward_fn/std": 0.03106665052473545, "step": 2037 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 921.0, "completions/max_terminated_length": 921.0, "completions/mean_length": 303.0625, "completions/mean_terminated_length": 303.0625, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.21618754640925003, "frac_reward_zero_std": 0.0, "grad_norm": 1.5078125, "kl": 0.020188409835100174, "learning_rate": 7.185199999999999e-06, "loss": 0.1097, "num_tokens": 94169200.0, "reward": 2.8137176036834717, "reward_std": 0.2147783488035202, "rewards/reward_fn/mean": 2.8137176036834717, "rewards/reward_fn/std": 0.21477839350700378, "step": 2038 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1176.0, "completions/max_terminated_length": 1176.0, "completions/mean_length": 250.71875, "completions/mean_terminated_length": 250.71875, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 0.21629362469502492, "frac_reward_zero_std": 0.0, "grad_norm": 2.65625, "kl": 0.03112406632862985, "learning_rate": 7.184799999999999e-06, "loss": 0.0002, "num_tokens": 94212295.0, "reward": 3.95988130569458, "reward_std": 0.22694644331932068, "rewards/reward_fn/mean": 3.95988130569458, "rewards/reward_fn/std": 0.22694644331932068, "step": 2039 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/max_terminated_length": 374.0, "completions/mean_length": 257.5, "completions/mean_terminated_length": 257.5, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.21639970298079983, "frac_reward_zero_std": 1.0, "grad_norm": 0.0927734375, "kl": 0.02199523849412799, "learning_rate": 7.1844e-06, "loss": 0.0009, "num_tokens": 94274263.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 2040 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1370.0, "completions/max_terminated_length": 1370.0, "completions/mean_length": 451.8125, "completions/mean_terminated_length": 451.8125, "completions/min_length": 250.0, "completions/min_terminated_length": 250.0, "epoch": 0.21650578126657474, "frac_reward_zero_std": 0.0, "grad_norm": 1.4375, "kl": 0.023585932329297066, "learning_rate": 7.184e-06, "loss": 0.1187, "num_tokens": 94341937.0, "reward": 2.8903353214263916, "reward_std": 0.4277820885181427, "rewards/reward_fn/mean": 2.8903353214263916, "rewards/reward_fn/std": 0.4277820587158203, "step": 2041 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1882.0, "completions/mean_length": 1021.90625, "completions/mean_terminated_length": 988.806396484375, "completions/min_length": 457.0, "completions/min_terminated_length": 457.0, "epoch": 0.21661185955234963, "frac_reward_zero_std": 0.0, "grad_norm": 0.84765625, "kl": 0.01376344496384263, "learning_rate": 7.1836e-06, "loss": 0.1077, "num_tokens": 94416718.0, "reward": 2.3924713134765625, "reward_std": 0.48986151814460754, "rewards/reward_fn/mean": 2.3924713134765625, "rewards/reward_fn/std": 0.48986148834228516, "step": 2042 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 83.0, "completions/max_terminated_length": 83.0, "completions/mean_length": 71.8125, "completions/mean_terminated_length": 71.8125, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "epoch": 0.21671793783812454, "frac_reward_zero_std": 1.0, "grad_norm": 0.1484375, "kl": 0.018146761576645076, "learning_rate": 7.1832e-06, "loss": 0.0007, "num_tokens": 94453000.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 2043 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 180.0, "completions/max_terminated_length": 180.0, "completions/mean_length": 103.5625, "completions/mean_terminated_length": 103.5625, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "epoch": 0.21682401612389943, "frac_reward_zero_std": 1.0, "grad_norm": 0.138671875, "kl": 0.024427478667348623, "learning_rate": 7.1828e-06, "loss": 0.001, "num_tokens": 94487130.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 2044 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1475.0, "completions/max_terminated_length": 1475.0, "completions/mean_length": 489.8125, "completions/mean_terminated_length": 489.8125, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "epoch": 0.21693009440967434, "frac_reward_zero_std": 0.0, "grad_norm": 1.6484375, "kl": 0.0271578470710665, "learning_rate": 7.1824e-06, "loss": -0.0102, "num_tokens": 94531700.0, "reward": 3.384610176086426, "reward_std": 0.7715237736701965, "rewards/reward_fn/mean": 3.384610176086426, "rewards/reward_fn/std": 0.7715237140655518, "step": 2045 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1864.0, "completions/max_terminated_length": 1864.0, "completions/mean_length": 382.28125, "completions/mean_terminated_length": 382.28125, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.21703617269544925, "frac_reward_zero_std": 0.0, "grad_norm": 1.703125, "kl": 0.0231085903942585, "learning_rate": 7.182e-06, "loss": 0.1366, "num_tokens": 94579197.0, "reward": 2.9709372520446777, "reward_std": 0.45013728737831116, "rewards/reward_fn/mean": 2.9709372520446777, "rewards/reward_fn/std": 0.45013728737831116, "step": 2046 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 736.0, "completions/max_terminated_length": 736.0, "completions/mean_length": 186.71875, "completions/mean_terminated_length": 186.71875, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 0.21714225098122414, "frac_reward_zero_std": 0.0, "grad_norm": 2.578125, "kl": 0.02708746073767543, "learning_rate": 7.1816e-06, "loss": 0.0161, "num_tokens": 94622996.0, "reward": 3.984982967376709, "reward_std": 0.0849492996931076, "rewards/reward_fn/mean": 3.984982967376709, "rewards/reward_fn/std": 0.08494929224252701, "step": 2047 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1115.0, "completions/max_terminated_length": 1115.0, "completions/mean_length": 405.1875, "completions/mean_terminated_length": 405.1875, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "epoch": 0.21724832926699905, "frac_reward_zero_std": 0.0, "grad_norm": 1.703125, "kl": 0.023749822983518243, "learning_rate": 7.181199999999999e-06, "loss": -0.0417, "num_tokens": 94664762.0, "reward": 3.4993603229522705, "reward_std": 0.578482985496521, "rewards/reward_fn/mean": 3.4993603229522705, "rewards/reward_fn/std": 0.578482985496521, "step": 2048 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1071.0, "completions/max_terminated_length": 1071.0, "completions/mean_length": 237.0625, "completions/mean_terminated_length": 237.0625, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 0.21735440755277394, "frac_reward_zero_std": 0.0, "grad_norm": 1.8984375, "kl": 0.021896290825679898, "learning_rate": 7.180799999999999e-06, "loss": 0.0248, "num_tokens": 94703196.0, "reward": 3.8218743801116943, "reward_std": 0.4205699861049652, "rewards/reward_fn/mean": 3.8218743801116943, "rewards/reward_fn/std": 0.4205699861049652, "step": 2049 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 430.0, "completions/max_terminated_length": 430.0, "completions/mean_length": 195.15625, "completions/mean_terminated_length": 195.15625, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 0.21746048583854885, "frac_reward_zero_std": 1.0, "grad_norm": 0.09521484375, "kl": 0.018337349290959537, "learning_rate": 7.180399999999999e-06, "loss": 0.0007, "num_tokens": 94753089.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 2050 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1337.0, "completions/max_terminated_length": 1337.0, "completions/mean_length": 431.84375, "completions/mean_terminated_length": 431.84375, "completions/min_length": 218.0, "completions/min_terminated_length": 218.0, "epoch": 0.21756656412432376, "frac_reward_zero_std": 0.0, "grad_norm": 1.5390625, "kl": 0.02290130709297955, "learning_rate": 7.179999999999999e-06, "loss": 0.081, "num_tokens": 94804060.0, "reward": 3.5846269130706787, "reward_std": 0.518695592880249, "rewards/reward_fn/mean": 3.5846269130706787, "rewards/reward_fn/std": 0.518695592880249, "step": 2051 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 121.0, "completions/max_terminated_length": 121.0, "completions/mean_length": 85.78125, "completions/mean_terminated_length": 85.78125, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "epoch": 0.21767264241009865, "frac_reward_zero_std": 1.0, "grad_norm": 0.10205078125, "kl": 0.017568445531651378, "learning_rate": 7.1796e-06, "loss": 0.0007, "num_tokens": 94839189.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 2052 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 925.0, "completions/max_terminated_length": 925.0, "completions/mean_length": 312.46875, "completions/mean_terminated_length": 312.46875, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.21777872069587356, "frac_reward_zero_std": 1.0, "grad_norm": 0.059326171875, "kl": 0.01560823933687061, "learning_rate": 7.1792e-06, "loss": 0.0006, "num_tokens": 94891268.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 2053 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 889.0, "completions/max_terminated_length": 889.0, "completions/mean_length": 170.53125, "completions/mean_terminated_length": 170.53125, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "epoch": 0.21788479898164845, "frac_reward_zero_std": 0.0, "grad_norm": 2.46875, "kl": 0.024878250900655985, "learning_rate": 7.1788e-06, "loss": 0.0492, "num_tokens": 94935125.0, "reward": 2.791635513305664, "reward_std": 0.030824407935142517, "rewards/reward_fn/mean": 2.791635513305664, "rewards/reward_fn/std": 0.03082440234720707, "step": 2054 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 592.0, "completions/max_terminated_length": 592.0, "completions/mean_length": 110.625, "completions/mean_terminated_length": 110.625, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "epoch": 0.21799087726742336, "frac_reward_zero_std": 1.0, "grad_norm": 0.2431640625, "kl": 0.0337254130281508, "learning_rate": 7.1784e-06, "loss": 0.0013, "num_tokens": 94962089.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 2055 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 699.0, "completions/max_terminated_length": 699.0, "completions/mean_length": 230.375, "completions/mean_terminated_length": 230.375, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.21809695555319827, "frac_reward_zero_std": 1.0, "grad_norm": 0.06494140625, "kl": 0.016936297761276364, "learning_rate": 7.178e-06, "loss": 0.0007, "num_tokens": 95006325.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 2056 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 867.0, "completions/max_terminated_length": 867.0, "completions/mean_length": 189.65625, "completions/mean_terminated_length": 189.65625, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 0.21820303383897316, "frac_reward_zero_std": 0.0, "grad_norm": 1.2890625, "kl": 0.017722343327477574, "learning_rate": 7.1776e-06, "loss": -0.0566, "num_tokens": 95052170.0, "reward": 3.965939998626709, "reward_std": 0.19267311692237854, "rewards/reward_fn/mean": 3.965939998626709, "rewards/reward_fn/std": 0.19267308712005615, "step": 2057 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1031.0, "completions/max_terminated_length": 1031.0, "completions/mean_length": 290.84375, "completions/mean_terminated_length": 290.84375, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "epoch": 0.21830911212474807, "frac_reward_zero_std": 0.0, "grad_norm": 1.171875, "kl": 0.021622674306854606, "learning_rate": 7.1772e-06, "loss": -0.0281, "num_tokens": 95087685.0, "reward": 3.966918468475342, "reward_std": 0.18713752925395966, "rewards/reward_fn/mean": 3.966918468475342, "rewards/reward_fn/std": 0.18713752925395966, "step": 2058 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 676.0, "completions/max_terminated_length": 676.0, "completions/mean_length": 215.75, "completions/mean_terminated_length": 215.75, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 0.21841519041052296, "frac_reward_zero_std": 0.0, "grad_norm": 1.921875, "kl": 0.028689004946500063, "learning_rate": 7.1768e-06, "loss": -0.0264, "num_tokens": 95133725.0, "reward": 3.931462287902832, "reward_std": 0.27030280232429504, "rewards/reward_fn/mean": 3.931462287902832, "rewards/reward_fn/std": 0.27030277252197266, "step": 2059 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1146.0, "completions/max_terminated_length": 1146.0, "completions/mean_length": 359.40625, "completions/mean_terminated_length": 359.40625, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 0.21852126869629787, "frac_reward_zero_std": 0.0, "grad_norm": 1.5546875, "kl": 0.02633265615440905, "learning_rate": 7.1764e-06, "loss": -0.0755, "num_tokens": 95182442.0, "reward": 2.791104793548584, "reward_std": 0.3357614576816559, "rewards/reward_fn/mean": 2.791104793548584, "rewards/reward_fn/std": 0.3357614576816559, "step": 2060 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 445.0, "completions/max_terminated_length": 445.0, "completions/mean_length": 236.09375, "completions/mean_terminated_length": 236.09375, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 0.21862734698207276, "frac_reward_zero_std": 0.0, "grad_norm": 1.75, "kl": 0.022103465860709548, "learning_rate": 7.1759999999999996e-06, "loss": 0.0616, "num_tokens": 95224621.0, "reward": 3.220750331878662, "reward_std": 0.6474551558494568, "rewards/reward_fn/mean": 3.220750331878662, "rewards/reward_fn/std": 0.6474552154541016, "step": 2061 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1164.0, "completions/mean_length": 473.78125, "completions/mean_terminated_length": 423.0, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.21873342526784767, "frac_reward_zero_std": 0.0, "grad_norm": 1.703125, "kl": 0.02473957440815866, "learning_rate": 7.1755999999999995e-06, "loss": 0.1878, "num_tokens": 95274918.0, "reward": 2.2360482215881348, "reward_std": 0.6540963649749756, "rewards/reward_fn/mean": 2.2360482215881348, "rewards/reward_fn/std": 0.6540964245796204, "step": 2062 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1755.0, "completions/mean_length": 770.78125, "completions/mean_terminated_length": 685.6333618164062, "completions/min_length": 287.0, "completions/min_terminated_length": 287.0, "epoch": 0.21883950355362258, "frac_reward_zero_std": 0.0, "grad_norm": 0.953125, "kl": 0.014348611701279879, "learning_rate": 7.1751999999999995e-06, "loss": 0.0931, "num_tokens": 95336959.0, "reward": 2.4014182090759277, "reward_std": 0.6035107970237732, "rewards/reward_fn/mean": 2.4014182090759277, "rewards/reward_fn/std": 0.6035107970237732, "step": 2063 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 437.0, "completions/max_terminated_length": 437.0, "completions/mean_length": 306.75, "completions/mean_terminated_length": 306.75, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "epoch": 0.21894558183939747, "frac_reward_zero_std": 0.0, "grad_norm": 1.5390625, "kl": 0.02311077411286533, "learning_rate": 7.1748e-06, "loss": -0.0083, "num_tokens": 95382743.0, "reward": 3.9660825729370117, "reward_std": 0.1918664574623108, "rewards/reward_fn/mean": 3.9660825729370117, "rewards/reward_fn/std": 0.1918664425611496, "step": 2064 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 793.0, "completions/max_terminated_length": 793.0, "completions/mean_length": 231.59375, "completions/mean_terminated_length": 231.59375, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "epoch": 0.21905166012517238, "frac_reward_zero_std": 0.0, "grad_norm": 1.8671875, "kl": 0.022033887798897922, "learning_rate": 7.1744e-06, "loss": -0.0387, "num_tokens": 95426218.0, "reward": 3.723904609680176, "reward_std": 0.6886054873466492, "rewards/reward_fn/mean": 3.723904609680176, "rewards/reward_fn/std": 0.6886054277420044, "step": 2065 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 395.0, "completions/max_terminated_length": 395.0, "completions/mean_length": 185.28125, "completions/mean_terminated_length": 185.28125, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 0.21915773841094727, "frac_reward_zero_std": 0.0, "grad_norm": 2.234375, "kl": 0.01918771117925644, "learning_rate": 7.174e-06, "loss": 0.0791, "num_tokens": 95467795.0, "reward": 3.837489128112793, "reward_std": 0.3440697491168976, "rewards/reward_fn/mean": 3.837489128112793, "rewards/reward_fn/std": 0.34406977891921997, "step": 2066 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1059.0, "completions/max_terminated_length": 1059.0, "completions/mean_length": 264.125, "completions/mean_terminated_length": 264.125, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "epoch": 0.21926381669672218, "frac_reward_zero_std": 0.0, "grad_norm": 1.5390625, "kl": 0.0323160660918802, "learning_rate": 7.1736e-06, "loss": 0.1329, "num_tokens": 95510327.0, "reward": 3.0511176586151123, "reward_std": 0.36487215757369995, "rewards/reward_fn/mean": 3.0511176586151123, "rewards/reward_fn/std": 0.36487212777137756, "step": 2067 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1123.0, "completions/mean_length": 561.3125, "completions/mean_terminated_length": 513.3547973632812, "completions/min_length": 259.0, "completions/min_terminated_length": 259.0, "epoch": 0.2193698949824971, "frac_reward_zero_std": 0.0, "grad_norm": 1.546875, "kl": 0.017476799665018916, "learning_rate": 7.173199999999999e-06, "loss": 0.2184, "num_tokens": 95564385.0, "reward": 2.6805789470672607, "reward_std": 0.25595468282699585, "rewards/reward_fn/mean": 2.6805789470672607, "rewards/reward_fn/std": 0.25595468282699585, "step": 2068 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 713.0, "completions/max_terminated_length": 713.0, "completions/mean_length": 273.28125, "completions/mean_terminated_length": 273.28125, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "epoch": 0.21947597326827198, "frac_reward_zero_std": 1.0, "grad_norm": 0.125, "kl": 0.03495682845823467, "learning_rate": 7.172799999999999e-06, "loss": 0.0014, "num_tokens": 95619082.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 2069 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 509.0, "completions/max_terminated_length": 509.0, "completions/mean_length": 180.8125, "completions/mean_terminated_length": 180.8125, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 0.2195820515540469, "frac_reward_zero_std": 0.0, "grad_norm": 1.6015625, "kl": 0.016302260337397456, "learning_rate": 7.172399999999999e-06, "loss": -0.0412, "num_tokens": 95660580.0, "reward": 3.9653358459472656, "reward_std": 0.19608987867832184, "rewards/reward_fn/mean": 3.9653358459472656, "rewards/reward_fn/std": 0.19608986377716064, "step": 2070 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 818.0, "completions/max_terminated_length": 818.0, "completions/mean_length": 289.3125, "completions/mean_terminated_length": 289.3125, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 0.21968812983982178, "frac_reward_zero_std": 1.0, "grad_norm": 0.06982421875, "kl": 0.02007239032536745, "learning_rate": 7.171999999999999e-06, "loss": 0.0008, "num_tokens": 95722414.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 2071 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 861.0, "completions/max_terminated_length": 861.0, "completions/mean_length": 212.625, "completions/mean_terminated_length": 212.625, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 0.2197942081255967, "frac_reward_zero_std": 0.0, "grad_norm": 2.203125, "kl": 0.027552237967029214, "learning_rate": 7.171599999999999e-06, "loss": -0.0198, "num_tokens": 95752738.0, "reward": 2.9689650535583496, "reward_std": 0.0429152250289917, "rewards/reward_fn/mean": 2.9689650535583496, "rewards/reward_fn/std": 0.04291524365544319, "step": 2072 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 167.78125, "completions/mean_terminated_length": 167.78125, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 0.2199002864113716, "frac_reward_zero_std": 1.0, "grad_norm": 0.171875, "kl": 0.022559367353096604, "learning_rate": 7.171199999999999e-06, "loss": 0.0009, "num_tokens": 95815675.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 2073 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 856.0, "completions/max_terminated_length": 856.0, "completions/mean_length": 212.5625, "completions/mean_terminated_length": 212.5625, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 0.2200063646971465, "frac_reward_zero_std": 0.0, "grad_norm": 1.453125, "kl": 0.010360506421420723, "learning_rate": 7.170799999999999e-06, "loss": -0.0001, "num_tokens": 95860493.0, "reward": 3.929622173309326, "reward_std": 0.3981178402900696, "rewards/reward_fn/mean": 3.929622173309326, "rewards/reward_fn/std": 0.3981178402900696, "step": 2074 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 270.0, "completions/max_terminated_length": 270.0, "completions/mean_length": 206.3125, "completions/mean_terminated_length": 206.3125, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "epoch": 0.2201124429829214, "frac_reward_zero_std": 1.0, "grad_norm": 0.0712890625, "kl": 0.01717807969544083, "learning_rate": 7.170399999999999e-06, "loss": 0.0007, "num_tokens": 95918807.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 2075 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1029.0, "completions/max_terminated_length": 1029.0, "completions/mean_length": 389.65625, "completions/mean_terminated_length": 389.65625, "completions/min_length": 247.0, "completions/min_terminated_length": 247.0, "epoch": 0.2202185212686963, "frac_reward_zero_std": 0.0, "grad_norm": 1.5234375, "kl": 0.02313116961158812, "learning_rate": 7.17e-06, "loss": 0.0968, "num_tokens": 95967660.0, "reward": 3.746159553527832, "reward_std": 0.6335774660110474, "rewards/reward_fn/mean": 3.746159553527832, "rewards/reward_fn/std": 0.6335774064064026, "step": 2076 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 528.0, "completions/max_terminated_length": 528.0, "completions/mean_length": 192.6875, "completions/mean_terminated_length": 192.6875, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 0.2203245995544712, "frac_reward_zero_std": 0.0, "grad_norm": 2.09375, "kl": 0.029190136585384607, "learning_rate": 7.1696e-06, "loss": 0.0691, "num_tokens": 96007010.0, "reward": 3.521986484527588, "reward_std": 0.6271064877510071, "rewards/reward_fn/mean": 3.521986484527588, "rewards/reward_fn/std": 0.6271064877510071, "step": 2077 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 749.0, "completions/max_terminated_length": 749.0, "completions/mean_length": 256.9375, "completions/mean_terminated_length": 256.9375, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 0.22043067784024611, "frac_reward_zero_std": 0.0, "grad_norm": 1.765625, "kl": 0.031511508859694004, "learning_rate": 7.1692e-06, "loss": -0.0353, "num_tokens": 96050592.0, "reward": 3.8938238620758057, "reward_std": 0.43997421860694885, "rewards/reward_fn/mean": 3.8938238620758057, "rewards/reward_fn/std": 0.43997427821159363, "step": 2078 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 819.0, "completions/max_terminated_length": 819.0, "completions/mean_length": 457.96875, "completions/mean_terminated_length": 457.96875, "completions/min_length": 248.0, "completions/min_terminated_length": 248.0, "epoch": 0.220536756126021, "frac_reward_zero_std": 0.0, "grad_norm": 1.3203125, "kl": 0.01874492526985705, "learning_rate": 7.1688e-06, "loss": 0.0453, "num_tokens": 96117151.0, "reward": 3.6167984008789062, "reward_std": 0.5801927447319031, "rewards/reward_fn/mean": 3.6167984008789062, "rewards/reward_fn/std": 0.5801927447319031, "step": 2079 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 419.0, "completions/max_terminated_length": 419.0, "completions/mean_length": 271.625, "completions/mean_terminated_length": 271.625, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "epoch": 0.2206428344117959, "frac_reward_zero_std": 0.0, "grad_norm": 1.4140625, "kl": 0.015837123501114547, "learning_rate": 7.1684e-06, "loss": 0.0119, "num_tokens": 96158739.0, "reward": 3.959514617919922, "reward_std": 0.22902005910873413, "rewards/reward_fn/mean": 3.959514617919922, "rewards/reward_fn/std": 0.22902007400989532, "step": 2080 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1087.0, "completions/max_terminated_length": 1087.0, "completions/mean_length": 333.1875, "completions/mean_terminated_length": 333.1875, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 0.2207489126975708, "frac_reward_zero_std": 1.0, "grad_norm": 0.07861328125, "kl": 0.020457167527638376, "learning_rate": 7.168e-06, "loss": 0.0008, "num_tokens": 96208665.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 2081 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 716.0, "completions/max_terminated_length": 716.0, "completions/mean_length": 225.65625, "completions/mean_terminated_length": 225.65625, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.2208549909833457, "frac_reward_zero_std": 0.0, "grad_norm": 2.25, "kl": 0.024195005418732762, "learning_rate": 7.1676e-06, "loss": -0.1019, "num_tokens": 96244942.0, "reward": 3.6277449131011963, "reward_std": 0.4575171172618866, "rewards/reward_fn/mean": 3.6277449131011963, "rewards/reward_fn/std": 0.4575170576572418, "step": 2082 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 755.0, "completions/max_terminated_length": 755.0, "completions/mean_length": 251.9375, "completions/mean_terminated_length": 251.9375, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 0.22096106926912062, "frac_reward_zero_std": 0.0, "grad_norm": 1.8359375, "kl": 0.02725498448126018, "learning_rate": 7.1672e-06, "loss": 0.0831, "num_tokens": 96268908.0, "reward": 3.8278117179870605, "reward_std": 0.5053116083145142, "rewards/reward_fn/mean": 3.8278117179870605, "rewards/reward_fn/std": 0.5053115487098694, "step": 2083 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 278.0, "completions/max_terminated_length": 278.0, "completions/mean_length": 206.4375, "completions/mean_terminated_length": 206.4375, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 0.2210671475548955, "frac_reward_zero_std": 0.0, "grad_norm": 1.6328125, "kl": 0.01674318127334118, "learning_rate": 7.1668e-06, "loss": -0.03, "num_tokens": 96305498.0, "reward": 2.9250006675720215, "reward_std": 0.04684029147028923, "rewards/reward_fn/mean": 2.9250006675720215, "rewards/reward_fn/std": 0.04684024676680565, "step": 2084 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 209.0, "completions/max_terminated_length": 209.0, "completions/mean_length": 122.78125, "completions/mean_terminated_length": 122.78125, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "epoch": 0.22117322584067042, "frac_reward_zero_std": 1.0, "grad_norm": 0.10888671875, "kl": 0.026960970601066947, "learning_rate": 7.1664e-06, "loss": 0.0011, "num_tokens": 96357139.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 2085 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1214.0, "completions/max_terminated_length": 1214.0, "completions/mean_length": 359.6875, "completions/mean_terminated_length": 359.6875, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "epoch": 0.2212793041264453, "frac_reward_zero_std": 0.0, "grad_norm": 1.796875, "kl": 0.021832972299307585, "learning_rate": 7.166e-06, "loss": 0.0165, "num_tokens": 96402345.0, "reward": 2.866878032684326, "reward_std": 0.3685334324836731, "rewards/reward_fn/mean": 2.866878032684326, "rewards/reward_fn/std": 0.3685334324836731, "step": 2086 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 633.0, "completions/max_terminated_length": 633.0, "completions/mean_length": 120.90625, "completions/mean_terminated_length": 120.90625, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 0.22138538241222022, "frac_reward_zero_std": 0.0, "grad_norm": 2.515625, "kl": 0.022863602731376886, "learning_rate": 7.1656000000000005e-06, "loss": 0.1065, "num_tokens": 96439302.0, "reward": 2.8354334831237793, "reward_std": 0.03567254915833473, "rewards/reward_fn/mean": 2.8354334831237793, "rewards/reward_fn/std": 0.035672519356012344, "step": 2087 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 937.0, "completions/max_terminated_length": 937.0, "completions/mean_length": 245.25, "completions/mean_terminated_length": 245.25, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "epoch": 0.2214914606979951, "frac_reward_zero_std": 0.0, "grad_norm": 2.171875, "kl": 0.02509408933110535, "learning_rate": 7.1652e-06, "loss": 0.093, "num_tokens": 96492078.0, "reward": 3.8152360916137695, "reward_std": 0.3649226725101471, "rewards/reward_fn/mean": 3.8152360916137695, "rewards/reward_fn/std": 0.3649226725101471, "step": 2088 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 495.0, "completions/max_terminated_length": 495.0, "completions/mean_length": 195.0, "completions/mean_terminated_length": 195.0, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 0.22159753898377002, "frac_reward_zero_std": 0.0, "grad_norm": 2.078125, "kl": 0.020710675860755146, "learning_rate": 7.1647999999999996e-06, "loss": -0.0253, "num_tokens": 96516622.0, "reward": 3.013934850692749, "reward_std": 0.325973778963089, "rewards/reward_fn/mean": 3.013934850692749, "rewards/reward_fn/std": 0.325973778963089, "step": 2089 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 890.0, "completions/max_terminated_length": 890.0, "completions/mean_length": 322.78125, "completions/mean_terminated_length": 322.78125, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 0.22170361726954493, "frac_reward_zero_std": 0.0, "grad_norm": 2.34375, "kl": 0.01796165155246854, "learning_rate": 7.1643999999999995e-06, "loss": 0.1965, "num_tokens": 96562535.0, "reward": 3.9826741218566895, "reward_std": 0.0980101004242897, "rewards/reward_fn/mean": 3.9826741218566895, "rewards/reward_fn/std": 0.0980100929737091, "step": 2090 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 242.0, "completions/max_terminated_length": 242.0, "completions/mean_length": 142.25, "completions/mean_terminated_length": 142.25, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 0.22180969555531982, "frac_reward_zero_std": 0.0, "grad_norm": 2.609375, "kl": 0.0245153047144413, "learning_rate": 7.1639999999999995e-06, "loss": 0.0613, "num_tokens": 96599823.0, "reward": 3.8469858169555664, "reward_std": 0.41139575839042664, "rewards/reward_fn/mean": 3.8469858169555664, "rewards/reward_fn/std": 0.4113958179950714, "step": 2091 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 271.0, "completions/max_terminated_length": 271.0, "completions/mean_length": 126.0625, "completions/mean_terminated_length": 126.0625, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "epoch": 0.22191577384109473, "frac_reward_zero_std": 0.0, "grad_norm": 2.84375, "kl": 0.030944793485105038, "learning_rate": 7.1635999999999995e-06, "loss": 0.1047, "num_tokens": 96639345.0, "reward": 2.86065936088562, "reward_std": 0.05229390040040016, "rewards/reward_fn/mean": 2.86065936088562, "rewards/reward_fn/std": 0.05229390785098076, "step": 2092 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 740.0, "completions/max_terminated_length": 740.0, "completions/mean_length": 242.9375, "completions/mean_terminated_length": 242.9375, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.22202185212686962, "frac_reward_zero_std": 0.0, "grad_norm": 2.28125, "kl": 0.024946002522483468, "learning_rate": 7.1631999999999995e-06, "loss": 0.0677, "num_tokens": 96661999.0, "reward": 3.1001062393188477, "reward_std": 0.3169058561325073, "rewards/reward_fn/mean": 3.1001062393188477, "rewards/reward_fn/std": 0.31690582633018494, "step": 2093 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 806.0, "completions/max_terminated_length": 806.0, "completions/mean_length": 145.46875, "completions/mean_terminated_length": 145.46875, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "epoch": 0.22212793041264453, "frac_reward_zero_std": 1.0, "grad_norm": 0.1416015625, "kl": 0.03213479835540056, "learning_rate": 7.162799999999999e-06, "loss": 0.0013, "num_tokens": 96683358.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 2094 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 428.0, "completions/max_terminated_length": 428.0, "completions/mean_length": 239.59375, "completions/mean_terminated_length": 239.59375, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.22223400869841944, "frac_reward_zero_std": 0.0, "grad_norm": 1.90625, "kl": 0.014876898960210383, "learning_rate": 7.162399999999999e-06, "loss": 0.0037, "num_tokens": 96730929.0, "reward": 3.8969430923461914, "reward_std": 0.4309411346912384, "rewards/reward_fn/mean": 3.8969430923461914, "rewards/reward_fn/std": 0.430941104888916, "step": 2095 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1064.0, "completions/max_terminated_length": 1064.0, "completions/mean_length": 305.5625, "completions/mean_terminated_length": 305.5625, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "epoch": 0.22234008698419433, "frac_reward_zero_std": 0.0, "grad_norm": 1.59375, "kl": 0.01991373603232205, "learning_rate": 7.161999999999999e-06, "loss": 0.0078, "num_tokens": 96776515.0, "reward": 3.0914957523345947, "reward_std": 0.5341657996177673, "rewards/reward_fn/mean": 3.0914957523345947, "rewards/reward_fn/std": 0.5341657996177673, "step": 2096 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1050.0, "completions/max_terminated_length": 1050.0, "completions/mean_length": 309.96875, "completions/mean_terminated_length": 309.96875, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 0.22244616526996924, "frac_reward_zero_std": 0.0, "grad_norm": 2.21875, "kl": 0.022031590808182955, "learning_rate": 7.161599999999999e-06, "loss": 0.0249, "num_tokens": 96829954.0, "reward": 3.9301846027374268, "reward_std": 0.3949355185031891, "rewards/reward_fn/mean": 3.9301846027374268, "rewards/reward_fn/std": 0.3949355185031891, "step": 2097 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1887.0, "completions/max_terminated_length": 1887.0, "completions/mean_length": 511.59375, "completions/mean_terminated_length": 511.59375, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "epoch": 0.22255224355574413, "frac_reward_zero_std": 0.0, "grad_norm": 1.390625, "kl": 0.023883348098024726, "learning_rate": 7.161199999999999e-06, "loss": 0.0523, "num_tokens": 96860949.0, "reward": 2.677259922027588, "reward_std": 0.08000855147838593, "rewards/reward_fn/mean": 2.677259922027588, "rewards/reward_fn/std": 0.08000854402780533, "step": 2098 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1283.0, "completions/max_terminated_length": 1283.0, "completions/mean_length": 352.71875, "completions/mean_terminated_length": 352.71875, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "epoch": 0.22265832184151904, "frac_reward_zero_std": 0.0, "grad_norm": 1.609375, "kl": 0.023541218601167202, "learning_rate": 7.1608e-06, "loss": -0.0193, "num_tokens": 96929548.0, "reward": 3.930210590362549, "reward_std": 0.27486422657966614, "rewards/reward_fn/mean": 3.930210590362549, "rewards/reward_fn/std": 0.27486422657966614, "step": 2099 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 945.0, "completions/max_terminated_length": 945.0, "completions/mean_length": 254.5625, "completions/mean_terminated_length": 254.5625, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "epoch": 0.22276440012729395, "frac_reward_zero_std": 0.0, "grad_norm": 1.921875, "kl": 0.028475441271439195, "learning_rate": 7.1604e-06, "loss": -0.0486, "num_tokens": 96967934.0, "reward": 3.4316561222076416, "reward_std": 0.6160090565681458, "rewards/reward_fn/mean": 3.4316561222076416, "rewards/reward_fn/std": 0.6160091161727905, "step": 2100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 224.0, "completions/max_terminated_length": 224.0, "completions/mean_length": 158.09375, "completions/mean_terminated_length": 158.09375, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 0.22287047841306884, "frac_reward_zero_std": 1.0, "grad_norm": 0.0869140625, "kl": 0.01883817312773317, "learning_rate": 7.16e-06, "loss": 0.0008, "num_tokens": 97000769.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 2101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 894.0, "completions/max_terminated_length": 894.0, "completions/mean_length": 325.4375, "completions/mean_terminated_length": 325.4375, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "epoch": 0.22297655669884375, "frac_reward_zero_std": 0.0, "grad_norm": 1.9375, "kl": 0.02302585169672966, "learning_rate": 7.1596e-06, "loss": 0.0672, "num_tokens": 97046383.0, "reward": 3.923642158508301, "reward_std": 0.3004699647426605, "rewards/reward_fn/mean": 3.923642158508301, "rewards/reward_fn/std": 0.3004699647426605, "step": 2102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1747.0, "completions/max_terminated_length": 1747.0, "completions/mean_length": 518.75, "completions/mean_terminated_length": 518.75, "completions/min_length": 247.0, "completions/min_terminated_length": 247.0, "epoch": 0.22308263498461864, "frac_reward_zero_std": 0.0, "grad_norm": 1.2109375, "kl": 0.020775122102349997, "learning_rate": 7.1592e-06, "loss": -0.0671, "num_tokens": 97085383.0, "reward": 3.2577528953552246, "reward_std": 0.5909585356712341, "rewards/reward_fn/mean": 3.2577528953552246, "rewards/reward_fn/std": 0.5909585356712341, "step": 2103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 419.0, "completions/max_terminated_length": 419.0, "completions/mean_length": 133.34375, "completions/mean_terminated_length": 133.34375, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "epoch": 0.22318871327039355, "frac_reward_zero_std": 1.0, "grad_norm": 0.11328125, "kl": 0.021738857380114496, "learning_rate": 7.1588e-06, "loss": 0.0009, "num_tokens": 97141842.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 2104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 264.0, "completions/max_terminated_length": 264.0, "completions/mean_length": 170.125, "completions/mean_terminated_length": 170.125, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.22329479155616846, "frac_reward_zero_std": 0.0, "grad_norm": 2.046875, "kl": 0.033207230269908905, "learning_rate": 7.1584e-06, "loss": -0.0047, "num_tokens": 97176086.0, "reward": 2.8362083435058594, "reward_std": 0.036612384021282196, "rewards/reward_fn/mean": 2.8362083435058594, "rewards/reward_fn/std": 0.03661240264773369, "step": 2105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1838.0, "completions/max_terminated_length": 1838.0, "completions/mean_length": 596.125, "completions/mean_terminated_length": 596.125, "completions/min_length": 273.0, "completions/min_terminated_length": 273.0, "epoch": 0.22340086984194335, "frac_reward_zero_std": 0.0, "grad_norm": 1.1875, "kl": 0.018431765493005514, "learning_rate": 7.158e-06, "loss": 0.1009, "num_tokens": 97241338.0, "reward": 2.9607906341552734, "reward_std": 0.35934340953826904, "rewards/reward_fn/mean": 2.9607906341552734, "rewards/reward_fn/std": 0.35934343934059143, "step": 2106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1573.0, "completions/mean_length": 785.53125, "completions/mean_terminated_length": 744.8064575195312, "completions/min_length": 337.0, "completions/min_terminated_length": 337.0, "epoch": 0.22350694812771826, "frac_reward_zero_std": 0.0, "grad_norm": 1.125, "kl": 0.01531498518306762, "learning_rate": 7.1576e-06, "loss": 0.1756, "num_tokens": 97316107.0, "reward": 3.421172618865967, "reward_std": 0.9520739316940308, "rewards/reward_fn/mean": 3.421172618865967, "rewards/reward_fn/std": 0.9520739316940308, "step": 2107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 486.0, "completions/max_terminated_length": 486.0, "completions/mean_length": 183.09375, "completions/mean_terminated_length": 183.09375, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.22361302641349315, "frac_reward_zero_std": 1.0, "grad_norm": 0.08349609375, "kl": 0.019659267854876816, "learning_rate": 7.157199999999999e-06, "loss": 0.0008, "num_tokens": 97373774.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 2108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1686.0, "completions/max_terminated_length": 1686.0, "completions/mean_length": 378.09375, "completions/mean_terminated_length": 378.09375, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 0.22371910469926806, "frac_reward_zero_std": 0.0, "grad_norm": 1.796875, "kl": 0.02354878233745694, "learning_rate": 7.156799999999999e-06, "loss": -0.1435, "num_tokens": 97416305.0, "reward": 3.2668161392211914, "reward_std": 0.6438294053077698, "rewards/reward_fn/mean": 3.2668161392211914, "rewards/reward_fn/std": 0.6438294053077698, "step": 2109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1249.0, "completions/max_terminated_length": 1249.0, "completions/mean_length": 371.375, "completions/mean_terminated_length": 371.375, "completions/min_length": 251.0, "completions/min_terminated_length": 251.0, "epoch": 0.22382518298504298, "frac_reward_zero_std": 0.0, "grad_norm": 0.8359375, "kl": 0.023293010191991925, "learning_rate": 7.156399999999999e-06, "loss": -0.08, "num_tokens": 97471517.0, "reward": 3.9675302505493164, "reward_std": 0.18367597460746765, "rewards/reward_fn/mean": 3.9675302505493164, "rewards/reward_fn/std": 0.18367597460746765, "step": 2110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1040.0, "completions/max_terminated_length": 1040.0, "completions/mean_length": 250.875, "completions/mean_terminated_length": 250.875, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 0.22393126127081786, "frac_reward_zero_std": 0.0, "grad_norm": 2.0625, "kl": 0.0269605559296906, "learning_rate": 7.156e-06, "loss": 0.0463, "num_tokens": 97513113.0, "reward": 3.773740291595459, "reward_std": 0.4349633753299713, "rewards/reward_fn/mean": 3.773740291595459, "rewards/reward_fn/std": 0.4349633753299713, "step": 2111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1635.0, "completions/max_terminated_length": 1635.0, "completions/mean_length": 382.4375, "completions/mean_terminated_length": 382.4375, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 0.22403733955659277, "frac_reward_zero_std": 0.0, "grad_norm": 1.546875, "kl": 0.030054514296352863, "learning_rate": 7.1556e-06, "loss": 0.092, "num_tokens": 97554151.0, "reward": 3.541625499725342, "reward_std": 0.634077250957489, "rewards/reward_fn/mean": 3.541625499725342, "rewards/reward_fn/std": 0.634077250957489, "step": 2112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1189.0, "completions/mean_length": 433.3125, "completions/mean_terminated_length": 381.2257995605469, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "epoch": 0.22414341784236766, "frac_reward_zero_std": 0.0, "grad_norm": 1.5390625, "kl": 0.019980568438768387, "learning_rate": 7.1552e-06, "loss": 0.2043, "num_tokens": 97603313.0, "reward": 2.8600192070007324, "reward_std": 0.5243455171585083, "rewards/reward_fn/mean": 2.8600192070007324, "rewards/reward_fn/std": 0.5243453979492188, "step": 2113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1545.0, "completions/max_terminated_length": 1545.0, "completions/mean_length": 422.5, "completions/mean_terminated_length": 422.5, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "epoch": 0.22424949612814257, "frac_reward_zero_std": 0.0, "grad_norm": 1.53125, "kl": 0.023464223137125373, "learning_rate": 7.1548e-06, "loss": -0.0462, "num_tokens": 97652833.0, "reward": 2.905588150024414, "reward_std": 0.43429967761039734, "rewards/reward_fn/mean": 2.905588150024414, "rewards/reward_fn/std": 0.43429967761039734, "step": 2114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 628.0, "completions/max_terminated_length": 628.0, "completions/mean_length": 110.65625, "completions/mean_terminated_length": 110.65625, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "epoch": 0.22435557441391746, "frac_reward_zero_std": 1.0, "grad_norm": 0.07421875, "kl": 0.016824107500724494, "learning_rate": 7.1544e-06, "loss": 0.0007, "num_tokens": 97702038.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 2115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1145.0, "completions/max_terminated_length": 1145.0, "completions/mean_length": 490.8125, "completions/mean_terminated_length": 490.8125, "completions/min_length": 301.0, "completions/min_terminated_length": 301.0, "epoch": 0.22446165269969237, "frac_reward_zero_std": 0.0, "grad_norm": 1.203125, "kl": 0.019860751694068313, "learning_rate": 7.154e-06, "loss": 0.1159, "num_tokens": 97761872.0, "reward": 2.950417995452881, "reward_std": 0.4983111023902893, "rewards/reward_fn/mean": 2.950417995452881, "rewards/reward_fn/std": 0.4983111023902893, "step": 2116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1101.0, "completions/max_terminated_length": 1101.0, "completions/mean_length": 337.09375, "completions/mean_terminated_length": 337.09375, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "epoch": 0.22456773098546728, "frac_reward_zero_std": 0.0, "grad_norm": 1.7578125, "kl": 0.023477735929191113, "learning_rate": 7.1535999999999996e-06, "loss": -0.0293, "num_tokens": 97814899.0, "reward": 2.9934606552124023, "reward_std": 0.7055239677429199, "rewards/reward_fn/mean": 2.9934606552124023, "rewards/reward_fn/std": 0.7055239081382751, "step": 2117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 338.0, "completions/max_terminated_length": 338.0, "completions/mean_length": 79.625, "completions/mean_terminated_length": 79.625, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "epoch": 0.22467380927124217, "frac_reward_zero_std": 1.0, "grad_norm": 0.1083984375, "kl": 0.016976277576759458, "learning_rate": 7.1531999999999995e-06, "loss": 0.0007, "num_tokens": 97852359.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 2118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 702.0, "completions/max_terminated_length": 702.0, "completions/mean_length": 207.28125, "completions/mean_terminated_length": 207.28125, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.22477988755701708, "frac_reward_zero_std": 1.0, "grad_norm": 0.10400390625, "kl": 0.024817738914862275, "learning_rate": 7.1527999999999995e-06, "loss": 0.001, "num_tokens": 97893552.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 2119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 814.0, "completions/max_terminated_length": 814.0, "completions/mean_length": 339.375, "completions/mean_terminated_length": 339.375, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "epoch": 0.22488596584279197, "frac_reward_zero_std": 0.0, "grad_norm": 1.484375, "kl": 0.02061704732477665, "learning_rate": 7.1523999999999995e-06, "loss": 0.0577, "num_tokens": 97941532.0, "reward": 3.6477723121643066, "reward_std": 0.603849470615387, "rewards/reward_fn/mean": 3.6477723121643066, "rewards/reward_fn/std": 0.603849470615387, "step": 2120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1074.0, "completions/max_terminated_length": 1074.0, "completions/mean_length": 389.875, "completions/mean_terminated_length": 389.875, "completions/min_length": 271.0, "completions/min_terminated_length": 271.0, "epoch": 0.22499204412856688, "frac_reward_zero_std": 0.0, "grad_norm": 1.4296875, "kl": 0.016808875952847302, "learning_rate": 7.1519999999999995e-06, "loss": 0.0569, "num_tokens": 97989560.0, "reward": 3.6947927474975586, "reward_std": 0.5374994874000549, "rewards/reward_fn/mean": 3.6947927474975586, "rewards/reward_fn/std": 0.5374994277954102, "step": 2121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1265.0, "completions/max_terminated_length": 1265.0, "completions/mean_length": 301.03125, "completions/mean_terminated_length": 301.03125, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 0.2250981224143418, "frac_reward_zero_std": 0.0, "grad_norm": 1.6484375, "kl": 0.017114053131081164, "learning_rate": 7.1516e-06, "loss": -0.0851, "num_tokens": 98029145.0, "reward": 3.0186710357666016, "reward_std": 0.19123917818069458, "rewards/reward_fn/mean": 3.0186710357666016, "rewards/reward_fn/std": 0.1912391483783722, "step": 2122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1113.0, "completions/max_terminated_length": 1113.0, "completions/mean_length": 243.0, "completions/mean_terminated_length": 243.0, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 0.22520420070011668, "frac_reward_zero_std": 0.0, "grad_norm": 2.6875, "kl": 0.02709133573807776, "learning_rate": 7.1512e-06, "loss": 0.06, "num_tokens": 98067161.0, "reward": 3.966747760772705, "reward_std": 0.18810324370861053, "rewards/reward_fn/mean": 3.966747760772705, "rewards/reward_fn/std": 0.18810328841209412, "step": 2123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1021.0, "completions/max_terminated_length": 1021.0, "completions/mean_length": 588.46875, "completions/mean_terminated_length": 588.46875, "completions/min_length": 336.0, "completions/min_terminated_length": 336.0, "epoch": 0.2253102789858916, "frac_reward_zero_std": 0.0, "grad_norm": 1.4140625, "kl": 0.02302650804631412, "learning_rate": 7.1508e-06, "loss": 0.1045, "num_tokens": 98148168.0, "reward": 2.512251853942871, "reward_std": 0.5429190397262573, "rewards/reward_fn/mean": 2.512251853942871, "rewards/reward_fn/std": 0.5429189801216125, "step": 2124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 598.0, "completions/max_terminated_length": 598.0, "completions/mean_length": 286.03125, "completions/mean_terminated_length": 286.03125, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "epoch": 0.22541635727166648, "frac_reward_zero_std": 0.0, "grad_norm": 2.125, "kl": 0.024617045186460018, "learning_rate": 7.1504e-06, "loss": 0.1288, "num_tokens": 98196681.0, "reward": 3.893162727355957, "reward_std": 0.3376566171646118, "rewards/reward_fn/mean": 3.893162727355957, "rewards/reward_fn/std": 0.3376566171646118, "step": 2125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 486.0, "completions/max_terminated_length": 486.0, "completions/mean_length": 219.75, "completions/mean_terminated_length": 219.75, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 0.2255224355574414, "frac_reward_zero_std": 0.0, "grad_norm": 1.828125, "kl": 0.0329362649936229, "learning_rate": 7.15e-06, "loss": 0.0216, "num_tokens": 98247457.0, "reward": 2.987175703048706, "reward_std": 0.47789204120635986, "rewards/reward_fn/mean": 2.987175703048706, "rewards/reward_fn/std": 0.47789207100868225, "step": 2126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 697.0, "completions/max_terminated_length": 697.0, "completions/mean_length": 213.40625, "completions/mean_terminated_length": 213.40625, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.2256285138432163, "frac_reward_zero_std": 1.0, "grad_norm": 0.057373046875, "kl": 0.011680109717417508, "learning_rate": 7.1496e-06, "loss": 0.0005, "num_tokens": 98285070.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 2127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 613.0, "completions/max_terminated_length": 613.0, "completions/mean_length": 163.03125, "completions/mean_terminated_length": 163.03125, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.2257345921289912, "frac_reward_zero_std": 1.0, "grad_norm": 0.078125, "kl": 0.015980440541170537, "learning_rate": 7.1492e-06, "loss": 0.0006, "num_tokens": 98311599.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 2128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1539.0, "completions/max_terminated_length": 1539.0, "completions/mean_length": 355.4375, "completions/mean_terminated_length": 355.4375, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "epoch": 0.2258406704147661, "frac_reward_zero_std": 0.0, "grad_norm": 1.8515625, "kl": 0.023469509556889534, "learning_rate": 7.148799999999999e-06, "loss": -0.051, "num_tokens": 98361501.0, "reward": 3.1205215454101562, "reward_std": 0.9047970175743103, "rewards/reward_fn/mean": 3.1205215454101562, "rewards/reward_fn/std": 0.9047970771789551, "step": 2129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1518.0, "completions/max_terminated_length": 1518.0, "completions/mean_length": 397.71875, "completions/mean_terminated_length": 397.71875, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "epoch": 0.225946748700541, "frac_reward_zero_std": 0.0, "grad_norm": 1.4140625, "kl": 0.019150591921061277, "learning_rate": 7.148399999999999e-06, "loss": -0.0415, "num_tokens": 98412372.0, "reward": 2.7703003883361816, "reward_std": 0.2596660852432251, "rewards/reward_fn/mean": 2.7703003883361816, "rewards/reward_fn/std": 0.2596660554409027, "step": 2130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1542.0, "completions/max_terminated_length": 1542.0, "completions/mean_length": 377.90625, "completions/mean_terminated_length": 377.90625, "completions/min_length": 225.0, "completions/min_terminated_length": 225.0, "epoch": 0.2260528269863159, "frac_reward_zero_std": 0.0, "grad_norm": 1.453125, "kl": 0.022556321462616324, "learning_rate": 7.147999999999999e-06, "loss": 0.0785, "num_tokens": 98456241.0, "reward": 3.0584633350372314, "reward_std": 0.5075932145118713, "rewards/reward_fn/mean": 3.0584633350372314, "rewards/reward_fn/std": 0.5075931549072266, "step": 2131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 722.0, "completions/max_terminated_length": 722.0, "completions/mean_length": 190.125, "completions/mean_terminated_length": 190.125, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 0.22615890527209082, "frac_reward_zero_std": 0.0, "grad_norm": 3.390625, "kl": 0.01705359760671854, "learning_rate": 7.147599999999999e-06, "loss": 0.1622, "num_tokens": 98508533.0, "reward": 3.8904531002044678, "reward_std": 0.34638121724128723, "rewards/reward_fn/mean": 3.8904531002044678, "rewards/reward_fn/std": 0.34638121724128723, "step": 2132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 519.0, "completions/max_terminated_length": 519.0, "completions/mean_length": 332.03125, "completions/mean_terminated_length": 332.03125, "completions/min_length": 244.0, "completions/min_terminated_length": 244.0, "epoch": 0.2262649835578657, "frac_reward_zero_std": 0.0, "grad_norm": 1.7734375, "kl": 0.023051244905218482, "learning_rate": 7.147199999999999e-06, "loss": 0.0897, "num_tokens": 98553238.0, "reward": 3.1788926124572754, "reward_std": 0.6480764746665955, "rewards/reward_fn/mean": 3.1788926124572754, "rewards/reward_fn/std": 0.6480764746665955, "step": 2133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 586.0, "completions/max_terminated_length": 586.0, "completions/mean_length": 230.8125, "completions/mean_terminated_length": 230.8125, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 0.22637106184364061, "frac_reward_zero_std": 0.0, "grad_norm": 1.828125, "kl": 0.01907302311155945, "learning_rate": 7.1468e-06, "loss": -0.031, "num_tokens": 98602416.0, "reward": 3.8189072608947754, "reward_std": 0.5166525840759277, "rewards/reward_fn/mean": 3.8189072608947754, "rewards/reward_fn/std": 0.516652524471283, "step": 2134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2044.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 471.53125, "completions/mean_terminated_length": 471.53125, "completions/min_length": 238.0, "completions/min_terminated_length": 238.0, "epoch": 0.2264771401294155, "frac_reward_zero_std": 0.0, "grad_norm": 1.3359375, "kl": 0.016809441964142025, "learning_rate": 7.1464e-06, "loss": -0.1404, "num_tokens": 98653633.0, "reward": 3.647747039794922, "reward_std": 0.5732264518737793, "rewards/reward_fn/mean": 3.647747039794922, "rewards/reward_fn/std": 0.5732264518737793, "step": 2135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1298.0, "completions/max_terminated_length": 1298.0, "completions/mean_length": 260.28125, "completions/mean_terminated_length": 260.28125, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 0.2265832184151904, "frac_reward_zero_std": 0.0, "grad_norm": 1.828125, "kl": 0.026241691317409277, "learning_rate": 7.146e-06, "loss": 0.0298, "num_tokens": 98678858.0, "reward": 3.641641616821289, "reward_std": 0.47242069244384766, "rewards/reward_fn/mean": 3.641641616821289, "rewards/reward_fn/std": 0.47242069244384766, "step": 2136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1598.0, "completions/mean_length": 695.40625, "completions/mean_terminated_length": 605.2333374023438, "completions/min_length": 393.0, "completions/min_terminated_length": 393.0, "epoch": 0.22668929670096533, "frac_reward_zero_std": 0.0, "grad_norm": 1.2734375, "kl": 0.018082533963024616, "learning_rate": 7.1456e-06, "loss": 0.1919, "num_tokens": 98741495.0, "reward": 2.9328463077545166, "reward_std": 0.5077850222587585, "rewards/reward_fn/mean": 2.9328463077545166, "rewards/reward_fn/std": 0.5077849626541138, "step": 2137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1250.0, "completions/max_terminated_length": 1250.0, "completions/mean_length": 345.21875, "completions/mean_terminated_length": 345.21875, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "epoch": 0.2267953749867402, "frac_reward_zero_std": 0.0, "grad_norm": 1.453125, "kl": 0.020768756279721856, "learning_rate": 7.1452e-06, "loss": 0.1444, "num_tokens": 98782814.0, "reward": 2.7692275047302246, "reward_std": 0.04734927415847778, "rewards/reward_fn/mean": 2.7692275047302246, "rewards/reward_fn/std": 0.047349270433187485, "step": 2138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 807.0, "completions/max_terminated_length": 807.0, "completions/mean_length": 259.0, "completions/mean_terminated_length": 259.0, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 0.22690145327251512, "frac_reward_zero_std": 0.0, "grad_norm": 1.7734375, "kl": 0.025864448165521026, "learning_rate": 7.1448e-06, "loss": 0.1333, "num_tokens": 98819966.0, "reward": 2.8633437156677246, "reward_std": 0.04574419930577278, "rewards/reward_fn/mean": 2.8633437156677246, "rewards/reward_fn/std": 0.04574418067932129, "step": 2139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 208.0, "completions/max_terminated_length": 208.0, "completions/mean_length": 105.78125, "completions/mean_terminated_length": 105.78125, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "epoch": 0.22700753155829, "frac_reward_zero_std": 1.0, "grad_norm": 0.09912109375, "kl": 0.018404418835416436, "learning_rate": 7.1444e-06, "loss": 0.0007, "num_tokens": 98846903.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 2140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 434.0, "completions/max_terminated_length": 434.0, "completions/mean_length": 285.21875, "completions/mean_terminated_length": 285.21875, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "epoch": 0.22711360984406492, "frac_reward_zero_std": 0.0, "grad_norm": 1.828125, "kl": 0.017287316382862628, "learning_rate": 7.144e-06, "loss": -0.0023, "num_tokens": 98902142.0, "reward": 2.7961578369140625, "reward_std": 0.4452827572822571, "rewards/reward_fn/mean": 2.7961578369140625, "rewards/reward_fn/std": 0.4452826976776123, "step": 2141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 953.0, "completions/max_terminated_length": 953.0, "completions/mean_length": 233.21875, "completions/mean_terminated_length": 233.21875, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "epoch": 0.2272196881298398, "frac_reward_zero_std": 0.0, "grad_norm": 2.515625, "kl": 0.025570286670699716, "learning_rate": 7.1436e-06, "loss": 0.2669, "num_tokens": 98946053.0, "reward": 3.322685956954956, "reward_std": 0.10418742150068283, "rewards/reward_fn/mean": 3.322685956954956, "rewards/reward_fn/std": 0.10418742895126343, "step": 2142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1006.0, "completions/max_terminated_length": 1006.0, "completions/mean_length": 340.65625, "completions/mean_terminated_length": 340.65625, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 0.22732576641561472, "frac_reward_zero_std": 1.0, "grad_norm": 0.06591796875, "kl": 0.01869728649035096, "learning_rate": 7.1432e-06, "loss": 0.0007, "num_tokens": 98999642.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 2143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1734.0, "completions/max_terminated_length": 1734.0, "completions/mean_length": 447.5, "completions/mean_terminated_length": 447.5, "completions/min_length": 305.0, "completions/min_terminated_length": 305.0, "epoch": 0.22743184470138963, "frac_reward_zero_std": 0.0, "grad_norm": 1.4296875, "kl": 0.020166749833151698, "learning_rate": 7.1428e-06, "loss": 0.0329, "num_tokens": 99053162.0, "reward": 2.868180274963379, "reward_std": 0.04937407374382019, "rewards/reward_fn/mean": 2.868180274963379, "rewards/reward_fn/std": 0.049374066293239594, "step": 2144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 739.0, "completions/max_terminated_length": 739.0, "completions/mean_length": 207.625, "completions/mean_terminated_length": 207.625, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 0.22753792298716452, "frac_reward_zero_std": 1.0, "grad_norm": 0.07080078125, "kl": 0.016309529077261686, "learning_rate": 7.1424e-06, "loss": 0.0007, "num_tokens": 99096958.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 2145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 945.0, "completions/max_terminated_length": 945.0, "completions/mean_length": 257.5625, "completions/mean_terminated_length": 257.5625, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 0.22764400127293943, "frac_reward_zero_std": 0.0, "grad_norm": 1.796875, "kl": 0.02131958887912333, "learning_rate": 7.142e-06, "loss": -0.0234, "num_tokens": 99143504.0, "reward": 3.061922788619995, "reward_std": 0.41329920291900635, "rewards/reward_fn/mean": 3.061922788619995, "rewards/reward_fn/std": 0.41329917311668396, "step": 2146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 800.0, "completions/max_terminated_length": 800.0, "completions/mean_length": 179.21875, "completions/mean_terminated_length": 179.21875, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "epoch": 0.22775007955871432, "frac_reward_zero_std": 0.0, "grad_norm": 2.421875, "kl": 0.03266157838515937, "learning_rate": 7.1416e-06, "loss": 0.1967, "num_tokens": 99180919.0, "reward": 2.9364829063415527, "reward_std": 0.05752362683415413, "rewards/reward_fn/mean": 2.9364829063415527, "rewards/reward_fn/std": 0.05752362310886383, "step": 2147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 443.0, "completions/max_terminated_length": 443.0, "completions/mean_length": 163.40625, "completions/mean_terminated_length": 163.40625, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "epoch": 0.22785615784448923, "frac_reward_zero_std": 0.0, "grad_norm": 2.734375, "kl": 0.02107630728278309, "learning_rate": 7.1412e-06, "loss": 0.0386, "num_tokens": 99222948.0, "reward": 3.933260440826416, "reward_std": 0.26269760727882385, "rewards/reward_fn/mean": 3.933260440826416, "rewards/reward_fn/std": 0.26269757747650146, "step": 2148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1169.0, "completions/max_terminated_length": 1169.0, "completions/mean_length": 490.1875, "completions/mean_terminated_length": 490.1875, "completions/min_length": 239.0, "completions/min_terminated_length": 239.0, "epoch": 0.22796223613026415, "frac_reward_zero_std": 0.0, "grad_norm": 1.3125, "kl": 0.020313676446676254, "learning_rate": 7.1407999999999995e-06, "loss": 0.0864, "num_tokens": 99298218.0, "reward": 2.649317979812622, "reward_std": 0.5141263604164124, "rewards/reward_fn/mean": 2.649317979812622, "rewards/reward_fn/std": 0.5141263604164124, "step": 2149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1038.0, "completions/max_terminated_length": 1038.0, "completions/mean_length": 312.34375, "completions/mean_terminated_length": 312.34375, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 0.22806831441603903, "frac_reward_zero_std": 0.0, "grad_norm": 1.75, "kl": 0.02623810595832765, "learning_rate": 7.1403999999999994e-06, "loss": -0.079, "num_tokens": 99346709.0, "reward": 3.2548115253448486, "reward_std": 0.5868596434593201, "rewards/reward_fn/mean": 3.2548115253448486, "rewards/reward_fn/std": 0.5868596434593201, "step": 2150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1001.0, "completions/max_terminated_length": 1001.0, "completions/mean_length": 285.4375, "completions/mean_terminated_length": 285.4375, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 0.22817439270181394, "frac_reward_zero_std": 0.0, "grad_norm": 1.8984375, "kl": 0.027033630991354585, "learning_rate": 7.139999999999999e-06, "loss": 0.1689, "num_tokens": 99391395.0, "reward": 3.732773780822754, "reward_std": 0.5501604676246643, "rewards/reward_fn/mean": 3.732773780822754, "rewards/reward_fn/std": 0.5501604676246643, "step": 2151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 883.0, "completions/max_terminated_length": 883.0, "completions/mean_length": 210.59375, "completions/mean_terminated_length": 210.59375, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 0.22828047098758883, "frac_reward_zero_std": 0.0, "grad_norm": 2.015625, "kl": 0.01814991922583431, "learning_rate": 7.139599999999999e-06, "loss": -0.0006, "num_tokens": 99410710.0, "reward": 2.9454665184020996, "reward_std": 0.04778565838932991, "rewards/reward_fn/mean": 2.9454665184020996, "rewards/reward_fn/std": 0.04778566583991051, "step": 2152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 987.0, "completions/max_terminated_length": 987.0, "completions/mean_length": 308.875, "completions/mean_terminated_length": 308.875, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "epoch": 0.22838654927336374, "frac_reward_zero_std": 1.0, "grad_norm": 0.0947265625, "kl": 0.024135290645062923, "learning_rate": 7.139199999999999e-06, "loss": 0.001, "num_tokens": 99458034.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 2153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1181.0, "completions/max_terminated_length": 1181.0, "completions/mean_length": 292.5625, "completions/mean_terminated_length": 292.5625, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 0.22849262755913866, "frac_reward_zero_std": 0.0, "grad_norm": 1.8125, "kl": 0.03052612068131566, "learning_rate": 7.138799999999999e-06, "loss": -0.0475, "num_tokens": 99498948.0, "reward": 3.8826375007629395, "reward_std": 0.3714655637741089, "rewards/reward_fn/mean": 3.8826375007629395, "rewards/reward_fn/std": 0.3714655637741089, "step": 2154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1912.0, "completions/max_terminated_length": 1912.0, "completions/mean_length": 348.375, "completions/mean_terminated_length": 348.375, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 0.22859870584491354, "frac_reward_zero_std": 0.0, "grad_norm": 1.9375, "kl": 0.02576355915516615, "learning_rate": 7.138399999999999e-06, "loss": -0.0272, "num_tokens": 99553680.0, "reward": 3.7404799461364746, "reward_std": 0.5951921939849854, "rewards/reward_fn/mean": 3.7404799461364746, "rewards/reward_fn/std": 0.5951921343803406, "step": 2155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 833.0, "completions/max_terminated_length": 833.0, "completions/mean_length": 297.3125, "completions/mean_terminated_length": 297.3125, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 0.22870478413068845, "frac_reward_zero_std": 0.0, "grad_norm": 1.5234375, "kl": 0.02024422027170658, "learning_rate": 7.137999999999999e-06, "loss": 0.0165, "num_tokens": 99598522.0, "reward": 2.861987352371216, "reward_std": 0.38427045941352844, "rewards/reward_fn/mean": 2.861987352371216, "rewards/reward_fn/std": 0.38427045941352844, "step": 2156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 503.0, "completions/max_terminated_length": 503.0, "completions/mean_length": 166.09375, "completions/mean_terminated_length": 166.09375, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "epoch": 0.22881086241646334, "frac_reward_zero_std": 0.0, "grad_norm": 2.421875, "kl": 0.02221654006280005, "learning_rate": 7.137599999999999e-06, "loss": -0.0167, "num_tokens": 99634173.0, "reward": 3.8790369033813477, "reward_std": 0.28946876525878906, "rewards/reward_fn/mean": 3.8790369033813477, "rewards/reward_fn/std": 0.28946876525878906, "step": 2157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1931.0, "completions/max_terminated_length": 1931.0, "completions/mean_length": 596.8125, "completions/mean_terminated_length": 596.8125, "completions/min_length": 342.0, "completions/min_terminated_length": 342.0, "epoch": 0.22891694070223825, "frac_reward_zero_std": 0.0, "grad_norm": 1.25, "kl": 0.022596941329538822, "learning_rate": 7.1372e-06, "loss": 0.0232, "num_tokens": 99696119.0, "reward": 2.7338013648986816, "reward_std": 0.1886308789253235, "rewards/reward_fn/mean": 2.7338013648986816, "rewards/reward_fn/std": 0.1886308640241623, "step": 2158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1288.0, "completions/max_terminated_length": 1288.0, "completions/mean_length": 437.3125, "completions/mean_terminated_length": 437.3125, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.22902301898801317, "frac_reward_zero_std": 0.0, "grad_norm": 1.8671875, "kl": 0.028475699247792363, "learning_rate": 7.1368e-06, "loss": -0.198, "num_tokens": 99739841.0, "reward": 3.0553183555603027, "reward_std": 0.39695262908935547, "rewards/reward_fn/mean": 3.0553183555603027, "rewards/reward_fn/std": 0.3969525992870331, "step": 2159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 152.0, "completions/max_terminated_length": 152.0, "completions/mean_length": 102.15625, "completions/mean_terminated_length": 102.15625, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "epoch": 0.22912909727378805, "frac_reward_zero_std": 1.0, "grad_norm": 0.08642578125, "kl": 0.013394350535236299, "learning_rate": 7.1364e-06, "loss": 0.0005, "num_tokens": 99773542.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 2160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 942.0, "completions/max_terminated_length": 942.0, "completions/mean_length": 370.6875, "completions/mean_terminated_length": 370.6875, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "epoch": 0.22923517555956296, "frac_reward_zero_std": 0.0, "grad_norm": 1.3203125, "kl": 0.01727902633138001, "learning_rate": 7.136e-06, "loss": 0.1644, "num_tokens": 99804604.0, "reward": 2.8176543712615967, "reward_std": 0.04095921292901039, "rewards/reward_fn/mean": 2.8176543712615967, "rewards/reward_fn/std": 0.04095920920372009, "step": 2161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 426.0, "completions/max_terminated_length": 426.0, "completions/mean_length": 149.78125, "completions/mean_terminated_length": 149.78125, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 0.22934125384533785, "frac_reward_zero_std": 1.0, "grad_norm": 0.091796875, "kl": 0.021210170816630125, "learning_rate": 7.1356e-06, "loss": 0.0008, "num_tokens": 99853013.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 2162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1352.0, "completions/max_terminated_length": 1352.0, "completions/mean_length": 299.90625, "completions/mean_terminated_length": 299.90625, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 0.22944733213111276, "frac_reward_zero_std": 0.0, "grad_norm": 1.7109375, "kl": 0.028875578893348575, "learning_rate": 7.1352e-06, "loss": 0.0038, "num_tokens": 99914418.0, "reward": 2.7917895317077637, "reward_std": 0.2923561930656433, "rewards/reward_fn/mean": 2.7917895317077637, "rewards/reward_fn/std": 0.2923561930656433, "step": 2163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 369.0, "completions/max_terminated_length": 369.0, "completions/mean_length": 156.875, "completions/mean_terminated_length": 156.875, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 0.22955341041688768, "frac_reward_zero_std": 1.0, "grad_norm": 0.1083984375, "kl": 0.02055112540256232, "learning_rate": 7.1348e-06, "loss": 0.0008, "num_tokens": 99960398.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 2164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 991.0, "completions/max_terminated_length": 991.0, "completions/mean_length": 149.875, "completions/mean_terminated_length": 149.875, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "epoch": 0.22965948870266256, "frac_reward_zero_std": 0.0, "grad_norm": 2.953125, "kl": 0.02863681223243475, "learning_rate": 7.1344e-06, "loss": -0.0067, "num_tokens": 99982122.0, "reward": 3.969954013824463, "reward_std": 0.1699649542570114, "rewards/reward_fn/mean": 3.969954013824463, "rewards/reward_fn/std": 0.1699649542570114, "step": 2165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1111.0, "completions/max_terminated_length": 1111.0, "completions/mean_length": 504.25, "completions/mean_terminated_length": 504.25, "completions/min_length": 328.0, "completions/min_terminated_length": 328.0, "epoch": 0.22976556698843748, "frac_reward_zero_std": 0.0, "grad_norm": 1.2734375, "kl": 0.01761721633374691, "learning_rate": 7.134e-06, "loss": 0.0124, "num_tokens": 100038514.0, "reward": 3.1369893550872803, "reward_std": 0.5092772841453552, "rewards/reward_fn/mean": 3.1369893550872803, "rewards/reward_fn/std": 0.5092772841453552, "step": 2166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1461.0, "completions/max_terminated_length": 1461.0, "completions/mean_length": 400.40625, "completions/mean_terminated_length": 400.40625, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "epoch": 0.22987164527421236, "frac_reward_zero_std": 0.0, "grad_norm": 0.8671875, "kl": 0.02728833886794746, "learning_rate": 7.1336e-06, "loss": 0.1206, "num_tokens": 100080255.0, "reward": 2.9163427352905273, "reward_std": 0.20467211306095123, "rewards/reward_fn/mean": 2.9163427352905273, "rewards/reward_fn/std": 0.20467209815979004, "step": 2167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 770.0, "completions/max_terminated_length": 770.0, "completions/mean_length": 252.90625, "completions/mean_terminated_length": 252.90625, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 0.22997772355998727, "frac_reward_zero_std": 0.0, "grad_norm": 2.25, "kl": 0.026255525881424546, "learning_rate": 7.1332e-06, "loss": 0.1577, "num_tokens": 100104924.0, "reward": 3.8455944061279297, "reward_std": 0.3646887540817261, "rewards/reward_fn/mean": 3.8455944061279297, "rewards/reward_fn/std": 0.3646887540817261, "step": 2168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 459.0, "completions/max_terminated_length": 459.0, "completions/mean_length": 268.25, "completions/mean_terminated_length": 268.25, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.23008380184576216, "frac_reward_zero_std": 0.0, "grad_norm": 1.625, "kl": 0.020109534729272127, "learning_rate": 7.132799999999999e-06, "loss": 0.0003, "num_tokens": 100135012.0, "reward": 3.9615554809570312, "reward_std": 0.2174752801656723, "rewards/reward_fn/mean": 3.9615554809570312, "rewards/reward_fn/std": 0.2174752801656723, "step": 2169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1004.0, "completions/max_terminated_length": 1004.0, "completions/mean_length": 283.25, "completions/mean_terminated_length": 283.25, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "epoch": 0.23018988013153707, "frac_reward_zero_std": 0.0, "grad_norm": 2.046875, "kl": 0.020035719266161323, "learning_rate": 7.1324e-06, "loss": 0.0479, "num_tokens": 100157420.0, "reward": 3.928600788116455, "reward_std": 0.4038942754268646, "rewards/reward_fn/mean": 3.928600788116455, "rewards/reward_fn/std": 0.4038942754268646, "step": 2170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1563.0, "completions/max_terminated_length": 1563.0, "completions/mean_length": 414.53125, "completions/mean_terminated_length": 414.53125, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.23029595841731199, "frac_reward_zero_std": 0.0, "grad_norm": 1.34375, "kl": 0.027303512208163738, "learning_rate": 7.132e-06, "loss": -0.0721, "num_tokens": 100210237.0, "reward": 3.4726829528808594, "reward_std": 0.6791224479675293, "rewards/reward_fn/mean": 3.4726829528808594, "rewards/reward_fn/std": 0.6791225075721741, "step": 2171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 640.0, "completions/max_terminated_length": 640.0, "completions/mean_length": 204.09375, "completions/mean_terminated_length": 204.09375, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.23040203670308687, "frac_reward_zero_std": 0.0, "grad_norm": 1.9609375, "kl": 0.02836342342197895, "learning_rate": 7.1316e-06, "loss": -0.026, "num_tokens": 100255296.0, "reward": 3.934906005859375, "reward_std": 0.25615838170051575, "rewards/reward_fn/mean": 3.934906005859375, "rewards/reward_fn/std": 0.25615841150283813, "step": 2172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1028.0, "completions/max_terminated_length": 1028.0, "completions/mean_length": 283.34375, "completions/mean_terminated_length": 283.34375, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "epoch": 0.23050811498886178, "frac_reward_zero_std": 0.0, "grad_norm": 1.7578125, "kl": 0.028128183213993907, "learning_rate": 7.1312e-06, "loss": 0.1033, "num_tokens": 100299659.0, "reward": 3.6705162525177, "reward_std": 0.551688551902771, "rewards/reward_fn/mean": 3.6705162525177, "rewards/reward_fn/std": 0.5516886115074158, "step": 2173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1181.0, "completions/max_terminated_length": 1181.0, "completions/mean_length": 352.8125, "completions/mean_terminated_length": 352.8125, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "epoch": 0.23061419327463667, "frac_reward_zero_std": 0.0, "grad_norm": 1.2890625, "kl": 0.01739080680999905, "learning_rate": 7.1307999999999996e-06, "loss": 0.1207, "num_tokens": 100346725.0, "reward": 2.7642221450805664, "reward_std": 0.028661344200372696, "rewards/reward_fn/mean": 2.7642221450805664, "rewards/reward_fn/std": 0.028661338612437248, "step": 2174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 415.0, "completions/max_terminated_length": 415.0, "completions/mean_length": 126.09375, "completions/mean_terminated_length": 126.09375, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 0.23072027156041158, "frac_reward_zero_std": 1.0, "grad_norm": 0.06591796875, "kl": 0.011620030330959707, "learning_rate": 7.1303999999999995e-06, "loss": 0.0005, "num_tokens": 100413096.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 2175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1259.0, "completions/max_terminated_length": 1259.0, "completions/mean_length": 360.71875, "completions/mean_terminated_length": 360.71875, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "epoch": 0.2308263498461865, "frac_reward_zero_std": 0.0, "grad_norm": 1.6015625, "kl": 0.024960935348644853, "learning_rate": 7.1299999999999995e-06, "loss": 0.0453, "num_tokens": 100461247.0, "reward": 3.6098623275756836, "reward_std": 0.7507240176200867, "rewards/reward_fn/mean": 3.6098623275756836, "rewards/reward_fn/std": 0.7507238984107971, "step": 2176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 847.0, "completions/max_terminated_length": 847.0, "completions/mean_length": 261.90625, "completions/mean_terminated_length": 261.90625, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 0.23093242813196138, "frac_reward_zero_std": 0.0, "grad_norm": 2.0, "kl": 0.02353836433030665, "learning_rate": 7.1295999999999995e-06, "loss": 0.0189, "num_tokens": 100510204.0, "reward": 2.9872453212738037, "reward_std": 0.39124536514282227, "rewards/reward_fn/mean": 2.9872453212738037, "rewards/reward_fn/std": 0.3912453353404999, "step": 2177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 986.0, "completions/max_terminated_length": 986.0, "completions/mean_length": 328.5625, "completions/mean_terminated_length": 328.5625, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "epoch": 0.2310385064177363, "frac_reward_zero_std": 0.0, "grad_norm": 1.9375, "kl": 0.026427945122122765, "learning_rate": 7.1291999999999994e-06, "loss": 0.0886, "num_tokens": 100564046.0, "reward": 3.81813645362854, "reward_std": 0.5137379169464111, "rewards/reward_fn/mean": 3.81813645362854, "rewards/reward_fn/std": 0.5137379169464111, "step": 2178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1475.0, "completions/max_terminated_length": 1475.0, "completions/mean_length": 349.96875, "completions/mean_terminated_length": 349.96875, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 0.23114458470351118, "frac_reward_zero_std": 0.0, "grad_norm": 2.5625, "kl": 0.02832777169533074, "learning_rate": 7.128799999999999e-06, "loss": -0.0493, "num_tokens": 100610381.0, "reward": 2.8107922077178955, "reward_std": 0.21972812712192535, "rewards/reward_fn/mean": 2.8107922077178955, "rewards/reward_fn/std": 0.21972811222076416, "step": 2179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 964.0, "completions/max_terminated_length": 964.0, "completions/mean_length": 290.75, "completions/mean_terminated_length": 290.75, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.2312506629892861, "frac_reward_zero_std": 1.0, "grad_norm": 0.078125, "kl": 0.021952015929855406, "learning_rate": 7.128399999999999e-06, "loss": 0.0009, "num_tokens": 100650533.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 2180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2048.0, "completions/max_terminated_length": 966.0, "completions/mean_length": 441.8125, "completions/mean_terminated_length": 390.0, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "epoch": 0.231356741275061, "frac_reward_zero_std": 0.0, "grad_norm": 1.6640625, "kl": 0.029917727690190077, "learning_rate": 7.128e-06, "loss": 0.2161, "num_tokens": 100676063.0, "reward": 2.702650547027588, "reward_std": 0.7632204294204712, "rewards/reward_fn/mean": 2.702650547027588, "rewards/reward_fn/std": 0.7632204294204712, "step": 2181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1366.0, "completions/max_terminated_length": 1366.0, "completions/mean_length": 250.1875, "completions/mean_terminated_length": 250.1875, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "epoch": 0.2314628195608359, "frac_reward_zero_std": 0.0, "grad_norm": 2.0, "kl": 0.030789189273491502, "learning_rate": 7.1276e-06, "loss": -0.0116, "num_tokens": 100717029.0, "reward": 3.1781158447265625, "reward_std": 0.33881837129592896, "rewards/reward_fn/mean": 3.1781158447265625, "rewards/reward_fn/std": 0.33881837129592896, "step": 2182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 750.0, "completions/max_terminated_length": 750.0, "completions/mean_length": 170.90625, "completions/mean_terminated_length": 170.90625, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "epoch": 0.2315688978466108, "frac_reward_zero_std": 1.0, "grad_norm": 0.12060546875, "kl": 0.02682259352877736, "learning_rate": 7.1272e-06, "loss": 0.0011, "num_tokens": 100746018.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 2183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 917.0, "completions/max_terminated_length": 917.0, "completions/mean_length": 286.375, "completions/mean_terminated_length": 286.375, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 0.2316749761323857, "frac_reward_zero_std": 1.0, "grad_norm": 0.08544921875, "kl": 0.022002801997587085, "learning_rate": 7.1268e-06, "loss": 0.0009, "num_tokens": 100793678.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 2184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1891.0, "completions/max_terminated_length": 1891.0, "completions/mean_length": 431.5625, "completions/mean_terminated_length": 431.5625, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 0.2317810544181606, "frac_reward_zero_std": 0.0, "grad_norm": 1.5390625, "kl": 0.02914230595342815, "learning_rate": 7.1264e-06, "loss": 0.0501, "num_tokens": 100838336.0, "reward": 3.2587783336639404, "reward_std": 0.5467434525489807, "rewards/reward_fn/mean": 3.2587783336639404, "rewards/reward_fn/std": 0.5467433929443359, "step": 2185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 978.0, "completions/max_terminated_length": 978.0, "completions/mean_length": 211.625, "completions/mean_terminated_length": 211.625, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 0.23188713270393552, "frac_reward_zero_std": 0.0, "grad_norm": 2.3125, "kl": 0.033715502824634314, "learning_rate": 7.126e-06, "loss": 0.1925, "num_tokens": 100882452.0, "reward": 3.87423038482666, "reward_std": 0.43363064527511597, "rewards/reward_fn/mean": 3.87423038482666, "rewards/reward_fn/std": 0.43363064527511597, "step": 2186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1048.0, "completions/max_terminated_length": 1048.0, "completions/mean_length": 257.4375, "completions/mean_terminated_length": 257.4375, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 0.2319932109897104, "frac_reward_zero_std": 0.0, "grad_norm": 2.125, "kl": 0.020584039855748415, "learning_rate": 7.1256e-06, "loss": -0.0418, "num_tokens": 100914338.0, "reward": 3.1321568489074707, "reward_std": 0.38132020831108093, "rewards/reward_fn/mean": 3.1321568489074707, "rewards/reward_fn/std": 0.3813202381134033, "step": 2187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 137.0, "completions/max_terminated_length": 137.0, "completions/mean_length": 105.65625, "completions/mean_terminated_length": 105.65625, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 0.23209928927548532, "frac_reward_zero_std": 0.0, "grad_norm": 1.609375, "kl": 0.021333031821995974, "learning_rate": 7.1252e-06, "loss": -0.0669, "num_tokens": 100957143.0, "reward": 3.848611831665039, "reward_std": 0.7179643511772156, "rewards/reward_fn/mean": 3.848611831665039, "rewards/reward_fn/std": 0.7179643511772156, "step": 2188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 413.0, "completions/max_terminated_length": 413.0, "completions/mean_length": 154.25, "completions/mean_terminated_length": 154.25, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.2322053675612602, "frac_reward_zero_std": 0.0, "grad_norm": 2.0625, "kl": 0.02006891486234963, "learning_rate": 7.124799999999999e-06, "loss": 0.1186, "num_tokens": 100992351.0, "reward": 2.937816619873047, "reward_std": 0.04478609934449196, "rewards/reward_fn/mean": 2.937816619873047, "rewards/reward_fn/std": 0.04478614032268524, "step": 2189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1383.0, "completions/max_terminated_length": 1383.0, "completions/mean_length": 438.84375, "completions/mean_terminated_length": 438.84375, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "epoch": 0.23231144584703511, "frac_reward_zero_std": 1.0, "grad_norm": 0.07470703125, "kl": 0.02082824590615928, "learning_rate": 7.124399999999999e-06, "loss": 0.0008, "num_tokens": 101040506.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 2190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 870.0, "completions/max_terminated_length": 870.0, "completions/mean_length": 199.46875, "completions/mean_terminated_length": 199.46875, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.23241752413281003, "frac_reward_zero_std": 1.0, "grad_norm": 0.11767578125, "kl": 0.028772379737347364, "learning_rate": 7.123999999999999e-06, "loss": 0.0012, "num_tokens": 101063753.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 2191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 197.0, "completions/max_terminated_length": 197.0, "completions/mean_length": 133.25, "completions/mean_terminated_length": 133.25, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.2325236024185849, "frac_reward_zero_std": 0.0, "grad_norm": 2.984375, "kl": 0.022556824376806617, "learning_rate": 7.123599999999999e-06, "loss": 0.0406, "num_tokens": 101085489.0, "reward": 3.968625068664551, "reward_std": 0.17748311161994934, "rewards/reward_fn/mean": 3.968625068664551, "rewards/reward_fn/std": 0.17748311161994934, "step": 2192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 788.0, "completions/max_terminated_length": 788.0, "completions/mean_length": 243.8125, "completions/mean_terminated_length": 243.8125, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 0.23262968070435983, "frac_reward_zero_std": 0.0, "grad_norm": 1.7578125, "kl": 0.027054782956838608, "learning_rate": 7.1232e-06, "loss": 0.0451, "num_tokens": 101123755.0, "reward": 2.9340295791625977, "reward_std": 0.03773088380694389, "rewards/reward_fn/mean": 2.9340295791625977, "rewards/reward_fn/std": 0.03773083910346031, "step": 2193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 676.0, "completions/max_terminated_length": 676.0, "completions/mean_length": 197.21875, "completions/mean_terminated_length": 197.21875, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "epoch": 0.2327357589901347, "frac_reward_zero_std": 1.0, "grad_norm": 0.109375, "kl": 0.031393368961289525, "learning_rate": 7.1228e-06, "loss": 0.0013, "num_tokens": 101164114.0, "reward": 4.0, "reward_std": 0.0, "rewards/reward_fn/mean": 4.0, "rewards/reward_fn/std": 0.0, "step": 2194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 896.0, "completions/max_terminated_length": 896.0, "completions/mean_length": 533.09375, "completions/mean_terminated_length": 533.09375, "completions/min_length": 270.0, "completions/min_terminated_length": 270.0, "epoch": 0.23284183727590962, "frac_reward_zero_std": 0.0, "grad_norm": 1.2421875, "kl": 0.022313715890049934, "learning_rate": 7.1224e-06, "loss": -0.0249, "num_tokens": 101220981.0, "reward": 3.169848918914795, "reward_std": 0.4511135220527649, "rewards/reward_fn/mean": 3.169848918914795, "rewards/reward_fn/std": 0.4511135220527649, "step": 2195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1520.0, "completions/max_terminated_length": 1520.0, "completions/mean_length": 453.90625, "completions/mean_terminated_length": 453.90625, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 0.2329479155616845, "frac_reward_zero_std": 0.0, "grad_norm": 1.5625, "kl": 0.02297700964845717, "learning_rate": 7.122e-06, "loss": -0.101, "num_tokens": 101275314.0, "reward": 2.5342376232147217, "reward_std": 0.5728961229324341, "rewards/reward_fn/mean": 2.5342376232147217, "rewards/reward_fn/std": 0.5728961229324341, "step": 2196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1759.0, "completions/max_terminated_length": 1759.0, "completions/mean_length": 361.46875, "completions/mean_terminated_length": 361.46875, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "epoch": 0.23305399384745942, "frac_reward_zero_std": 0.0, "grad_norm": 1.78125, "kl": 0.034721347503364086, "learning_rate": 7.1216e-06, "loss": -0.0032, "num_tokens": 101320385.0, "reward": 2.7689619064331055, "reward_std": 0.25961360335350037, "rewards/reward_fn/mean": 2.7689619064331055, "rewards/reward_fn/std": 0.259613573551178, "step": 2197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1031.0, "completions/max_terminated_length": 1031.0, "completions/mean_length": 448.03125, "completions/mean_terminated_length": 448.03125, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "epoch": 0.23316007213323434, "frac_reward_zero_std": 0.0, "grad_norm": 1.6640625, "kl": 0.034088447922840714, "learning_rate": 7.1212e-06, "loss": -0.0266, "num_tokens": 101363330.0, "reward": 3.3539974689483643, "reward_std": 0.6181737780570984, "rewards/reward_fn/mean": 3.3539974689483643, "rewards/reward_fn/std": 0.6181737780570984, "step": 2198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 921.0, "completions/max_terminated_length": 921.0, "completions/mean_length": 352.78125, "completions/mean_terminated_length": 352.78125, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "epoch": 0.23326615041900922, "frac_reward_zero_std": 0.0, "grad_norm": 1.328125, "kl": 0.02180984802544117, "learning_rate": 7.1208e-06, "loss": 0.0872, "num_tokens": 101414651.0, "reward": 2.8594231605529785, "reward_std": 0.05167490243911743, "rewards/reward_fn/mean": 2.8594231605529785, "rewards/reward_fn/std": 0.05167488753795624, "step": 2199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1237.0, "completions/max_terminated_length": 1237.0, "completions/mean_length": 328.75, "completions/mean_terminated_length": 328.75, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 0.23337222870478413, "frac_reward_zero_std": 0.0, "grad_norm": 1.875, "kl": 0.026791174663230777, "learning_rate": 7.1204e-06, "loss": 0.1349, "num_tokens": 101459891.0, "reward": 3.7417798042297363, "reward_std": 0.6406286358833313, "rewards/reward_fn/mean": 3.7417798042297363, "rewards/reward_fn/std": 0.6406285762786865, "step": 2200 } ], "logging_steps": 1, "max_steps": 20000, "num_input_tokens_seen": 101459891, "num_train_epochs": 3, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }