{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.5714285714285714, "eval_steps": 500, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "advantage_max": 1.4294447675347328, "advantage_mean": 1.8626452602532595e-09, "advantage_min": -0.8205335959792137, "advantage_std": 0.805392861366272, "completion_length": 2571.2083587646484, "epoch": 0.001142857142857143, "grad_norm": 0.1222195252776146, "kl": 0.0, "lambda_div_used": 0.6, "learning_rate": 2e-08, "loss": 0.0641, "reward": 0.06657012924551964, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.06657012924551964, "reward_after_std": 0.805392861366272, "reward_before_mean": 0.4897647276520729, "reward_before_std": 0.8290339298546314, "reward_change_max": 0.0005614385008811951, "reward_change_mean": -0.4231945872306824, "reward_change_min": -0.8292400389909744, "reward_change_std": 0.33647667057812214, "reward_std": 0.8053928762674332, "rewards/cosine_scaled_reward": -0.015534311532974243, "rewards/format_reward": 0.5208333488553762, "step": 1 }, { "advantage_max": 0.8193490244448185, "advantage_mean": 1.2417634698280722e-09, "advantage_min": -0.4622782990336418, "advantage_std": 0.4655082952231169, "completion_length": 2804.395881652832, "epoch": 0.002285714285714286, "grad_norm": 0.0620713047683239, "kl": 0.0, "lambda_div_used": 0.6, "learning_rate": 4e-08, "loss": 0.0241, "reward": -0.11615866981446743, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.11615866981446743, "reward_after_std": 0.4655082933604717, "reward_before_mean": 0.27539755403995514, "reward_before_std": 0.42092561535537243, "reward_change_max": 0.0013062208890914917, "reward_change_mean": -0.39155622851103544, "reward_change_min": -0.6376443430781364, "reward_change_std": 0.26012564916163683, "reward_std": 0.46550831012427807, "rewards/cosine_scaled_reward": -0.04980122856795788, "rewards/format_reward": 0.37500000558793545, "step": 2 }, { "advantage_max": 1.1013623401522636, "advantage_mean": 6.208817460162663e-09, "advantage_min": -0.48275332152843475, "advantage_std": 0.5911364443600178, "completion_length": 3411.6666870117188, "epoch": 0.0034285714285714284, "grad_norm": 0.10215908288955688, "kl": 4.32431697845459e-05, "lambda_div_used": 0.6, "learning_rate": 6e-08, "loss": -0.0046, "reward": -0.4395183250308037, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.4395183250308037, "reward_after_std": 0.5911364406347275, "reward_before_mean": -0.2608025949448347, "reward_before_std": 0.6000149585306644, "reward_change_max": 0.0014719441533088684, "reward_change_mean": -0.17871572636067867, "reward_change_min": -0.3646555207669735, "reward_change_std": 0.16313384287059307, "reward_std": 0.5911364704370499, "rewards/cosine_scaled_reward": -0.21373463701456785, "rewards/format_reward": 0.16666667349636555, "step": 3 }, { "advantage_max": 1.765574298799038, "advantage_mean": -1.4280279569955923e-08, "advantage_min": -0.7085130885243416, "advantage_std": 0.9218316338956356, "completion_length": 2263.8333892822266, "epoch": 0.004571428571428572, "grad_norm": 0.1338474303483963, "kl": 4.819035530090332e-05, "lambda_div_used": 0.6, "learning_rate": 8e-08, "loss": 0.0427, "reward": 0.06497885985299945, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.06497885985299945, "reward_after_std": 0.9218316301703453, "reward_before_mean": 0.4491391107439995, "reward_before_std": 0.858651926741004, "reward_change_max": 0.0004479065537452698, "reward_change_mean": -0.384160247631371, "reward_change_min": -0.755715049803257, "reward_change_std": 0.284343633800745, "reward_std": 0.9218316525220871, "rewards/cosine_scaled_reward": -0.09834712743759155, "rewards/format_reward": 0.6458333414047956, "step": 4 }, { "advantage_max": 1.026982732117176, "advantage_mean": 7.450580596923828e-09, "advantage_min": -0.5032768584787846, "advantage_std": 0.5551957823336124, "completion_length": 3247.812530517578, "epoch": 0.005714285714285714, "grad_norm": 0.105912946164608, "kl": 4.468671977519989e-05, "lambda_div_used": 0.6, "learning_rate": 1e-07, "loss": 0.0086, "reward": -0.41966925258748233, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.41966925258748233, "reward_after_std": 0.5551957786083221, "reward_before_mean": -0.22130390163511038, "reward_before_std": 0.5660610608756542, "reward_change_max": 0.0005140230059623718, "reward_change_mean": -0.19836535304784775, "reward_change_min": -0.41412150859832764, "reward_change_std": 0.1705097910016775, "reward_std": 0.5551958009600639, "rewards/cosine_scaled_reward": -0.2669019568711519, "rewards/format_reward": 0.31250001303851604, "step": 5 }, { "advantage_max": 1.353376865386963, "advantage_mean": 2.545615063187512e-08, "advantage_min": -0.5026888102293015, "advantage_std": 0.6921062879264355, "completion_length": 3013.5834045410156, "epoch": 0.006857142857142857, "grad_norm": 0.14181621372699738, "kl": 4.999339580535889e-05, "lambda_div_used": 0.6, "learning_rate": 1.2e-07, "loss": 0.0561, "reward": -0.324100736528635, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.324100736528635, "reward_after_std": 0.6921062991023064, "reward_before_mean": -0.1107131689786911, "reward_before_std": 0.6496838089078665, "reward_change_max": 0.0008455067873001099, "reward_change_mean": -0.21338756661862135, "reward_change_min": -0.3878238834440708, "reward_change_std": 0.15879615675657988, "reward_std": 0.6921063400804996, "rewards/cosine_scaled_reward": -0.1907732579857111, "rewards/format_reward": 0.27083333767950535, "step": 6 }, { "advantage_max": 1.4423074126243591, "advantage_mean": 2.5766591998932498e-08, "advantage_min": -0.8147472143173218, "advantage_std": 0.8471020106226206, "completion_length": 3183.5000915527344, "epoch": 0.008, "grad_norm": 0.15475912392139435, "kl": 2.7105212211608887e-05, "lambda_div_used": 0.6, "learning_rate": 1.4e-07, "loss": 0.0486, "reward": -0.029259571339935064, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.029259571339935064, "reward_after_std": 0.8471020236611366, "reward_before_mean": 0.34296554513275623, "reward_before_std": 0.9372109081596136, "reward_change_max": 0.0007103309035301208, "reward_change_mean": -0.3722251045401208, "reward_change_min": -0.8584106490015984, "reward_change_std": 0.3660194616531953, "reward_std": 0.8471020497381687, "rewards/cosine_scaled_reward": -0.0993505665101111, "rewards/format_reward": 0.5416666828095913, "step": 7 }, { "advantage_max": 1.623491793870926, "advantage_mean": -1.8626449826975033e-09, "advantage_min": -0.9304697960615158, "advantage_std": 0.9237196668982506, "completion_length": 2693.1667098999023, "epoch": 0.009142857142857144, "grad_norm": 0.15012045204639435, "kl": 2.0734965801239014e-05, "lambda_div_used": 0.6, "learning_rate": 1.6e-07, "loss": 0.0278, "reward": 0.279265059158206, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.279265059158206, "reward_after_std": 0.9237196668982506, "reward_before_mean": 0.8001202214509249, "reward_before_std": 0.9343188181519508, "reward_change_max": 0.0, "reward_change_mean": -0.5208551697432995, "reward_change_min": -0.9651905745267868, "reward_change_std": 0.40238416008651257, "reward_std": 0.9237196817994118, "rewards/cosine_scaled_reward": 0.160476787481457, "rewards/format_reward": 0.47916668094694614, "step": 8 }, { "advantage_max": 1.3903833664953709, "advantage_mean": -5.898376481683343e-09, "advantage_min": -0.7002911232411861, "advantage_std": 0.7612915430217981, "completion_length": 3116.7708740234375, "epoch": 0.010285714285714285, "grad_norm": 0.1323755532503128, "kl": 3.580749034881592e-05, "lambda_div_used": 0.6, "learning_rate": 1.8e-07, "loss": 0.047, "reward": -0.07552929408848286, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.07552929408848286, "reward_after_std": 0.7612915430217981, "reward_before_mean": 0.2753769364207983, "reward_before_std": 0.7574359718710184, "reward_change_max": 0.0, "reward_change_mean": -0.35090621933341026, "reward_change_min": -0.6658867225050926, "reward_change_std": 0.27499448135495186, "reward_std": 0.7612915616482496, "rewards/cosine_scaled_reward": -0.028978207614272833, "rewards/format_reward": 0.3333333395421505, "step": 9 }, { "advantage_max": 1.6468712911009789, "advantage_mean": 1.8626452269465688e-08, "advantage_min": -0.6253786683082581, "advantage_std": 0.8856963291764259, "completion_length": 2866.791702270508, "epoch": 0.011428571428571429, "grad_norm": 0.13161250948905945, "kl": 2.7856789529323578e-05, "lambda_div_used": 0.6, "learning_rate": 2e-07, "loss": 0.0504, "reward": -0.18678228557109833, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.18678228557109833, "reward_after_std": 0.8856963291764259, "reward_before_mean": 0.07037698850035667, "reward_before_std": 0.9093780852854252, "reward_change_max": 0.00042964518070220947, "reward_change_mean": -0.25715925730764866, "reward_change_min": -0.6563235446810722, "reward_change_std": 0.25602648686617613, "reward_std": 0.8856963478028774, "rewards/cosine_scaled_reward": -0.1314781814289745, "rewards/format_reward": 0.33333334140479565, "step": 10 }, { "advantage_max": 1.1373428963124752, "advantage_mean": 2.6387472956690416e-08, "advantage_min": -0.5235589370131493, "advantage_std": 0.6247740015387535, "completion_length": 3366.125, "epoch": 0.012571428571428572, "grad_norm": 0.10979026556015015, "kl": 3.491342067718506e-05, "lambda_div_used": 0.6, "learning_rate": 2.1999999999999998e-07, "loss": 0.0204, "reward": -0.3875628258101642, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.3875628258101642, "reward_after_std": 0.6247740034013987, "reward_before_mean": -0.1830587424337864, "reward_before_std": 0.653161009773612, "reward_change_max": 0.0008605122566223145, "reward_change_mean": -0.2045040603261441, "reward_change_min": -0.4535864554345608, "reward_change_std": 0.19186571810860187, "reward_std": 0.6247740127146244, "rewards/cosine_scaled_reward": -0.17486270423978567, "rewards/format_reward": 0.16666666977107525, "step": 11 }, { "advantage_max": 1.6733285710215569, "advantage_mean": -6.208817127095756e-09, "advantage_min": -0.7657397910952568, "advantage_std": 0.9073885902762413, "completion_length": 2669.312515258789, "epoch": 0.013714285714285714, "grad_norm": 0.11108776926994324, "kl": 4.035234451293945e-05, "lambda_div_used": 0.6, "learning_rate": 2.4e-07, "loss": 0.0246, "reward": 0.05015108606312424, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.05015108606312424, "reward_after_std": 0.9073885828256607, "reward_before_mean": 0.43837890587747097, "reward_before_std": 0.901796817779541, "reward_change_max": 0.001746349036693573, "reward_change_mean": -0.38822780828922987, "reward_change_min": -0.7632462754845619, "reward_change_std": 0.29466503486037254, "reward_std": 0.9073885828256607, "rewards/cosine_scaled_reward": -0.06206055777147412, "rewards/format_reward": 0.562500013038516, "step": 12 }, { "advantage_max": 1.2397056221961975, "advantage_mean": 1.0554989993138975e-08, "advantage_min": -0.5433732494711876, "advantage_std": 0.6575523428618908, "completion_length": 2878.437530517578, "epoch": 0.014857142857142857, "grad_norm": 0.09044164419174194, "kl": 3.176182508468628e-05, "lambda_div_used": 0.6, "learning_rate": 2.6e-07, "loss": 0.0186, "reward": -0.06514125317335129, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.06514125317335129, "reward_after_std": 0.6575523391366005, "reward_before_mean": 0.3063340336084366, "reward_before_std": 0.5916530340909958, "reward_change_max": 0.0, "reward_change_mean": -0.3714753072708845, "reward_change_min": -0.6489017568528652, "reward_change_std": 0.25239738542586565, "reward_std": 0.657552357763052, "rewards/cosine_scaled_reward": -0.06558297201991081, "rewards/format_reward": 0.4375000074505806, "step": 13 }, { "advantage_max": 1.3778835982084274, "advantage_mean": 2.4835269951672956e-09, "advantage_min": -0.6107060834765434, "advantage_std": 0.7284672744572163, "completion_length": 3030.0833740234375, "epoch": 0.016, "grad_norm": 0.12247473001480103, "kl": 3.055110573768616e-05, "lambda_div_used": 0.6, "learning_rate": 2.8e-07, "loss": 0.0256, "reward": -0.21163499914109707, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.21163499914109707, "reward_after_std": 0.7284672446548939, "reward_before_mean": 0.06481372565031052, "reward_before_std": 0.7108139060437679, "reward_change_max": 0.0006881281733512878, "reward_change_mean": -0.2764487350359559, "reward_change_min": -0.5137332789599895, "reward_change_std": 0.20831829216331244, "reward_std": 0.7284672893583775, "rewards/cosine_scaled_reward": -0.12384314276278019, "rewards/format_reward": 0.3125000074505806, "step": 14 }, { "advantage_max": 0.962163083255291, "advantage_mean": -3.7252904094842165e-09, "advantage_min": -0.540690753608942, "advantage_std": 0.5459639094769955, "completion_length": 2782.9166831970215, "epoch": 0.017142857142857144, "grad_norm": 0.05843517929315567, "kl": 2.6464462280273438e-05, "lambda_div_used": 0.6, "learning_rate": 3e-07, "loss": 0.0283, "reward": -0.02799556404352188, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.02799556404352188, "reward_after_std": 0.54596390388906, "reward_before_mean": 0.3959492538124323, "reward_before_std": 0.5094255739822984, "reward_change_max": 0.0006490200757980347, "reward_change_mean": -0.42394478945061564, "reward_change_min": -0.6969937682151794, "reward_change_std": 0.284216845408082, "reward_std": 0.5459639206528664, "rewards/cosine_scaled_reward": 0.010474616661667824, "rewards/format_reward": 0.3750000037252903, "step": 15 }, { "advantage_max": 0.548663005232811, "advantage_mean": 2.9181441818515452e-08, "advantage_min": -0.3422815389931202, "advantage_std": 0.3159701582044363, "completion_length": 3456.2708435058594, "epoch": 0.018285714285714287, "grad_norm": 0.0581691712141037, "kl": 4.1425228118896484e-05, "lambda_div_used": 0.6, "learning_rate": 3.2e-07, "loss": 0.0218, "reward": -0.6043956913053989, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.6043956913053989, "reward_after_std": 0.31597016006708145, "reward_before_mean": -0.4554057829082012, "reward_before_std": 0.3371294569224119, "reward_change_max": 0.002277083694934845, "reward_change_mean": -0.1489899060688913, "reward_change_min": -0.2999635115265846, "reward_change_std": 0.1290613072924316, "reward_std": 0.3159701693803072, "rewards/cosine_scaled_reward": -0.2485362235456705, "rewards/format_reward": 0.0416666679084301, "step": 16 }, { "advantage_max": 1.6136809401214123, "advantage_mean": -7.45058070794613e-09, "advantage_min": -0.9330508001148701, "advantage_std": 0.9109026230871677, "completion_length": 2188.1667098999023, "epoch": 0.019428571428571427, "grad_norm": 0.12020345032215118, "kl": 3.744661808013916e-05, "lambda_div_used": 0.6, "learning_rate": 3.4000000000000003e-07, "loss": -0.0071, "reward": 0.3284926589112729, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.3284926589112729, "reward_after_std": 0.9109026528894901, "reward_before_mean": 0.8773855976760387, "reward_before_std": 0.9138228315860033, "reward_change_max": 0.0, "reward_change_mean": -0.5488929115235806, "reward_change_min": -0.981883842498064, "reward_change_std": 0.39753549825400114, "reward_std": 0.9109026566147804, "rewards/cosine_scaled_reward": 0.10535944253206253, "rewards/format_reward": 0.6666666716337204, "step": 17 }, { "advantage_max": 1.1283956617116928, "advantage_mean": -2.4835269396561444e-09, "advantage_min": -0.6506418436765671, "advantage_std": 0.6350066661834717, "completion_length": 3028.291717529297, "epoch": 0.02057142857142857, "grad_norm": 0.11939737945795059, "kl": 2.0876526832580566e-05, "lambda_div_used": 0.6, "learning_rate": 3.6e-07, "loss": 0.0498, "reward": -0.245563886128366, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.245563886128366, "reward_after_std": 0.6350066922605038, "reward_before_mean": 0.040940748527646065, "reward_before_std": 0.6616333723068237, "reward_change_max": 0.00138884037733078, "reward_change_mean": -0.2865046225488186, "reward_change_min": -0.576061837375164, "reward_change_std": 0.24392648972570896, "reward_std": 0.6350067257881165, "rewards/cosine_scaled_reward": -0.13577963784337044, "rewards/format_reward": 0.31250000931322575, "step": 18 }, { "advantage_max": 1.8192770816385746, "advantage_mean": -8.692343955729598e-09, "advantage_min": -0.8841699659824371, "advantage_std": 0.9976575020700693, "completion_length": 2959.000030517578, "epoch": 0.021714285714285714, "grad_norm": 0.2074793130159378, "kl": 2.925097942352295e-05, "lambda_div_used": 0.6, "learning_rate": 3.7999999999999996e-07, "loss": 0.0619, "reward": 0.20424228720366955, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.20424228720366955, "reward_after_std": 0.9976575020700693, "reward_before_mean": 0.6615649559535086, "reward_before_std": 1.0007536001503468, "reward_change_max": 0.0005467459559440613, "reward_change_mean": -0.45732265897095203, "reward_change_min": -0.8358132503926754, "reward_change_std": 0.358495632186532, "reward_std": 0.9976575709879398, "rewards/cosine_scaled_reward": 0.12244914239272475, "rewards/format_reward": 0.4166666753590107, "step": 19 }, { "advantage_max": 1.7250901013612747, "advantage_mean": -1.8005570590062803e-08, "advantage_min": -0.7911568731069565, "advantage_std": 0.9380798451602459, "completion_length": 2311.916706085205, "epoch": 0.022857142857142857, "grad_norm": 0.13781826198101044, "kl": 1.3463897630572319e-05, "lambda_div_used": 0.6, "learning_rate": 4e-07, "loss": 0.055, "reward": 0.33410761249251664, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.33410761249251664, "reward_after_std": 0.9380798451602459, "reward_before_mean": 0.8728911895304918, "reward_before_std": 0.8704607700929046, "reward_change_max": 0.0, "reward_change_mean": -0.538783598691225, "reward_change_min": -0.9747267179191113, "reward_change_std": 0.384306114166975, "reward_std": 0.9380798749625683, "rewards/cosine_scaled_reward": 0.08227891783462837, "rewards/format_reward": 0.7083333488553762, "step": 20 }, { "advantage_max": 1.2891832739114761, "advantage_mean": 1.8626453157644107e-09, "advantage_min": -0.4525096267461777, "advantage_std": 0.6711226403713226, "completion_length": 2708.604202270508, "epoch": 0.024, "grad_norm": 0.08989676833152771, "kl": 4.710257053375244e-05, "lambda_div_used": 0.6, "learning_rate": 4.1999999999999995e-07, "loss": 0.0325, "reward": -0.022732137236744165, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.022732137236744165, "reward_after_std": 0.6711226366460323, "reward_before_mean": 0.36769186705350876, "reward_before_std": 0.5723638404160738, "reward_change_max": 0.0, "reward_change_mean": -0.3904240126721561, "reward_change_min": -0.6175281815230846, "reward_change_std": 0.2368222870863974, "reward_std": 0.6711226478219032, "rewards/cosine_scaled_reward": -0.03490406461060047, "rewards/format_reward": 0.43750000558793545, "step": 21 }, { "advantage_max": 1.127735674381256, "advantage_mean": 1.862645149230957e-09, "advantage_min": -0.5973842702805996, "advantage_std": 0.6144330948591232, "completion_length": 1753.437515258789, "epoch": 0.025142857142857144, "grad_norm": 0.07954632490873337, "kl": 1.9297003746032715e-05, "lambda_div_used": 0.6, "learning_rate": 4.3999999999999997e-07, "loss": 0.0189, "reward": 0.18515374744310975, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.18515374744310975, "reward_after_std": 0.6144330874085426, "reward_before_mean": 0.707999374717474, "reward_before_std": 0.525731585919857, "reward_change_max": 0.0008831322193145752, "reward_change_mean": -0.5228456184267998, "reward_change_min": -0.8473505116999149, "reward_change_std": 0.32211419753730297, "reward_std": 0.6144330948591232, "rewards/cosine_scaled_reward": -0.02100032288581133, "rewards/format_reward": 0.7500000111758709, "step": 22 }, { "advantage_max": 1.9255974665284157, "advantage_mean": 1.8005570145973593e-08, "advantage_min": -0.7307319566607475, "advantage_std": 0.9828944765031338, "completion_length": 2248.062515258789, "epoch": 0.026285714285714287, "grad_norm": 0.14250506460666656, "kl": 3.49581241607666e-05, "lambda_div_used": 0.6, "learning_rate": 4.6e-07, "loss": 0.0199, "reward": 0.05501225683838129, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.05501225683838129, "reward_after_std": 0.9828944765031338, "reward_before_mean": 0.4148938748985529, "reward_before_std": 0.9027231149375439, "reward_change_max": 0.002379797399044037, "reward_change_mean": -0.359881600830704, "reward_change_min": -0.6697551794350147, "reward_change_std": 0.25272045843303204, "reward_std": 0.9828945063054562, "rewards/cosine_scaled_reward": -0.07380307232961059, "rewards/format_reward": 0.5625000074505806, "step": 23 }, { "advantage_max": 1.5978022366762161, "advantage_mean": -1.0554989549049765e-08, "advantage_min": -0.925018347799778, "advantage_std": 0.9181984178721905, "completion_length": 2784.166732788086, "epoch": 0.027428571428571427, "grad_norm": 0.14002500474452972, "kl": 1.7780810594558716e-05, "lambda_div_used": 0.6, "learning_rate": 4.8e-07, "loss": 0.0686, "reward": 0.12083139270544052, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.12083139270544052, "reward_after_std": 0.9181984104216099, "reward_before_mean": 0.5569423735141754, "reward_before_std": 0.97861173376441, "reward_change_max": 0.0005804151296615601, "reward_change_mean": -0.4361109873279929, "reward_change_min": -0.9359410665929317, "reward_change_std": 0.37820393592119217, "reward_std": 0.918198436498642, "rewards/cosine_scaled_reward": 0.03888785373419523, "rewards/format_reward": 0.4791666753590107, "step": 24 }, { "advantage_max": 1.394439235329628, "advantage_mean": 4.346172199909404e-09, "advantage_min": -0.758088156580925, "advantage_std": 0.7881841994822025, "completion_length": 2770.2083740234375, "epoch": 0.02857142857142857, "grad_norm": 0.13693547248840332, "kl": 2.9962509870529175e-05, "lambda_div_used": 0.6, "learning_rate": 5e-07, "loss": 0.0448, "reward": -0.1243234477005899, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.1243234477005899, "reward_after_std": 0.7881842032074928, "reward_before_mean": 0.19657514989376068, "reward_before_std": 0.8378110863268375, "reward_change_max": 0.0010121017694473267, "reward_change_mean": -0.32089861761778593, "reward_change_min": -0.7076228894293308, "reward_change_std": 0.29621788300573826, "reward_std": 0.7881842367351055, "rewards/cosine_scaled_reward": -0.08921242598444223, "rewards/format_reward": 0.3750000074505806, "step": 25 }, { "advantage_max": 1.2581812031567097, "advantage_mean": 1.8626452158443385e-08, "advantage_min": -0.6945818662643433, "advantage_std": 0.6935589909553528, "completion_length": 2960.2500762939453, "epoch": 0.029714285714285714, "grad_norm": 0.09827658534049988, "kl": 3.141164779663086e-05, "lambda_div_used": 0.6, "learning_rate": 5.2e-07, "loss": 0.0366, "reward": -0.0345622468739748, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.0345622468739748, "reward_after_std": 0.6935589741915464, "reward_before_mean": 0.35348133370280266, "reward_before_std": 0.6798408385366201, "reward_change_max": 0.0019709691405296326, "reward_change_mean": -0.3880435563623905, "reward_change_min": -0.6920421347022057, "reward_change_std": 0.28112479858100414, "reward_std": 0.6935589928179979, "rewards/cosine_scaled_reward": -0.05242600850760937, "rewards/format_reward": 0.45833334140479565, "step": 26 }, { "advantage_max": 1.429284494370222, "advantage_mean": -1.4901161526914564e-08, "advantage_min": -0.7001686692237854, "advantage_std": 0.803062416613102, "completion_length": 3211.750030517578, "epoch": 0.030857142857142857, "grad_norm": 0.14858104288578033, "kl": 2.596527338027954e-05, "lambda_div_used": 0.6, "learning_rate": 5.4e-07, "loss": 0.0311, "reward": -0.09592162817716599, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.09592162817716599, "reward_after_std": 0.8030624315142632, "reward_before_mean": 0.24063999578356743, "reward_before_std": 0.8367564044892788, "reward_change_max": 0.0014395639300346375, "reward_change_mean": -0.3365616141818464, "reward_change_min": -0.6913486272096634, "reward_change_std": 0.30171436443924904, "reward_std": 0.8030624389648438, "rewards/cosine_scaled_reward": -0.08801335096359253, "rewards/format_reward": 0.41666667349636555, "step": 27 }, { "advantage_max": 1.9752441011369228, "advantage_mean": -1.490116136038111e-08, "advantage_min": -0.8696260899305344, "advantage_std": 1.0674383416771889, "completion_length": 2980.5833740234375, "epoch": 0.032, "grad_norm": 0.1666395366191864, "kl": 2.1807849407196045e-05, "lambda_div_used": 0.6, "learning_rate": 5.6e-07, "loss": 0.0266, "reward": 0.13897611014544964, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.13897611014544964, "reward_after_std": 1.06743835657835, "reward_before_mean": 0.544797046110034, "reward_before_std": 1.0800247061997652, "reward_change_max": 0.0, "reward_change_mean": -0.4058209341019392, "reward_change_min": -0.8129916898906231, "reward_change_std": 0.3283620811998844, "reward_std": 1.067438393831253, "rewards/cosine_scaled_reward": 0.05364852462662384, "rewards/format_reward": 0.4375000074505806, "step": 28 }, { "advantage_max": 1.1171382665634155, "advantage_mean": 4.967054101356894e-09, "advantage_min": -0.5580497160553932, "advantage_std": 0.620572954416275, "completion_length": 3343.291717529297, "epoch": 0.03314285714285714, "grad_norm": 0.11576099693775177, "kl": 1.6135862097144127e-05, "lambda_div_used": 0.6, "learning_rate": 5.8e-07, "loss": 0.0468, "reward": -0.3974270708858967, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.3974270708858967, "reward_after_std": 0.6205729730427265, "reward_before_mean": -0.19575086934491992, "reward_before_std": 0.6581322774291039, "reward_change_max": 0.0009334981441497803, "reward_change_mean": -0.20167620666325092, "reward_change_min": -0.5106723494827747, "reward_change_std": 0.20898526348173618, "reward_std": 0.6205730102956295, "rewards/cosine_scaled_reward": -0.18120876979082823, "rewards/format_reward": 0.1666666679084301, "step": 29 }, { "advantage_max": 1.5233756005764008, "advantage_mean": -1.9868215850316062e-08, "advantage_min": -0.8757916316390038, "advantage_std": 0.8625713251531124, "completion_length": 3079.1459045410156, "epoch": 0.03428571428571429, "grad_norm": 0.14153487980365753, "kl": 1.2964010238647461e-05, "lambda_div_used": 0.6, "learning_rate": 6e-07, "loss": 0.0369, "reward": 0.13468949683010578, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.13468949683010578, "reward_after_std": 0.8625713251531124, "reward_before_mean": 0.5874797366559505, "reward_before_std": 0.8919285908341408, "reward_change_max": 0.00054217129945755, "reward_change_mean": -0.4527902854606509, "reward_change_min": -0.9095434695482254, "reward_change_std": 0.36228089965879917, "reward_std": 0.8625713437795639, "rewards/cosine_scaled_reward": 0.0749898748472333, "rewards/format_reward": 0.43750001676380634, "step": 30 }, { "advantage_max": 1.368198987096548, "advantage_mean": -5.587935447692871e-09, "advantage_min": -0.773968905210495, "advantage_std": 0.7986135166138411, "completion_length": 2872.312515258789, "epoch": 0.03542857142857143, "grad_norm": 0.0970737561583519, "kl": 1.2390315532684326e-05, "lambda_div_used": 0.6, "learning_rate": 6.2e-07, "loss": -0.0173, "reward": -0.04627075418829918, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.04627075418829918, "reward_after_std": 0.7986135128885508, "reward_before_mean": 0.32107847463339567, "reward_before_std": 0.8681342117488384, "reward_change_max": 0.0013233870267868042, "reward_change_mean": -0.36734923627227545, "reward_change_min": -0.8485586605966091, "reward_change_std": 0.3394074449315667, "reward_std": 0.7986135166138411, "rewards/cosine_scaled_reward": -0.03737742733210325, "rewards/format_reward": 0.3958333358168602, "step": 31 }, { "advantage_max": 1.283915750682354, "advantage_mean": 2.1730860499946658e-08, "advantage_min": -0.5879904553294182, "advantage_std": 0.6885505132377148, "completion_length": 2963.0208740234375, "epoch": 0.036571428571428574, "grad_norm": 0.11873957514762878, "kl": 2.668425440788269e-05, "lambda_div_used": 0.6, "learning_rate": 6.4e-07, "loss": 0.0583, "reward": 0.021974913775920868, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.021974913775920868, "reward_after_std": 0.6885505355894566, "reward_before_mean": 0.4379478101618588, "reward_before_std": 0.6379991136491299, "reward_change_max": 0.0, "reward_change_mean": -0.4159728898666799, "reward_change_min": -0.7113733068108559, "reward_change_std": 0.2826508111320436, "reward_std": 0.6885505504906178, "rewards/cosine_scaled_reward": -0.020609423518180847, "rewards/format_reward": 0.4791666753590107, "step": 32 }, { "advantage_max": 1.6484842039644718, "advantage_mean": 4.967054045845742e-09, "advantage_min": -0.7562173083424568, "advantage_std": 0.9086056165397167, "completion_length": 3314.9791870117188, "epoch": 0.037714285714285714, "grad_norm": 0.12317641079425812, "kl": 3.3482909202575684e-05, "lambda_div_used": 0.6, "learning_rate": 6.6e-07, "loss": 0.0104, "reward": -0.08508192864246666, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.08508192864246666, "reward_after_std": 0.9086056537926197, "reward_before_mean": 0.23197847977280617, "reward_before_std": 0.9516045339405537, "reward_change_max": 0.0015941113233566284, "reward_change_mean": -0.31706040538847446, "reward_change_min": -0.7498720176517963, "reward_change_std": 0.30978111177682877, "reward_std": 0.9086056780070066, "rewards/cosine_scaled_reward": -0.0506774433888495, "rewards/format_reward": 0.3333333395421505, "step": 33 }, { "advantage_max": 1.4823268465697765, "advantage_mean": -2.483526928553914e-08, "advantage_min": -0.8470183648169041, "advantage_std": 0.8238020017743111, "completion_length": 2463.958351135254, "epoch": 0.038857142857142854, "grad_norm": 0.10921584069728851, "kl": 2.8777867555618286e-05, "lambda_div_used": 0.6, "learning_rate": 6.800000000000001e-07, "loss": 0.0203, "reward": 0.26816181279718876, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.26816181279718876, "reward_after_std": 0.8238019905984402, "reward_before_mean": 0.7997418083250523, "reward_before_std": 0.7961917296051979, "reward_change_max": 1.0728836059570312e-05, "reward_change_mean": -0.5315799824893475, "reward_change_min": -0.9185572080314159, "reward_change_std": 0.3667104300111532, "reward_std": 0.8238020315766335, "rewards/cosine_scaled_reward": 0.12903754762373865, "rewards/format_reward": 0.5416666679084301, "step": 34 }, { "advantage_max": 1.7259946167469025, "advantage_mean": 8.071462387349015e-09, "advantage_min": -0.6502956449985504, "advantage_std": 0.8964182175695896, "completion_length": 3076.020854949951, "epoch": 0.04, "grad_norm": 0.1264086663722992, "kl": 5.2697956562042236e-05, "lambda_div_used": 0.6, "learning_rate": 7e-07, "loss": 0.0334, "reward": -0.22558780387043953, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.22558780387043953, "reward_after_std": 0.8964182436466217, "reward_before_mean": 0.004136897623538971, "reward_before_std": 0.8808662742376328, "reward_change_max": 0.0007063820958137512, "reward_change_mean": -0.22972470242530107, "reward_change_min": -0.5383105166256428, "reward_change_std": 0.2124389884993434, "reward_std": 0.8964182622730732, "rewards/cosine_scaled_reward": -0.14376488840207458, "rewards/format_reward": 0.2916666716337204, "step": 35 }, { "advantage_max": 0.8015561476349831, "advantage_mean": 3.725290464995368e-09, "advantage_min": -0.3946578651666641, "advantage_std": 0.44071637094020844, "completion_length": 3414.0208435058594, "epoch": 0.04114285714285714, "grad_norm": 0.07707090675830841, "kl": 4.0883664041757584e-05, "lambda_div_used": 0.6, "learning_rate": 7.2e-07, "loss": 0.012, "reward": -0.5596103649586439, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.5596103649586439, "reward_after_std": 0.44071637466549873, "reward_before_mean": -0.41469016298651695, "reward_before_std": 0.4619513005018234, "reward_change_max": 0.0, "reward_change_mean": -0.14492022106423974, "reward_change_min": -0.3504052981734276, "reward_change_std": 0.14387890603393316, "reward_std": 0.44071637839078903, "rewards/cosine_scaled_reward": -0.2802617456763983, "rewards/format_reward": 0.14583333767950535, "step": 36 }, { "advantage_max": 0.782890573143959, "advantage_mean": 3.476937748825293e-08, "advantage_min": -0.3986804038286209, "advantage_std": 0.42814368568360806, "completion_length": 3353.9166870117188, "epoch": 0.04228571428571429, "grad_norm": 0.07412921637296677, "kl": 2.1278858184814453e-05, "lambda_div_used": 0.6, "learning_rate": 7.4e-07, "loss": 0.0032, "reward": -0.49189055524766445, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.49189055524766445, "reward_after_std": 0.4281436949968338, "reward_before_mean": -0.30478277802467346, "reward_before_std": 0.42716532200574875, "reward_change_max": 0.0016405805945396423, "reward_change_mean": -0.18710775269755686, "reward_change_min": -0.38063835352659225, "reward_change_std": 0.14960275805788115, "reward_std": 0.4281436949968338, "rewards/cosine_scaled_reward": -0.25655805692076683, "rewards/format_reward": 0.2083333358168602, "step": 37 }, { "advantage_max": 1.234726544469595, "advantage_mean": 4.9670536017565325e-09, "advantage_min": -0.5854838155210018, "advantage_std": 0.6552534718066454, "completion_length": 3217.9791870117188, "epoch": 0.04342857142857143, "grad_norm": 0.12655900418758392, "kl": 4.662945866584778e-05, "lambda_div_used": 0.6, "learning_rate": 7.599999999999999e-07, "loss": 0.0154, "reward": -0.19940327107906342, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.19940327107906342, "reward_after_std": 0.655253479257226, "reward_before_mean": 0.09616225806530565, "reward_before_std": 0.6347245592623949, "reward_change_max": 0.0019334778189659119, "reward_change_mean": -0.2955655427649617, "reward_change_min": -0.5166320390999317, "reward_change_std": 0.21207730285823345, "reward_std": 0.6552534829825163, "rewards/cosine_scaled_reward": -0.05608554696664214, "rewards/format_reward": 0.20833333395421505, "step": 38 }, { "advantage_max": 0.8507803976535797, "advantage_mean": 4.9670543234014986e-09, "advantage_min": -0.5066513679921627, "advantage_std": 0.49880874902009964, "completion_length": 2870.875011444092, "epoch": 0.044571428571428574, "grad_norm": 0.06974208354949951, "kl": 2.384372055530548e-05, "lambda_div_used": 0.6, "learning_rate": 7.799999999999999e-07, "loss": 0.03, "reward": -0.16522181406617165, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.16522181406617165, "reward_after_std": 0.49880874156951904, "reward_before_mean": 0.19637863337993622, "reward_before_std": 0.48516307689715177, "reward_change_max": 0.00154896080493927, "reward_change_mean": -0.36160044465214014, "reward_change_min": -0.6046626195311546, "reward_change_std": 0.2663704315200448, "reward_std": 0.49880874156951904, "rewards/cosine_scaled_reward": -0.12056069076061249, "rewards/format_reward": 0.4375000074505806, "step": 39 }, { "advantage_max": 1.5168475210666656, "advantage_mean": 1.862645371275562e-09, "advantage_min": -0.5961493253707886, "advantage_std": 0.8030089661478996, "completion_length": 2627.8333587646484, "epoch": 0.045714285714285714, "grad_norm": 0.1441635638475418, "kl": 0.0001430148258805275, "lambda_div_used": 0.6, "learning_rate": 8e-07, "loss": 0.0472, "reward": -0.02972988225519657, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.02972988225519657, "reward_after_std": 0.8030089475214481, "reward_before_mean": 0.3298732750117779, "reward_before_std": 0.7454587928950787, "reward_change_max": 0.0009643435478210449, "reward_change_mean": -0.35960315354168415, "reward_change_min": -0.7189357168972492, "reward_change_std": 0.27363201417028904, "reward_std": 0.8030089773237705, "rewards/cosine_scaled_reward": -0.06423003599047661, "rewards/format_reward": 0.4583333395421505, "step": 40 }, { "advantage_max": 1.5067855417728424, "advantage_mean": 1.9247333948868572e-08, "advantage_min": -0.6676128581166267, "advantage_std": 0.813670065253973, "completion_length": 3086.3959045410156, "epoch": 0.046857142857142854, "grad_norm": 0.1339729130268097, "kl": 3.9830803871154785e-05, "lambda_div_used": 0.6, "learning_rate": 8.199999999999999e-07, "loss": 0.0698, "reward": -0.2257232129049953, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.2257232129049953, "reward_after_std": 0.8136700578033924, "reward_before_mean": 0.026448657736182213, "reward_before_std": 0.8387161567807198, "reward_change_max": 0.0006530433893203735, "reward_change_mean": -0.2521718628704548, "reward_change_min": -0.6511830650269985, "reward_change_std": 0.24910242576152086, "reward_std": 0.8136700727045536, "rewards/cosine_scaled_reward": -0.15344234509393573, "rewards/format_reward": 0.33333334140479565, "step": 41 }, { "advantage_max": 0.8451377004384995, "advantage_mean": 1.6763806787167823e-08, "advantage_min": -0.3755408897995949, "advantage_std": 0.4480682760477066, "completion_length": 2820.666717529297, "epoch": 0.048, "grad_norm": 0.061046577990055084, "kl": 4.439055919647217e-05, "lambda_div_used": 0.6, "learning_rate": 8.399999999999999e-07, "loss": 0.0256, "reward": -0.43968756031244993, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.43968756031244993, "reward_after_std": 0.4480682760477066, "reward_before_mean": -0.23473063483834267, "reward_before_std": 0.41953370813280344, "reward_change_max": 0.0012653842568397522, "reward_change_mean": -0.204956928268075, "reward_change_min": -0.38732779771089554, "reward_change_std": 0.15360154025256634, "reward_std": 0.4480682946741581, "rewards/cosine_scaled_reward": -0.2736153192818165, "rewards/format_reward": 0.31250000186264515, "step": 42 }, { "advantage_max": 1.198561392724514, "advantage_mean": 1.3659398168108794e-08, "advantage_min": -0.6220279037952423, "advantage_std": 0.6821011230349541, "completion_length": 3063.6875762939453, "epoch": 0.04914285714285714, "grad_norm": 0.12261078506708145, "kl": 7.574260234832764e-05, "lambda_div_used": 0.6, "learning_rate": 8.599999999999999e-07, "loss": 0.0814, "reward": -0.2531822435557842, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.2531822435557842, "reward_after_std": 0.6821011155843735, "reward_before_mean": 0.01838504709303379, "reward_before_std": 0.7341145165264606, "reward_change_max": 0.001272149384021759, "reward_change_mean": -0.27156728971749544, "reward_change_min": -0.6277378126978874, "reward_change_std": 0.2601107247173786, "reward_std": 0.6821011193096638, "rewards/cosine_scaled_reward": -0.1262241369113326, "rewards/format_reward": 0.2708333358168602, "step": 43 }, { "advantage_max": 1.5276354625821114, "advantage_mean": -8.07146305348283e-09, "advantage_min": -0.7380058616399765, "advantage_std": 0.8330572284758091, "completion_length": 2725.2083740234375, "epoch": 0.05028571428571429, "grad_norm": 0.17938436567783356, "kl": 0.00020732451230287552, "lambda_div_used": 0.6, "learning_rate": 8.799999999999999e-07, "loss": 0.0918, "reward": -0.06681879423558712, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.06681879423558712, "reward_after_std": 0.8330572471022606, "reward_before_mean": 0.27059781178832054, "reward_before_std": 0.8473841995000839, "reward_change_max": 0.003085687756538391, "reward_change_mean": -0.33741660602390766, "reward_change_min": -0.7176680006086826, "reward_change_std": 0.28439064137637615, "reward_std": 0.8330572918057442, "rewards/cosine_scaled_reward": -0.06261776690371335, "rewards/format_reward": 0.3958333395421505, "step": 44 }, { "advantage_max": 1.1212849467992783, "advantage_mean": 9.934107758624577e-09, "advantage_min": -0.4997914358973503, "advantage_std": 0.6178267244249582, "completion_length": 3435.9791870117188, "epoch": 0.05142857142857143, "grad_norm": 0.10308519750833511, "kl": 8.343835361301899e-05, "lambda_div_used": 0.6, "learning_rate": 9e-07, "loss": 0.0177, "reward": -0.4351994302123785, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.4351994302123785, "reward_after_std": 0.6178267393261194, "reward_before_mean": -0.25422532111406326, "reward_before_std": 0.6577106714248657, "reward_change_max": 0.0012275278568267822, "reward_change_mean": -0.18097413005307317, "reward_change_min": -0.5485908165574074, "reward_change_std": 0.20650537125766277, "reward_std": 0.6178267672657967, "rewards/cosine_scaled_reward": -0.21044599390006624, "rewards/format_reward": 0.1666666716337204, "step": 45 }, { "advantage_max": 0.9788857847452164, "advantage_mean": 1.3659398612198004e-08, "advantage_min": -0.3992095962166786, "advantage_std": 0.5113865826278925, "completion_length": 3239.3125, "epoch": 0.052571428571428575, "grad_norm": 0.08502691239118576, "kl": 0.00022674724459648132, "lambda_div_used": 0.6, "learning_rate": 9.2e-07, "loss": -0.0064, "reward": -0.4699201360344887, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.4699201360344887, "reward_after_std": 0.5113865602761507, "reward_before_mean": -0.29295278526842594, "reward_before_std": 0.48858173191547394, "reward_change_max": 0.0021085962653160095, "reward_change_mean": -0.1769673554226756, "reward_change_min": -0.3407178223133087, "reward_change_std": 0.1363795893266797, "reward_std": 0.511386577039957, "rewards/cosine_scaled_reward": -0.22980972193181515, "rewards/format_reward": 0.1666666679084301, "step": 46 }, { "advantage_max": 1.38753230124712, "advantage_mean": -1.1175871117430347e-08, "advantage_min": -0.8971096090972424, "advantage_std": 0.831636069342494, "completion_length": 2720.6667251586914, "epoch": 0.053714285714285714, "grad_norm": 0.11728230863809586, "kl": 0.00011475756764411926, "lambda_div_used": 0.6, "learning_rate": 9.399999999999999e-07, "loss": 0.0287, "reward": 0.16340744495391846, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.16340744495391846, "reward_after_std": 0.831636056303978, "reward_before_mean": 0.6462972527369857, "reward_before_std": 0.9100265484303236, "reward_change_max": 0.000314466655254364, "reward_change_mean": -0.482889830134809, "reward_change_min": -0.8927135579288006, "reward_change_std": 0.4036479415372014, "reward_std": 0.831636093556881, "rewards/cosine_scaled_reward": 0.05231529846787453, "rewards/format_reward": 0.5416666734963655, "step": 47 }, { "advantage_max": 1.1383882090449333, "advantage_mean": 1.1175871117430347e-08, "advantage_min": -0.5719936788082123, "advantage_std": 0.62079693749547, "completion_length": 2835.3333740234375, "epoch": 0.054857142857142854, "grad_norm": 0.10334164649248123, "kl": 0.0005565360188484192, "lambda_div_used": 0.6, "learning_rate": 9.6e-07, "loss": 0.0129, "reward": -0.1817401812877506, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.1817401812877506, "reward_after_std": 0.6207969449460506, "reward_before_mean": 0.13473353162407875, "reward_before_std": 0.6070270985364914, "reward_change_max": 0.002810366451740265, "reward_change_mean": -0.3164736973121762, "reward_change_min": -0.6078220754861832, "reward_change_std": 0.23711966536939144, "reward_std": 0.6207969635725021, "rewards/cosine_scaled_reward": -0.1305499104782939, "rewards/format_reward": 0.39583333767950535, "step": 48 }, { "advantage_max": 1.3860639333724976, "advantage_mean": 1.4280280069556284e-08, "advantage_min": -0.6047784183174372, "advantage_std": 0.7415264807641506, "completion_length": 2556.1042098999023, "epoch": 0.056, "grad_norm": 0.10148799419403076, "kl": 0.0002539954148232937, "lambda_div_used": 0.6, "learning_rate": 9.8e-07, "loss": 0.058, "reward": -0.21707582101225853, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.21707582101225853, "reward_after_std": 0.7415264658629894, "reward_before_mean": 0.05638875346630812, "reward_before_std": 0.7281204629689455, "reward_change_max": 0.00105363130569458, "reward_change_mean": -0.2734645586460829, "reward_change_min": -0.574739396572113, "reward_change_std": 0.23461730778217316, "reward_std": 0.7415264882147312, "rewards/cosine_scaled_reward": -0.18013896653428674, "rewards/format_reward": 0.4166666716337204, "step": 49 }, { "advantage_max": 1.3879567012190819, "advantage_mean": -5.587935614226325e-09, "advantage_min": -0.6811681613326073, "advantage_std": 0.7566171064972878, "completion_length": 2899.0625228881836, "epoch": 0.05714285714285714, "grad_norm": 0.10399441421031952, "kl": 0.0004333890974521637, "lambda_div_used": 0.6, "learning_rate": 1e-06, "loss": 0.0508, "reward": -0.027900653425604105, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.027900653425604105, "reward_after_std": 0.756617084145546, "reward_before_mean": 0.3487557955086231, "reward_before_std": 0.7330795712769032, "reward_change_max": 0.002227097749710083, "reward_change_mean": -0.3766564065590501, "reward_change_min": -0.6959068775177002, "reward_change_std": 0.29322480224072933, "reward_std": 0.756617110222578, "rewards/cosine_scaled_reward": 0.007711221929639578, "rewards/format_reward": 0.3333333395421505, "step": 50 }, { "advantage_max": 1.3180744349956512, "advantage_mean": -2.1109978987077227e-08, "advantage_min": -0.5348929353058338, "advantage_std": 0.6934761293232441, "completion_length": 2412.520866394043, "epoch": 0.05828571428571429, "grad_norm": 0.11199980974197388, "kl": 0.001006007194519043, "lambda_div_used": 0.6, "learning_rate": 9.999890338174275e-07, "loss": 0.0196, "reward": -0.08373632282018661, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.08373632282018661, "reward_after_std": 0.6934761442244053, "reward_before_mean": 0.2698266333900392, "reward_before_std": 0.6321978941559792, "reward_change_max": 7.683038711547852e-05, "reward_change_mean": -0.3535629725083709, "reward_change_min": -0.6554724425077438, "reward_change_std": 0.2407732205465436, "reward_std": 0.6934761591255665, "rewards/cosine_scaled_reward": -0.115086690755561, "rewards/format_reward": 0.5000000037252903, "step": 51 }, { "advantage_max": 1.5817754976451397, "advantage_mean": 2.4835267176115394e-09, "advantage_min": -0.6358248367905617, "advantage_std": 0.8387469574809074, "completion_length": 2993.6875534057617, "epoch": 0.05942857142857143, "grad_norm": 0.11231490969657898, "kl": 0.001201428472995758, "lambda_div_used": 0.6, "learning_rate": 9.999561358041868e-07, "loss": 0.0178, "reward": -0.18798981048166752, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.18798981048166752, "reward_after_std": 0.8387469500303268, "reward_before_mean": 0.07809478137642145, "reward_before_std": 0.8421785943210125, "reward_change_max": 0.001999780535697937, "reward_change_mean": -0.266084595117718, "reward_change_min": -0.5899167135357857, "reward_change_std": 0.22805576538667083, "reward_std": 0.8387469574809074, "rewards/cosine_scaled_reward": -0.11720261455047876, "rewards/format_reward": 0.31250000558793545, "step": 52 }, { "advantage_max": 1.4530688673257828, "advantage_mean": -4.967053657267684e-09, "advantage_min": -0.6794195547699928, "advantage_std": 0.7901476360857487, "completion_length": 2776.8958587646484, "epoch": 0.060571428571428575, "grad_norm": 0.10073976218700409, "kl": 0.0005531087517738342, "lambda_div_used": 0.6, "learning_rate": 9.999013075636804e-07, "loss": 0.0231, "reward": 0.1613441277295351, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.1613441277295351, "reward_after_std": 0.7901476416736841, "reward_before_mean": 0.6379873100668192, "reward_before_std": 0.7355969715863466, "reward_change_max": 0.0003918856382369995, "reward_change_mean": -0.47664317348971963, "reward_change_min": -0.8067650347948074, "reward_change_std": 0.32972801849246025, "reward_std": 0.7901476863771677, "rewards/cosine_scaled_reward": 0.04816031642258167, "rewards/format_reward": 0.5416666697710752, "step": 53 }, { "advantage_max": 1.3110667504370213, "advantage_mean": -1.8005569923928988e-08, "advantage_min": -0.7371912263333797, "advantage_std": 0.7567489556968212, "completion_length": 2940.541732788086, "epoch": 0.061714285714285715, "grad_norm": 0.13281145691871643, "kl": 0.0002486109733581543, "lambda_div_used": 0.6, "learning_rate": 9.998245517681593e-07, "loss": 0.0728, "reward": 0.1529210014268756, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.1529210014268756, "reward_after_std": 0.7567489566281438, "reward_before_mean": 0.6375395655632019, "reward_before_std": 0.7690273942425847, "reward_change_max": 0.002032041549682617, "reward_change_mean": -0.48461859254166484, "reward_change_min": -0.8870695792138577, "reward_change_std": 0.3667809630278498, "reward_std": 0.7567489612847567, "rewards/cosine_scaled_reward": 0.07918643951416016, "rewards/format_reward": 0.4791666716337204, "step": 54 }, { "advantage_max": 1.3867616951465607, "advantage_mean": 1.2417635808503746e-09, "advantage_min": -0.6409769840538502, "advantage_std": 0.7622390799224377, "completion_length": 3007.229217529297, "epoch": 0.06285714285714286, "grad_norm": 0.11522038280963898, "kl": 0.0010178424417972565, "lambda_div_used": 0.6, "learning_rate": 9.997258721585931e-07, "loss": 0.0388, "reward": -0.011176850646734238, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.011176850646734238, "reward_after_std": 0.7622390948235989, "reward_before_mean": 0.37410153821110725, "reward_before_std": 0.7458370700478554, "reward_change_max": 0.0006311237812042236, "reward_change_mean": -0.3852783814072609, "reward_change_min": -0.6944508105516434, "reward_change_std": 0.2901465371251106, "reward_std": 0.7622391171753407, "rewards/cosine_scaled_reward": -0.00044924020767211914, "rewards/format_reward": 0.37500000558793545, "step": 55 }, { "advantage_max": 1.076439805328846, "advantage_mean": 1.30385160446167e-08, "advantage_min": -0.6210877783596516, "advantage_std": 0.6181384474039078, "completion_length": 3061.854217529297, "epoch": 0.064, "grad_norm": 0.10272736847400665, "kl": 0.0003896951675415039, "lambda_div_used": 0.6, "learning_rate": 9.996052735444862e-07, "loss": 0.0233, "reward": -0.15545228496193886, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.15545228496193886, "reward_after_std": 0.618138462305069, "reward_before_mean": 0.1853547152131796, "reward_before_std": 0.6448741275817156, "reward_change_max": 0.0, "reward_change_mean": -0.3408069796860218, "reward_change_min": -0.6736448742449284, "reward_change_std": 0.27636164613068104, "reward_std": 0.6181384846568108, "rewards/cosine_scaled_reward": -0.09482264146208763, "rewards/format_reward": 0.37500000558793545, "step": 56 }, { "advantage_max": 1.1007253751158714, "advantage_mean": 1.9868215073159945e-08, "advantage_min": -0.5071723088622093, "advantage_std": 0.5871976688504219, "completion_length": 3461.2083435058594, "epoch": 0.06514285714285714, "grad_norm": 0.08488260954618454, "kl": 0.0002168715000152588, "lambda_div_used": 0.6, "learning_rate": 9.994627618036452e-07, "loss": 0.0112, "reward": -0.39496668986976147, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.39496668986976147, "reward_after_std": 0.5871976688504219, "reward_before_mean": -0.18953094072639942, "reward_before_std": 0.5871923677623272, "reward_change_max": 0.00031384825706481934, "reward_change_mean": -0.2054357398301363, "reward_change_min": -0.42640750110149384, "reward_change_std": 0.17269728146493435, "reward_std": 0.5871977023780346, "rewards/cosine_scaled_reward": -0.19893213408067822, "rewards/format_reward": 0.20833334140479565, "step": 57 }, { "advantage_max": 1.5824204310774803, "advantage_mean": -4.03573105489663e-09, "advantage_min": -0.8778524771332741, "advantage_std": 0.8872012719511986, "completion_length": 2214.7917251586914, "epoch": 0.06628571428571428, "grad_norm": 0.13823264837265015, "kl": 0.0023623108863830566, "lambda_div_used": 0.6, "learning_rate": 9.992983438818915e-07, "loss": 0.0361, "reward": 0.3328779856674373, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.3328779856674373, "reward_after_std": 0.887201264500618, "reward_before_mean": 0.8885044003836811, "reward_before_std": 0.8764373175799847, "reward_change_max": 9.519606828689575e-05, "reward_change_mean": -0.5556264445185661, "reward_change_min": -1.002270046621561, "reward_change_std": 0.39555412344634533, "reward_std": 0.8872012794017792, "rewards/cosine_scaled_reward": 0.0692522106692195, "rewards/format_reward": 0.7500000074505806, "step": 58 }, { "advantage_max": 1.0460245832800865, "advantage_mean": -3.104408341503273e-09, "advantage_min": -0.6583354324102402, "advantage_std": 0.6132517829537392, "completion_length": 2964.625030517578, "epoch": 0.06742857142857143, "grad_norm": 0.11187250167131424, "kl": 0.0005678483285009861, "lambda_div_used": 0.6, "learning_rate": 9.991120277927223e-07, "loss": 0.0396, "reward": -0.23224904853850603, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.23224904853850603, "reward_after_std": 0.613251805305481, "reward_before_mean": 0.06915994361042976, "reward_before_std": 0.6662236377596855, "reward_change_max": 0.0012992247939109802, "reward_change_mean": -0.30140898609533906, "reward_change_min": -0.6153623089194298, "reward_change_std": 0.2672270992770791, "reward_std": 0.6132518127560616, "rewards/cosine_scaled_reward": -0.13208669982850552, "rewards/format_reward": 0.33333334140479565, "step": 59 }, { "advantage_max": 1.3265932314097881, "advantage_mean": 1.8005570145973593e-08, "advantage_min": -0.5170855298638344, "advantage_std": 0.7021647803485394, "completion_length": 3072.6041870117188, "epoch": 0.06857142857142857, "grad_norm": 0.10385601967573166, "kl": 0.0005446523427963257, "lambda_div_used": 0.6, "learning_rate": 9.989038226169207e-07, "loss": 0.0085, "reward": -0.2865824941545725, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.2865824941545725, "reward_after_std": 0.7021647542715073, "reward_before_mean": -0.047466689720749855, "reward_before_std": 0.6895656269043684, "reward_change_max": 0.001022636890411377, "reward_change_mean": -0.23911579558625817, "reward_change_min": -0.5408961176872253, "reward_change_std": 0.20852595707401633, "reward_std": 0.7021647542715073, "rewards/cosine_scaled_reward": -0.19040001556277275, "rewards/format_reward": 0.3333333358168602, "step": 60 }, { "advantage_max": 1.3558772429823875, "advantage_mean": 9.93410786964688e-09, "advantage_min": -0.5892483256757259, "advantage_std": 0.7217530123889446, "completion_length": 3244.791748046875, "epoch": 0.06971428571428571, "grad_norm": 0.1278354525566101, "kl": 0.0008572190999984741, "lambda_div_used": 0.6, "learning_rate": 9.98673738502114e-07, "loss": 0.0538, "reward": -0.23935077455826104, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.23935077455826104, "reward_after_std": 0.7217530133202672, "reward_before_mean": 0.02287537232041359, "reward_before_std": 0.7123109959065914, "reward_change_max": 0.0023524612188339233, "reward_change_mean": -0.26222612289711833, "reward_change_min": -0.5236610397696495, "reward_change_std": 0.217988062184304, "reward_std": 0.721753029152751, "rewards/cosine_scaled_reward": -0.1760623399168253, "rewards/format_reward": 0.3750000074505806, "step": 61 }, { "advantage_max": 1.7632023394107819, "advantage_mean": -4.656613122877573e-09, "advantage_min": -0.9446415901184082, "advantage_std": 1.0154646262526512, "completion_length": 2728.5625228881836, "epoch": 0.07085714285714285, "grad_norm": 0.12908360362052917, "kl": 0.0017460063099861145, "lambda_div_used": 0.6, "learning_rate": 9.98421786662277e-07, "loss": 0.0681, "reward": 0.21019299514591694, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.21019299514591694, "reward_after_std": 1.01546461135149, "reward_before_mean": 0.6783958990126848, "reward_before_std": 1.1028874181210995, "reward_change_max": 0.0018347501754760742, "reward_change_mean": -0.4682029206305742, "reward_change_min": -0.968503512442112, "reward_change_std": 0.4169737081974745, "reward_std": 1.015464648604393, "rewards/cosine_scaled_reward": 0.047531288117170334, "rewards/format_reward": 0.5833333395421505, "step": 62 }, { "advantage_max": 1.3256918042898178, "advantage_mean": -8.692344399818808e-09, "advantage_min": -0.7518532276153564, "advantage_std": 0.7779847048223019, "completion_length": 2488.1458740234375, "epoch": 0.072, "grad_norm": 0.14880503714084625, "kl": 0.002031862735748291, "lambda_div_used": 0.6, "learning_rate": 9.981479793771866e-07, "loss": 0.0716, "reward": 0.16536018857732415, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.16536018857732415, "reward_after_std": 0.7779847234487534, "reward_before_mean": 0.6565921977162361, "reward_before_std": 0.7892885077744722, "reward_change_max": 0.00045865029096603394, "reward_change_mean": -0.49123201705515385, "reward_change_min": -0.9217961169779301, "reward_change_std": 0.3878398798406124, "reward_std": 0.7779847532510757, "rewards/cosine_scaled_reward": 0.005379423499107361, "rewards/format_reward": 0.6458333358168602, "step": 63 }, { "advantage_max": 1.1469408124685287, "advantage_mean": -6.208820124697922e-10, "advantage_min": -0.7107248418033123, "advantage_std": 0.6794019639492035, "completion_length": 2977.8333740234375, "epoch": 0.07314285714285715, "grad_norm": 0.12988926470279694, "kl": 0.0029876232147216797, "lambda_div_used": 0.6, "learning_rate": 9.97852329991824e-07, "loss": 0.056, "reward": 0.05250055715441704, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.05250055715441704, "reward_after_std": 0.6794019490480423, "reward_before_mean": 0.49878887087106705, "reward_before_std": 0.6982266828417778, "reward_change_max": 0.0, "reward_change_mean": -0.4462883062660694, "reward_change_min": -0.7984127178788185, "reward_change_std": 0.34245668537914753, "reward_std": 0.6794019564986229, "rewards/cosine_scaled_reward": 0.04106108099222183, "rewards/format_reward": 0.41666667722165585, "step": 64 }, { "advantage_max": 1.1359502971172333, "advantage_mean": -6.208819014474898e-10, "advantage_min": -0.4511838797479868, "advantage_std": 0.5894279684871435, "completion_length": 2855.6458435058594, "epoch": 0.07428571428571429, "grad_norm": 0.09675120562314987, "kl": 0.0013644695281982422, "lambda_div_used": 0.6, "learning_rate": 9.975348529157229e-07, "loss": -0.0027, "reward": -0.13648290559649467, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.13648290559649467, "reward_after_std": 0.5894279796630144, "reward_before_mean": 0.20960132515756413, "reward_before_std": 0.5116656869649887, "reward_change_max": 0.000279448926448822, "reward_change_mean": -0.3460842249915004, "reward_change_min": -0.5561416335403919, "reward_change_std": 0.22418450471013784, "reward_std": 0.5894279852509499, "rewards/cosine_scaled_reward": -0.09311600960791111, "rewards/format_reward": 0.39583333395421505, "step": 65 }, { "advantage_max": 1.126391690224409, "advantage_mean": -4.3461716447978915e-09, "advantage_min": -0.4565571919083595, "advantage_std": 0.5905974991619587, "completion_length": 2221.833351135254, "epoch": 0.07542857142857143, "grad_norm": 0.05661759525537491, "kl": 0.001573324203491211, "lambda_div_used": 0.6, "learning_rate": 9.971955636222684e-07, "loss": -0.0186, "reward": 0.15690111927688122, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.15690111927688122, "reward_after_std": 0.5905974879860878, "reward_before_mean": 0.664361234754324, "reward_before_std": 0.4507758244872093, "reward_change_max": 0.0, "reward_change_mean": -0.5074600903317332, "reward_change_min": -0.7607763223350048, "reward_change_std": 0.2888161540031433, "reward_std": 0.5905974917113781, "rewards/cosine_scaled_reward": 0.08218058943748474, "rewards/format_reward": 0.5, "step": 66 }, { "advantage_max": 0.7976373434066772, "advantage_mean": 1.1175871117430347e-08, "advantage_min": -0.3243073895573616, "advantage_std": 0.41893285885453224, "completion_length": 3426.0416870117188, "epoch": 0.07657142857142857, "grad_norm": 0.06953131407499313, "kl": 0.0017848312854766846, "lambda_div_used": 0.6, "learning_rate": 9.968344786479415e-07, "loss": -0.0187, "reward": -0.590395949780941, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.590395949780941, "reward_after_std": 0.41893285512924194, "reward_before_mean": -0.460202120244503, "reward_before_std": 0.40765188075602055, "reward_change_max": 0.001080058515071869, "reward_change_mean": -0.1301938333781436, "reward_change_min": -0.26982005313038826, "reward_change_std": 0.11151221627369523, "reward_std": 0.41893286257982254, "rewards/cosine_scaled_reward": -0.3134343959391117, "rewards/format_reward": 0.1666666716337204, "step": 67 }, { "advantage_max": 1.418497547507286, "advantage_mean": 0.0, "advantage_min": -0.7557137459516525, "advantage_std": 0.8144563175737858, "completion_length": 2536.7709045410156, "epoch": 0.07771428571428571, "grad_norm": 0.16494296491146088, "kl": 0.0022640228271484375, "lambda_div_used": 0.6, "learning_rate": 9.964516155915151e-07, "loss": 0.0918, "reward": -0.015500393696129322, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.015500393696129322, "reward_after_std": 0.8144563212990761, "reward_before_mean": 0.36400349996984005, "reward_before_std": 0.8677889443933964, "reward_change_max": 0.0013982132077217102, "reward_change_mean": -0.3795038778334856, "reward_change_min": -0.7978788800537586, "reward_change_std": 0.32792997919023037, "reward_std": 0.8144563250243664, "rewards/cosine_scaled_reward": -0.07841494982130826, "rewards/format_reward": 0.5208333414047956, "step": 68 }, { "advantage_max": 0.9903690777719021, "advantage_mean": 1.614292521878724e-08, "advantage_min": -0.49220066517591476, "advantage_std": 0.5437542498111725, "completion_length": 2929.7500381469727, "epoch": 0.07885714285714286, "grad_norm": 0.09613347053527832, "kl": 0.0040988922119140625, "lambda_div_used": 0.6, "learning_rate": 9.960469931131936e-07, "loss": 0.0129, "reward": -0.3549748270306736, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.3549748270306736, "reward_after_std": 0.5437542349100113, "reward_before_mean": -0.11749381478875875, "reward_before_std": 0.5448034442961216, "reward_change_max": 0.0013570860028266907, "reward_change_mean": -0.23748101433739066, "reward_change_min": -0.5052936151623726, "reward_change_std": 0.19911040179431438, "reward_std": 0.5437542647123337, "rewards/cosine_scaled_reward": -0.23583024507388473, "rewards/format_reward": 0.35416667349636555, "step": 69 }, { "advantage_max": 1.2011119946837425, "advantage_mean": 9.313226134732844e-09, "advantage_min": -0.43334708362817764, "advantage_std": 0.616494245827198, "completion_length": 3077.083335876465, "epoch": 0.08, "grad_norm": 0.08861793577671051, "kl": 0.00223541259765625, "lambda_div_used": 0.6, "learning_rate": 9.956206309337066e-07, "loss": 0.0137, "reward": -0.30773347429931164, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.30773347429931164, "reward_after_std": 0.6164942122995853, "reward_before_mean": -0.06555812992155552, "reward_before_std": 0.5589950382709503, "reward_change_max": 0.0006213560700416565, "reward_change_mean": -0.24217534251511097, "reward_change_min": -0.40470268577337265, "reward_change_std": 0.15899777598679066, "reward_std": 0.6164942272007465, "rewards/cosine_scaled_reward": -0.18902906961739063, "rewards/format_reward": 0.31250000186264515, "step": 70 }, { "advantage_max": 1.255461797118187, "advantage_mean": 1.4901161637936866e-08, "advantage_min": -0.53419079631567, "advantage_std": 0.6641193814575672, "completion_length": 2718.2292098999023, "epoch": 0.08114285714285714, "grad_norm": 0.12506967782974243, "kl": 0.0030085816979408264, "lambda_div_used": 0.6, "learning_rate": 9.951725498333448e-07, "loss": 0.0502, "reward": -0.00627492368221283, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.00627492368221283, "reward_after_std": 0.6641193926334381, "reward_before_mean": 0.39759753830730915, "reward_before_std": 0.5728695411235094, "reward_change_max": 0.0, "reward_change_mean": -0.4038724033161998, "reward_change_min": -0.6584755666553974, "reward_change_std": 0.2662770180031657, "reward_std": 0.6641194075345993, "rewards/cosine_scaled_reward": -0.009534597164019942, "rewards/format_reward": 0.4166666679084301, "step": 71 }, { "advantage_max": 1.1922738291323185, "advantage_mean": 1.3659397890553038e-08, "advantage_min": -0.5972421020269394, "advantage_std": 0.6567165236920118, "completion_length": 3158.3958740234375, "epoch": 0.08228571428571428, "grad_norm": 0.0950314849615097, "kl": 0.006608843803405762, "lambda_div_used": 0.6, "learning_rate": 9.947027716509488e-07, "loss": 0.0336, "reward": -0.3057239428162575, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.3057239428162575, "reward_after_std": 0.6567165162414312, "reward_before_mean": -0.06156452978029847, "reward_before_std": 0.6831415146589279, "reward_change_max": 0.004357524216175079, "reward_change_mean": -0.2441594167612493, "reward_change_min": -0.5717665106058121, "reward_change_std": 0.23254990810528398, "reward_std": 0.6567165348678827, "rewards/cosine_scaled_reward": -0.15578227303922176, "rewards/format_reward": 0.2500000074505806, "step": 72 }, { "advantage_max": 1.1263316199183464, "advantage_mean": 1.9247333282734758e-08, "advantage_min": -0.4545093812048435, "advantage_std": 0.6076869647949934, "completion_length": 3519.8541870117188, "epoch": 0.08342857142857144, "grad_norm": 0.09696004539728165, "kl": 0.0008401870727539062, "lambda_div_used": 0.6, "learning_rate": 9.942113192828444e-07, "loss": 0.0412, "reward": -0.480790832079947, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.480790832079947, "reward_after_std": 0.6076869685202837, "reward_before_mean": -0.3260495774447918, "reward_before_std": 0.6348689384758472, "reward_change_max": 0.0018310844898223877, "reward_change_mean": -0.1547412471845746, "reward_change_min": -0.42637697234749794, "reward_change_std": 0.17630695179104805, "reward_std": 0.6076869815587997, "rewards/cosine_scaled_reward": -0.204691456630826, "rewards/format_reward": 0.0833333358168602, "step": 73 }, { "advantage_max": 1.400854118168354, "advantage_mean": 1.862645193639878e-08, "advantage_min": -0.6330046206712723, "advantage_std": 0.7717082761228085, "completion_length": 3311.854217529297, "epoch": 0.08457142857142858, "grad_norm": 0.13873428106307983, "kl": 0.002074897289276123, "lambda_div_used": 0.6, "learning_rate": 9.93698216681727e-07, "loss": 0.0536, "reward": -0.07318782806396484, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.07318782806396484, "reward_after_std": 0.7717083059251308, "reward_before_mean": 0.27929894998669624, "reward_before_std": 0.7782693542540073, "reward_change_max": 0.000939980149269104, "reward_change_mean": -0.35248675756156445, "reward_change_min": -0.6938531585037708, "reward_change_std": 0.29394845431670547, "reward_std": 0.7717083431780338, "rewards/cosine_scaled_reward": -0.006183858960866928, "rewards/format_reward": 0.29166666977107525, "step": 74 }, { "advantage_max": 1.1135927364230156, "advantage_mean": -2.793967834868738e-09, "advantage_min": -0.4344211630523205, "advantage_std": 0.5783823095262051, "completion_length": 3101.500045776367, "epoch": 0.08571428571428572, "grad_norm": 0.29595881700515747, "kl": 0.01248013973236084, "lambda_div_used": 0.6, "learning_rate": 9.931634888554935e-07, "loss": 0.0289, "reward": -0.1404849924147129, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.1404849924147129, "reward_after_std": 0.5783823020756245, "reward_before_mean": 0.20374970324337482, "reward_before_std": 0.4970117639750242, "reward_change_max": 0.0015685856342315674, "reward_change_mean": -0.3442347086966038, "reward_change_min": -0.5670420341193676, "reward_change_std": 0.21399930119514465, "reward_std": 0.5783823281526566, "rewards/cosine_scaled_reward": -0.08562515117228031, "rewards/format_reward": 0.37500000558793545, "step": 75 }, { "advantage_max": 1.2363538295030594, "advantage_mean": 2.048909714114089e-08, "advantage_min": -0.5451108776032925, "advantage_std": 0.6710482239723206, "completion_length": 2952.979248046875, "epoch": 0.08685714285714285, "grad_norm": 0.09811766445636749, "kl": 0.001201242208480835, "lambda_div_used": 0.6, "learning_rate": 9.926071618660237e-07, "loss": 0.0344, "reward": -0.23255194473313168, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.23255194473313168, "reward_after_std": 0.6710482239723206, "reward_before_mean": 0.046751671470701694, "reward_before_std": 0.6695912722498178, "reward_change_max": 0.00038267672061920166, "reward_change_mean": -0.2793036075308919, "reward_change_min": -0.600840475410223, "reward_change_std": 0.23344214539974928, "reward_std": 0.6710482500493526, "rewards/cosine_scaled_reward": -0.19537417870014906, "rewards/format_reward": 0.43750000558793545, "step": 76 }, { "advantage_max": 1.0073203220963478, "advantage_mean": -3.1044082304809706e-09, "advantage_min": -0.5661797672510147, "advantage_std": 0.5656730942428112, "completion_length": 3245.1666717529297, "epoch": 0.088, "grad_norm": 0.08500754833221436, "kl": 0.0013560652732849121, "lambda_div_used": 0.6, "learning_rate": 9.9202926282791e-07, "loss": -0.0149, "reward": -0.2388194277882576, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.2388194277882576, "reward_after_std": 0.5656730942428112, "reward_before_mean": 0.06565772742033005, "reward_before_std": 0.5782887861132622, "reward_change_max": 0.00035362690687179565, "reward_change_mean": -0.30447718035429716, "reward_change_min": -0.5347645282745361, "reward_change_std": 0.22534830961376429, "reward_std": 0.5656730979681015, "rewards/cosine_scaled_reward": -0.11300447443500161, "rewards/format_reward": 0.29166666977107525, "step": 77 }, { "advantage_max": 1.1894551888108253, "advantage_mean": -1.3659397946064189e-08, "advantage_min": -0.7126917093992233, "advantage_std": 0.6707145310938358, "completion_length": 3217.8958740234375, "epoch": 0.08914285714285715, "grad_norm": 0.0936238095164299, "kl": 0.0016173124313354492, "lambda_div_used": 0.6, "learning_rate": 9.91429819907136e-07, "loss": 0.0368, "reward": -0.13307934533804655, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.13307934533804655, "reward_after_std": 0.6707145348191261, "reward_before_mean": 0.20870637288317084, "reward_before_std": 0.6934996079653502, "reward_change_max": 3.166496753692627e-05, "reward_change_mean": -0.34178573824465275, "reward_change_min": -0.6631423011422157, "reward_change_std": 0.2715051304548979, "reward_std": 0.6707145571708679, "rewards/cosine_scaled_reward": -0.051896817167289555, "rewards/format_reward": 0.31250000931322575, "step": 78 }, { "advantage_max": 1.5434688106179237, "advantage_mean": -3.1044088966147854e-09, "advantage_min": -0.6260135676711798, "advantage_std": 0.8081723116338253, "completion_length": 2341.9167098999023, "epoch": 0.09028571428571429, "grad_norm": 0.14235326647758484, "kl": 0.018215656280517578, "lambda_div_used": 0.6, "learning_rate": 9.908088623197048e-07, "loss": 0.0089, "reward": 0.02448425441980362, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.02448425441980362, "reward_after_std": 0.808172345161438, "reward_before_mean": 0.41002992913126945, "reward_before_std": 0.7505143824964762, "reward_change_max": 0.000287078320980072, "reward_change_mean": -0.385545676574111, "reward_change_min": -0.648455660790205, "reward_change_std": 0.25417708698660135, "reward_std": 0.8081723637878895, "rewards/cosine_scaled_reward": -0.06581836566329002, "rewards/format_reward": 0.5416666679084301, "step": 79 }, { "advantage_max": 1.3301952853798866, "advantage_mean": -2.793967474046255e-09, "advantage_min": -0.7484759241342545, "advantage_std": 0.7691389471292496, "completion_length": 3285.125, "epoch": 0.09142857142857143, "grad_norm": 0.11320678889751434, "kl": 0.0028287172317504883, "lambda_div_used": 0.6, "learning_rate": 9.901664203302124e-07, "loss": 0.0183, "reward": -0.1481735883280635, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.1481735883280635, "reward_after_std": 0.7691389322280884, "reward_before_mean": 0.16942895017564297, "reward_before_std": 0.8429461754858494, "reward_change_max": 0.0006430298089981079, "reward_change_mean": -0.317602532915771, "reward_change_min": -0.7648013271391392, "reward_change_std": 0.31389318499714136, "reward_std": 0.7691389322280884, "rewards/cosine_scaled_reward": -0.09236887097358704, "rewards/format_reward": 0.35416667349636555, "step": 80 }, { "advantage_max": 1.0307377986609936, "advantage_mean": 1.0554990104161277e-08, "advantage_min": -0.5289515219628811, "advantage_std": 0.568504374474287, "completion_length": 3086.416679382324, "epoch": 0.09257142857142857, "grad_norm": 0.11514487117528915, "kl": 0.0069026947021484375, "lambda_div_used": 0.6, "learning_rate": 9.895025252503755e-07, "loss": 0.0102, "reward": -0.19615141674876213, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.19615141674876213, "reward_after_std": 0.568504374474287, "reward_before_mean": 0.12522562127560377, "reward_before_std": 0.5510979443788528, "reward_change_max": 0.0006910786032676697, "reward_change_mean": -0.32137702871114016, "reward_change_min": -0.6005649529397488, "reward_change_std": 0.23301844112575054, "reward_std": 0.568504374474287, "rewards/cosine_scaled_reward": -0.0936371935531497, "rewards/format_reward": 0.31250000558793545, "step": 81 }, { "advantage_max": 1.2566718012094498, "advantage_mean": 1.0554989271494009e-08, "advantage_min": -0.5156496614217758, "advantage_std": 0.6929546110332012, "completion_length": 2985.7708435058594, "epoch": 0.09371428571428571, "grad_norm": 0.09941416233778, "kl": 0.002816915512084961, "lambda_div_used": 0.6, "learning_rate": 9.888172094375033e-07, "loss": 0.0118, "reward": -0.052963174879550934, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.052963174879550934, "reward_after_std": 0.6929546073079109, "reward_before_mean": 0.32549452036619186, "reward_before_std": 0.671553336083889, "reward_change_max": 0.0007572099566459656, "reward_change_mean": -0.37845765706151724, "reward_change_min": -0.8261243067681789, "reward_change_std": 0.30112940445542336, "reward_std": 0.692954633384943, "rewards/cosine_scaled_reward": -0.024752754718065262, "rewards/format_reward": 0.3750000037252903, "step": 82 }, { "advantage_max": 0.7631809785962105, "advantage_mean": -4.967054156868045e-09, "advantage_min": -0.43150240182876587, "advantage_std": 0.4299459084868431, "completion_length": 2912.8333435058594, "epoch": 0.09485714285714286, "grad_norm": 0.04903676360845566, "kl": 0.002282381057739258, "lambda_div_used": 0.6, "learning_rate": 9.881105062929221e-07, "loss": 0.0038, "reward": -0.3834444247186184, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.3834444247186184, "reward_after_std": 0.4299459159374237, "reward_before_mean": -0.1346133127808571, "reward_before_std": 0.4388551339507103, "reward_change_max": 0.0004472807049751282, "reward_change_mean": -0.2488311375491321, "reward_change_min": -0.45463940128684044, "reward_change_std": 0.18461166438646615, "reward_std": 0.4299459271132946, "rewards/cosine_scaled_reward": -0.19230665266513824, "rewards/format_reward": 0.25, "step": 83 }, { "advantage_max": 1.642131645232439, "advantage_mean": -2.1109978320943412e-08, "advantage_min": -0.870296873152256, "advantage_std": 0.9153857529163361, "completion_length": 3176.7291717529297, "epoch": 0.096, "grad_norm": 0.142425537109375, "kl": 0.0012586116790771484, "lambda_div_used": 0.6, "learning_rate": 9.873824502603459e-07, "loss": 0.0224, "reward": 0.1766932848840952, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.1766932848840952, "reward_after_std": 0.9153857212513685, "reward_before_mean": 0.6397379375994205, "reward_before_std": 0.9262058734893799, "reward_change_max": 0.0005095526576042175, "reward_change_mean": -0.463044673204422, "reward_change_min": -0.879260279238224, "reward_change_std": 0.361559247598052, "reward_std": 0.9153857212513685, "rewards/cosine_scaled_reward": 0.11153563484549522, "rewards/format_reward": 0.41666667722165585, "step": 84 }, { "advantage_max": 2.0070881322026253, "advantage_mean": -9.313226023710541e-09, "advantage_min": -0.9783229231834412, "advantage_std": 1.1386385187506676, "completion_length": 3160.2500762939453, "epoch": 0.09714285714285714, "grad_norm": 0.1873876303434372, "kl": 0.0019626617431640625, "lambda_div_used": 0.6, "learning_rate": 9.866330768241983e-07, "loss": 0.0419, "reward": 0.12494198745116591, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.12494198745116591, "reward_after_std": 1.1386385038495064, "reward_before_mean": 0.5180581012973562, "reward_before_std": 1.2345999665558338, "reward_change_max": 0.0020033270120620728, "reward_change_mean": -0.39311612769961357, "reward_change_min": -1.0354369431734085, "reward_change_std": 0.4217766746878624, "reward_std": 1.1386385075747967, "rewards/cosine_scaled_reward": 0.009029048029333353, "rewards/format_reward": 0.5000000074505806, "step": 85 }, { "advantage_max": 1.3539377562701702, "advantage_mean": 5.587935669737476e-09, "advantage_min": -0.703910693526268, "advantage_std": 0.7683831937611103, "completion_length": 3129.7083435058594, "epoch": 0.09828571428571428, "grad_norm": 0.12810629606246948, "kl": 0.002894878387451172, "lambda_div_used": 0.6, "learning_rate": 9.85862422507884e-07, "loss": 0.0355, "reward": -0.15509214252233505, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.15509214252233505, "reward_after_std": 0.7683831639587879, "reward_before_mean": 0.1555484477430582, "reward_before_std": 0.823885153979063, "reward_change_max": 0.0, "reward_change_mean": -0.310640599578619, "reward_change_min": -0.7123654522001743, "reward_change_std": 0.29474045149981976, "reward_std": 0.7683831863105297, "rewards/cosine_scaled_reward": -0.08889244613237679, "rewards/format_reward": 0.3333333395421505, "step": 86 }, { "advantage_max": 1.1467025578022003, "advantage_mean": 5.587935725248627e-09, "advantage_min": -0.7277822308242321, "advantage_std": 0.6753969453275204, "completion_length": 2852.041717529297, "epoch": 0.09942857142857142, "grad_norm": 0.1169510930776596, "kl": 0.0055196285247802734, "lambda_div_used": 0.6, "learning_rate": 9.850705248720068e-07, "loss": 0.0398, "reward": -0.0842017037793994, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.0842017037793994, "reward_after_std": 0.6753969304263592, "reward_before_mean": 0.2875655069947243, "reward_before_std": 0.7284983061254025, "reward_change_max": 0.000680871307849884, "reward_change_mean": -0.3717672023922205, "reward_change_min": -0.7417283318936825, "reward_change_std": 0.3114555370993912, "reward_std": 0.6753969639539719, "rewards/cosine_scaled_reward": -0.08538391441106796, "rewards/format_reward": 0.4583333469927311, "step": 87 }, { "advantage_max": 1.8067155107855797, "advantage_mean": -1.2417635808503746e-09, "advantage_min": -0.9640210494399071, "advantage_std": 0.9902556464076042, "completion_length": 2936.8750610351562, "epoch": 0.10057142857142858, "grad_norm": 0.2037220597267151, "kl": 0.010509967803955078, "lambda_div_used": 0.6, "learning_rate": 9.8425742251254e-07, "loss": 0.1151, "reward": 0.12938768789172173, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.12938768789172173, "reward_after_std": 0.9902556501328945, "reward_before_mean": 0.5470981888938695, "reward_before_std": 1.0180102176964283, "reward_change_max": 0.0006925985217094421, "reward_change_mean": -0.417710492387414, "reward_change_min": -0.8149654679000378, "reward_change_std": 0.34481045603752136, "reward_std": 0.9902556911110878, "rewards/cosine_scaled_reward": 0.054799098521471024, "rewards/format_reward": 0.4375000149011612, "step": 88 }, { "advantage_max": 1.1965279504656792, "advantage_mean": 6.829699361610153e-09, "advantage_min": -0.6220619156956673, "advantage_std": 0.6542908251285553, "completion_length": 3258.0416870117188, "epoch": 0.10171428571428572, "grad_norm": 0.12059728056192398, "kl": 0.004296541213989258, "lambda_div_used": 0.6, "learning_rate": 9.83423155058946e-07, "loss": 0.0663, "reward": -0.30891418643295765, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.30891418643295765, "reward_after_std": 0.654290821403265, "reward_before_mean": -0.06621248135343194, "reward_before_std": 0.6760115548968315, "reward_change_max": 0.0009444355964660645, "reward_change_mean": -0.24270169623196125, "reward_change_min": -0.48976127430796623, "reward_change_std": 0.20589512679725885, "reward_std": 0.6542908400297165, "rewards/cosine_scaled_reward": -0.12685624696314335, "rewards/format_reward": 0.1875000074505806, "step": 89 }, { "advantage_max": 1.4118667095899582, "advantage_mean": 3.1044083970144243e-09, "advantage_min": -0.5967099294066429, "advantage_std": 0.7452210150659084, "completion_length": 2658.6250610351562, "epoch": 0.10285714285714286, "grad_norm": 0.12070947885513306, "kl": 0.009579658508300781, "lambda_div_used": 0.6, "learning_rate": 9.825677631722435e-07, "loss": 0.0573, "reward": -0.15686163678765297, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.15686163678765297, "reward_after_std": 0.7452210113406181, "reward_before_mean": 0.14455214142799377, "reward_before_std": 0.7128091156482697, "reward_change_max": 0.0, "reward_change_mean": -0.3014137726277113, "reward_change_min": -0.6083030439913273, "reward_change_std": 0.23668606020510197, "reward_std": 0.7452210150659084, "rewards/cosine_scaled_reward": -0.15689060185104609, "rewards/format_reward": 0.4583333469927311, "step": 90 }, { "advantage_max": 0.9449239484965801, "advantage_mean": 6.208819014474898e-10, "advantage_min": -0.6252702437341213, "advantage_std": 0.5788875985890627, "completion_length": 3106.6458587646484, "epoch": 0.104, "grad_norm": 0.10415603965520859, "kl": 0.004826545715332031, "lambda_div_used": 0.6, "learning_rate": 9.816912885430258e-07, "loss": 0.02, "reward": -0.166163869202137, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.166163869202137, "reward_after_std": 0.5788875948637724, "reward_before_mean": 0.18202120624482632, "reward_before_std": 0.6361364722251892, "reward_change_max": 0.001609407365322113, "reward_change_mean": -0.3481850866228342, "reward_change_min": -0.6712036058306694, "reward_change_std": 0.2946196533739567, "reward_std": 0.5788875967264175, "rewards/cosine_scaled_reward": -0.06523939780890942, "rewards/format_reward": 0.31250000558793545, "step": 91 }, { "advantage_max": 1.4620023369789124, "advantage_mean": 5.551115123125783e-16, "advantage_min": -0.7397854886949062, "advantage_std": 0.8217831328511238, "completion_length": 2918.000015258789, "epoch": 0.10514285714285715, "grad_norm": 0.14798514544963837, "kl": 0.011157512664794922, "lambda_div_used": 0.6, "learning_rate": 9.807937738894303e-07, "loss": 0.0368, "reward": -0.06641942448914051, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.06641942448914051, "reward_after_std": 0.8217831626534462, "reward_before_mean": 0.28103411523625255, "reward_before_std": 0.8633438013494015, "reward_change_max": 0.0023638010025024414, "reward_change_mean": -0.3474535522982478, "reward_change_min": -0.7106242850422859, "reward_change_std": 0.2981195440515876, "reward_std": 0.8217831663787365, "rewards/cosine_scaled_reward": -0.06781627610325813, "rewards/format_reward": 0.4166666753590107, "step": 92 }, { "advantage_max": 0.7241514772176743, "advantage_mean": 2.110997915361068e-08, "advantage_min": -0.3528713136911392, "advantage_std": 0.4079975299537182, "completion_length": 3430.187530517578, "epoch": 0.10628571428571429, "grad_norm": 0.08644148707389832, "kl": 0.0048465728759765625, "lambda_div_used": 0.6, "learning_rate": 9.798752629550546e-07, "loss": 0.0291, "reward": -0.5372234713286161, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.5372234713286161, "reward_after_std": 0.4079975299537182, "reward_before_mean": -0.3706090720370412, "reward_before_std": 0.4305304940789938, "reward_change_max": 0.0016954094171524048, "reward_change_mean": -0.16661439649760723, "reward_change_min": -0.40895063802599907, "reward_change_std": 0.15682219434529543, "reward_std": 0.4079975336790085, "rewards/cosine_scaled_reward": -0.23738786298781633, "rewards/format_reward": 0.10416666977107525, "step": 93 }, { "advantage_max": 0.8936146646738052, "advantage_mean": 1.80055704790405e-08, "advantage_min": -0.3542574942111969, "advantage_std": 0.47137969732284546, "completion_length": 3167.958335876465, "epoch": 0.10742857142857143, "grad_norm": 0.07212722301483154, "kl": 0.009177207946777344, "lambda_div_used": 0.6, "learning_rate": 9.78935800506826e-07, "loss": 0.0105, "reward": -0.2902873866260052, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.2902873866260052, "reward_after_std": 0.47137970849871635, "reward_before_mean": -0.003107912838459015, "reward_before_std": 0.4016598341986537, "reward_change_max": 0.0, "reward_change_mean": -0.28717945888638496, "reward_change_min": -0.49013088271021843, "reward_change_std": 0.18906648084521294, "reward_std": 0.47137971967458725, "rewards/cosine_scaled_reward": -0.0953039638698101, "rewards/format_reward": 0.18750000186264515, "step": 94 }, { "advantage_max": 1.1060735136270523, "advantage_mean": 1.2417635031347629e-08, "advantage_min": -0.5495634824037552, "advantage_std": 0.6076685786247253, "completion_length": 3469.125030517578, "epoch": 0.10857142857142857, "grad_norm": 0.09135206788778305, "kl": 0.0022430419921875, "lambda_div_used": 0.6, "learning_rate": 9.779754323328192e-07, "loss": 0.0086, "reward": -0.40138646960258484, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.40138646960258484, "reward_after_std": 0.607668599113822, "reward_before_mean": -0.20004860311746597, "reward_before_std": 0.6348323151469231, "reward_change_max": 0.0011594444513320923, "reward_change_mean": -0.20133786648511887, "reward_change_min": -0.4667600132524967, "reward_change_std": 0.20126637630164623, "reward_std": 0.6076686009764671, "rewards/cosine_scaled_reward": -0.19377430342137814, "rewards/format_reward": 0.1875000074505806, "step": 95 }, { "advantage_max": 1.2968962267041206, "advantage_mean": 3.1044084525255755e-09, "advantage_min": -0.5794485211372375, "advantage_std": 0.7141258921474218, "completion_length": 2919.187515258789, "epoch": 0.10971428571428571, "grad_norm": 0.13126961886882782, "kl": 0.007322788238525391, "lambda_div_used": 0.6, "learning_rate": 9.769942052400235e-07, "loss": 0.0576, "reward": -0.05010135751217604, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.05010135751217604, "reward_after_std": 0.7141258884221315, "reward_before_mean": 0.32372894510626793, "reward_before_std": 0.6927786152809858, "reward_change_max": 0.0015667006373405457, "reward_change_mean": -0.37383024860173464, "reward_change_min": -0.7615102715790272, "reward_change_std": 0.30468545015901327, "reward_std": 0.714125907048583, "rewards/cosine_scaled_reward": -0.015218888409435749, "rewards/format_reward": 0.3541666679084301, "step": 96 }, { "advantage_max": 1.2440862655639648, "advantage_mean": 2.4835267176115394e-09, "advantage_min": -0.6283608749508858, "advantage_std": 0.6912339441478252, "completion_length": 3318.3959045410156, "epoch": 0.11085714285714286, "grad_norm": 0.13034504652023315, "kl": 0.003994464874267578, "lambda_div_used": 0.6, "learning_rate": 9.759921670520634e-07, "loss": 0.0228, "reward": -0.2075312975794077, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.2075312975794077, "reward_after_std": 0.6912339385598898, "reward_before_mean": 0.08721866272389889, "reward_before_std": 0.7230119798332453, "reward_change_max": 0.000818297266960144, "reward_change_mean": -0.29474993934854865, "reward_change_min": -0.6336596198379993, "reward_change_std": 0.2503976479638368, "reward_std": 0.691233953461051, "rewards/cosine_scaled_reward": -0.09180734679102898, "rewards/format_reward": 0.2708333432674408, "step": 97 }, { "advantage_max": 1.195045854896307, "advantage_mean": 9.31322596819939e-09, "advantage_min": -0.535729430615902, "advantage_std": 0.6331109032034874, "completion_length": 3111.5208740234375, "epoch": 0.112, "grad_norm": 0.1344451755285263, "kl": 0.0039520263671875, "lambda_div_used": 0.6, "learning_rate": 9.749693666068663e-07, "loss": 0.0187, "reward": -0.11401326581835747, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.11401326581835747, "reward_after_std": 0.6331108994781971, "reward_before_mean": 0.23613072000443935, "reward_before_std": 0.5695687290281057, "reward_change_max": 0.0017473921179771423, "reward_change_mean": -0.35014398489147425, "reward_change_min": -0.6171726249158382, "reward_change_std": 0.24364902451634407, "reward_std": 0.6331109274178743, "rewards/cosine_scaled_reward": -0.0694346446543932, "rewards/format_reward": 0.3750000111758709, "step": 98 }, { "advantage_max": 1.1984662860631943, "advantage_mean": -2.048909669705168e-08, "advantage_min": -0.555856853723526, "advantage_std": 0.6488207280635834, "completion_length": 2897.708366394043, "epoch": 0.11314285714285714, "grad_norm": 0.11851833760738373, "kl": 0.004616737365722656, "lambda_div_used": 0.6, "learning_rate": 9.739258537542835e-07, "loss": 0.014, "reward": -0.15122101828455925, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.15122101828455925, "reward_after_std": 0.6488207206130028, "reward_before_mean": 0.17770473286509514, "reward_before_std": 0.6168424002826214, "reward_change_max": 0.0008178576827049255, "reward_change_mean": -0.3289257613942027, "reward_change_min": -0.6208729110658169, "reward_change_std": 0.25092875584959984, "reward_std": 0.6488207466900349, "rewards/cosine_scaled_reward": -0.06739764660596848, "rewards/format_reward": 0.31250000558793545, "step": 99 }, { "advantage_max": 1.8971523717045784, "advantage_mean": 1.614292521878724e-08, "advantage_min": -0.8356616273522377, "advantage_std": 1.0355449803173542, "completion_length": 2997.666717529297, "epoch": 0.11428571428571428, "grad_norm": 0.15601147711277008, "kl": 0.0081787109375, "lambda_div_used": 0.6, "learning_rate": 9.728616793536587e-07, "loss": 0.0539, "reward": 0.05242241080850363, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.05242241080850363, "reward_after_std": 1.035545002669096, "reward_before_mean": 0.417850723490119, "reward_before_std": 1.0788166895508766, "reward_change_max": 0.0008053779602050781, "reward_change_mean": -0.36542829871177673, "reward_change_min": -0.8287406712770462, "reward_change_std": 0.33487732522189617, "reward_std": 1.0355450212955475, "rewards/cosine_scaled_reward": 0.011008680099621415, "rewards/format_reward": 0.3958333432674408, "step": 100 }, { "advantage_max": 0.9307895302772522, "advantage_mean": 6.208816238917336e-10, "advantage_min": -0.3656721208244562, "advantage_std": 0.4907538667321205, "completion_length": 2841.375015258789, "epoch": 0.11542857142857142, "grad_norm": 0.12105463445186615, "kl": 0.004067897796630859, "lambda_div_used": 0.6, "learning_rate": 9.717768952713511e-07, "loss": -0.012, "reward": -0.17625251971185207, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.17625251971185207, "reward_after_std": 0.4907538667321205, "reward_before_mean": 0.16911687143146992, "reward_before_std": 0.4058742057532072, "reward_change_max": 0.0002156868577003479, "reward_change_mean": -0.3453693939372897, "reward_change_min": -0.6092492565512657, "reward_change_std": 0.21868636459112167, "reward_std": 0.4907538704574108, "rewards/cosine_scaled_reward": -0.11335823312401772, "rewards/format_reward": 0.3958333395421505, "step": 101 }, { "advantage_max": 1.196364901959896, "advantage_mean": 1.8626449826975033e-09, "advantage_min": -0.5503832846879959, "advantage_std": 0.6417879574000835, "completion_length": 2956.0209045410156, "epoch": 0.11657142857142858, "grad_norm": 0.12102524191141129, "kl": 0.0120391845703125, "lambda_div_used": 0.6, "learning_rate": 9.706715543782064e-07, "loss": 0.0616, "reward": -0.026549585163593292, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.026549585163593292, "reward_after_std": 0.6417879462242126, "reward_before_mean": 0.3726413017138839, "reward_before_std": 0.5742765087634325, "reward_change_max": 0.0007911324501037598, "reward_change_mean": -0.3991908668540418, "reward_change_min": -0.7056144215166569, "reward_change_std": 0.27402982767671347, "reward_std": 0.6417879611253738, "rewards/cosine_scaled_reward": -0.03242936171591282, "rewards/format_reward": 0.43750000931322575, "step": 102 }, { "advantage_max": 1.367197409272194, "advantage_mean": 2.483527050678447e-09, "advantage_min": -0.724069282412529, "advantage_std": 0.7786473240703344, "completion_length": 2909.666717529297, "epoch": 0.11771428571428572, "grad_norm": 0.1296321302652359, "kl": 0.010121822357177734, "lambda_div_used": 0.6, "learning_rate": 9.695457105469804e-07, "loss": 0.0546, "reward": -0.09430141933262348, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.09430141933262348, "reward_after_std": 0.7786473240703344, "reward_before_mean": 0.24636715522501618, "reward_before_std": 0.8315681144595146, "reward_change_max": 0.00037298351526260376, "reward_change_mean": -0.34066856699064374, "reward_change_min": -0.7915425300598145, "reward_change_std": 0.30810489458963275, "reward_std": 0.7786473408341408, "rewards/cosine_scaled_reward": -0.10598308267071843, "rewards/format_reward": 0.4583333469927311, "step": 103 }, { "advantage_max": 1.1343006566166878, "advantage_mean": 1.4280279514444771e-08, "advantage_min": -0.628552533686161, "advantage_std": 0.6272084675729275, "completion_length": 2978.1666870117188, "epoch": 0.11885714285714286, "grad_norm": 0.09671797603368759, "kl": 0.005688667297363281, "lambda_div_used": 0.6, "learning_rate": 9.683994186497132e-07, "loss": 0.0212, "reward": -0.17992812395095825, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.17992812395095825, "reward_after_std": 0.6272084899246693, "reward_before_mean": 0.14061287976801395, "reward_before_std": 0.6303292363882065, "reward_change_max": 0.0005879104137420654, "reward_change_mean": -0.3205409971997142, "reward_change_min": -0.5868665352463722, "reward_change_std": 0.24384531751275063, "reward_std": 0.6272085160017014, "rewards/cosine_scaled_reward": -0.1171935647726059, "rewards/format_reward": 0.3750000111758709, "step": 104 }, { "advantage_max": 1.7707880921661854, "advantage_mean": 3.7252904094842165e-09, "advantage_min": -0.7792215496301651, "advantage_std": 0.9740794487297535, "completion_length": 3094.437545776367, "epoch": 0.12, "grad_norm": 0.14219453930854797, "kl": 0.005229949951171875, "lambda_div_used": 0.6, "learning_rate": 9.672327345550543e-07, "loss": 0.0383, "reward": -0.00444754958152771, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.00444754958152771, "reward_after_std": 0.9740794561803341, "reward_before_mean": 0.34283736534416676, "reward_before_std": 1.0192876607179642, "reward_change_max": 0.0017241165041923523, "reward_change_mean": -0.3472848879173398, "reward_change_min": -0.8471212461590767, "reward_change_std": 0.3292017253115773, "reward_std": 0.9740794897079468, "rewards/cosine_scaled_reward": -0.0160813401453197, "rewards/format_reward": 0.3750000074505806, "step": 105 }, { "advantage_max": 1.506701335310936, "advantage_mean": -9.934107536579972e-09, "advantage_min": -0.7705876715481281, "advantage_std": 0.8312099389731884, "completion_length": 2482.770927429199, "epoch": 0.12114285714285715, "grad_norm": 0.11971443891525269, "kl": 0.0063114166259765625, "lambda_div_used": 0.6, "learning_rate": 9.66045715125541e-07, "loss": 0.0545, "reward": 0.44777741096913815, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.44777741096913815, "reward_after_std": 0.8312099389731884, "reward_before_mean": 1.0764255952090025, "reward_before_std": 0.735991109162569, "reward_change_max": 0.0008112713694572449, "reward_change_mean": -0.6286481656134129, "reward_change_min": -1.0969063863158226, "reward_change_std": 0.4263636786490679, "reward_std": 0.8312099538743496, "rewards/cosine_scaled_reward": 0.20487945154309273, "rewards/format_reward": 0.666666679084301, "step": 106 }, { "advantage_max": 1.063334859907627, "advantage_mean": 2.3593505593666464e-08, "advantage_min": -0.5757645852863789, "advantage_std": 0.6022264622151852, "completion_length": 2857.020866394043, "epoch": 0.12228571428571429, "grad_norm": 0.10165040194988251, "kl": 0.0062084197998046875, "lambda_div_used": 0.6, "learning_rate": 9.648384182148252e-07, "loss": 0.052, "reward": -0.16400642041116953, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.16400642041116953, "reward_after_std": 0.6022264696657658, "reward_before_mean": 0.17313838889822364, "reward_before_std": 0.6170230749994516, "reward_change_max": 0.00020002573728561401, "reward_change_mean": -0.3371447781100869, "reward_change_min": -0.6786707863211632, "reward_change_std": 0.2606902029365301, "reward_std": 0.6022264733910561, "rewards/cosine_scaled_reward": -0.12176414579153061, "rewards/format_reward": 0.41666667722165585, "step": 107 }, { "advantage_max": 1.5820023342967033, "advantage_mean": 9.313225857177088e-09, "advantage_min": -0.7522710785269737, "advantage_std": 0.8811581470072269, "completion_length": 2883.0626068115234, "epoch": 0.12342857142857143, "grad_norm": 1.1351174116134644, "kl": 0.04792070388793945, "lambda_div_used": 0.6, "learning_rate": 9.636109026648554e-07, "loss": 0.0616, "reward": -0.11980252992361784, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.11980252992361784, "reward_after_std": 0.8811581172049046, "reward_before_mean": 0.18240957101806998, "reward_before_std": 0.9268815889954567, "reward_change_max": 0.002282187342643738, "reward_change_mean": -0.30221210699528456, "reward_change_min": -0.6947237513959408, "reward_change_std": 0.30414726212620735, "reward_std": 0.8811581209301949, "rewards/cosine_scaled_reward": -0.11712856311351061, "rewards/format_reward": 0.416666679084301, "step": 108 }, { "advantage_max": 0.8597342558205128, "advantage_mean": 1.738468857759301e-08, "advantage_min": -0.48682762682437897, "advantage_std": 0.480662290006876, "completion_length": 3087.5, "epoch": 0.12457142857142857, "grad_norm": 0.06071054935455322, "kl": 0.004103660583496094, "lambda_div_used": 0.6, "learning_rate": 9.623632283030077e-07, "loss": -0.0119, "reward": -0.3114382822532207, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.3114382822532207, "reward_after_std": 0.4806622937321663, "reward_before_mean": -0.03207859140820801, "reward_before_std": 0.48070384934544563, "reward_change_max": 0.0013087615370750427, "reward_change_mean": -0.2793596927076578, "reward_change_min": -0.5258324146270752, "reward_change_std": 0.2068811203353107, "reward_std": 0.4806623198091984, "rewards/cosine_scaled_reward": -0.1722893062978983, "rewards/format_reward": 0.3125, "step": 109 }, { "advantage_max": 1.2370356619358063, "advantage_mean": -6.829699084054397e-09, "advantage_min": -0.5914059355854988, "advantage_std": 0.666951946914196, "completion_length": 2959.312530517578, "epoch": 0.12571428571428572, "grad_norm": 0.10299086570739746, "kl": 0.006114959716796875, "lambda_div_used": 0.6, "learning_rate": 9.610954559391704e-07, "loss": 0.0401, "reward": -0.28863461688160896, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.28863461688160896, "reward_after_std": 0.6669519459828734, "reward_before_mean": -0.04152660258114338, "reward_before_std": 0.6713255615904927, "reward_change_max": 0.0018312260508537292, "reward_change_mean": -0.2471080287359655, "reward_change_min": -0.517824113368988, "reward_change_std": 0.21439127274788916, "reward_std": 0.6669519720599055, "rewards/cosine_scaled_reward": -0.19784663617610931, "rewards/format_reward": 0.35416667349636555, "step": 110 }, { "advantage_max": 1.3072906248271465, "advantage_mean": 1.862645149230957e-09, "advantage_min": -0.6446176692843437, "advantage_std": 0.7294343803077936, "completion_length": 3483.8958435058594, "epoch": 0.12685714285714286, "grad_norm": 0.13278532028198242, "kl": 0.008441925048828125, "lambda_div_used": 0.6, "learning_rate": 9.598076473627796e-07, "loss": 0.0239, "reward": -0.2520891670137644, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.2520891670137644, "reward_after_std": 0.7294343784451485, "reward_before_mean": 0.010047844611108303, "reward_before_std": 0.7737896014004946, "reward_change_max": 0.0, "reward_change_mean": -0.26213700883090496, "reward_change_min": -0.641155794262886, "reward_change_std": 0.25784933008253574, "reward_std": 0.7294344156980515, "rewards/cosine_scaled_reward": -0.0678927511908114, "rewards/format_reward": 0.1458333358168602, "step": 111 }, { "advantage_max": 1.1253357827663422, "advantage_mean": -3.7252904094842165e-09, "advantage_min": -0.6119440719485283, "advantage_std": 0.6345476061105728, "completion_length": 3364.375030517578, "epoch": 0.128, "grad_norm": 0.10261553525924683, "kl": 0.0039768218994140625, "lambda_div_used": 0.6, "learning_rate": 9.58499865339809e-07, "loss": -0.0321, "reward": -0.06111699063330889, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.06111699063330889, "reward_after_std": 0.6345475874841213, "reward_before_mean": 0.32833828032016754, "reward_before_std": 0.6291491501033306, "reward_change_max": 0.0010995790362358093, "reward_change_mean": -0.38945526140742004, "reward_change_min": -0.7214230000972748, "reward_change_std": 0.2952321572229266, "reward_std": 0.6345476321876049, "rewards/cosine_scaled_reward": 0.007919133640825748, "rewards/format_reward": 0.3125, "step": 112 }, { "advantage_max": 1.6033024415373802, "advantage_mean": -2.483527050678447e-09, "advantage_min": -0.8581812679767609, "advantage_std": 0.9165709689259529, "completion_length": 2931.104202270508, "epoch": 0.12914285714285714, "grad_norm": 0.17789599299430847, "kl": 0.009304046630859375, "lambda_div_used": 0.6, "learning_rate": 9.571721736097088e-07, "loss": 0.0576, "reward": -0.006853158585727215, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.006853158585727215, "reward_after_std": 0.9165709540247917, "reward_before_mean": 0.35799872130155563, "reward_before_std": 0.9939543418586254, "reward_change_max": 0.0007301792502403259, "reward_change_mean": -0.3648518770933151, "reward_change_min": -0.9379028156399727, "reward_change_std": 0.3610955514013767, "reward_std": 0.9165709912776947, "rewards/cosine_scaled_reward": -0.05016732541844249, "rewards/format_reward": 0.4583333358168602, "step": 113 }, { "advantage_max": 1.2760983109474182, "advantage_mean": -1.6653345369377348e-16, "advantage_min": -0.6161659248173237, "advantage_std": 0.6759795695543289, "completion_length": 2618.208339691162, "epoch": 0.13028571428571428, "grad_norm": 0.09680048376321793, "kl": 0.00629425048828125, "lambda_div_used": 0.6, "learning_rate": 9.55824636882301e-07, "loss": 0.0299, "reward": -0.09806261584162712, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.09806261584162712, "reward_after_std": 0.6759795621037483, "reward_before_mean": 0.2520258827134967, "reward_before_std": 0.6346731521189213, "reward_change_max": 6.187707185745239e-05, "reward_change_mean": -0.35008850507438183, "reward_change_min": -0.6094051860272884, "reward_change_std": 0.23877457296475768, "reward_std": 0.6759795732796192, "rewards/cosine_scaled_reward": -0.165653734235093, "rewards/format_reward": 0.5833333469927311, "step": 114 }, { "advantage_max": 1.533675353974104, "advantage_mean": 6.208817404651512e-09, "advantage_min": -0.6436694450676441, "advantage_std": 0.799139654263854, "completion_length": 2905.7500228881836, "epoch": 0.13142857142857142, "grad_norm": 0.107809878885746, "kl": 0.005802154541015625, "lambda_div_used": 0.6, "learning_rate": 9.54457320834625e-07, "loss": 0.0209, "reward": -0.18184374831616879, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.18184374831616879, "reward_after_std": 0.799139641225338, "reward_before_mean": 0.0933333026478067, "reward_before_std": 0.7632724195718765, "reward_change_max": 0.00022216886281967163, "reward_change_mean": -0.2751770419999957, "reward_change_min": -0.5647606626152992, "reward_change_std": 0.21543886233121157, "reward_std": 0.7991396598517895, "rewards/cosine_scaled_reward": -0.14083335734903812, "rewards/format_reward": 0.3750000074505806, "step": 115 }, { "advantage_max": 1.0948162451386452, "advantage_mean": -3.1044087300813317e-09, "advantage_min": -0.5002664364874363, "advantage_std": 0.5961834639310837, "completion_length": 3541.0416870117188, "epoch": 0.13257142857142856, "grad_norm": 0.12247907370328903, "kl": 0.005596160888671875, "lambda_div_used": 0.6, "learning_rate": 9.530702921077358e-07, "loss": 0.0121, "reward": -0.3858891613781452, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.3858891613781452, "reward_after_std": 0.596183467656374, "reward_before_mean": -0.17424449883401394, "reward_before_std": 0.6120402682572603, "reward_change_max": 0.0007318109273910522, "reward_change_mean": -0.21164466626942158, "reward_change_min": -0.47822466120123863, "reward_change_std": 0.19232912454754114, "reward_std": 0.596183467656374, "rewards/cosine_scaled_reward": -0.12878892198204994, "rewards/format_reward": 0.0833333358168602, "step": 116 }, { "advantage_max": 0.9450958669185638, "advantage_mean": 9.934107758624577e-09, "advantage_min": -0.503460593521595, "advantage_std": 0.5200935825705528, "completion_length": 3124.5416870117188, "epoch": 0.1337142857142857, "grad_norm": 0.08877340704202652, "kl": 0.0074920654296875, "lambda_div_used": 0.6, "learning_rate": 9.516636183034564e-07, "loss": 0.023, "reward": -0.38494894467294216, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.38494894467294216, "reward_after_std": 0.5200935900211334, "reward_before_mean": -0.15960646513849497, "reward_before_std": 0.5255400091409683, "reward_change_max": 0.002323649823665619, "reward_change_mean": -0.22534247650764883, "reward_change_min": -0.46065803617239, "reward_change_std": 0.18960520531982183, "reward_std": 0.5200935937464237, "rewards/cosine_scaled_reward": -0.20480323676019907, "rewards/format_reward": 0.25, "step": 117 }, { "advantage_max": 1.7024286799132824, "advantage_mean": -1.5522042984272844e-08, "advantage_min": -0.9134473949670792, "advantage_std": 0.9439865052700043, "completion_length": 3146.104248046875, "epoch": 0.13485714285714287, "grad_norm": 0.15422917902469635, "kl": 0.0044574737548828125, "lambda_div_used": 0.6, "learning_rate": 9.502373679810839e-07, "loss": 0.0438, "reward": 0.2010413184762001, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.2010413184762001, "reward_after_std": 0.9439864829182625, "reward_before_mean": 0.6700890860520303, "reward_before_std": 0.9486183300614357, "reward_change_max": 0.0010381042957305908, "reward_change_mean": -0.4690477307885885, "reward_change_min": -0.9044231325387955, "reward_change_std": 0.3665061164647341, "reward_std": 0.9439865499734879, "rewards/cosine_scaled_reward": 0.12671118369325995, "rewards/format_reward": 0.4166666753590107, "step": 118 }, { "advantage_max": 1.1857410296797752, "advantage_mean": 6.208816238917336e-10, "advantage_min": -0.5179601944983006, "advantage_std": 0.6348244119435549, "completion_length": 2643.3125076293945, "epoch": 0.136, "grad_norm": 0.4639641046524048, "kl": 0.12084007263183594, "lambda_div_used": 0.6, "learning_rate": 9.487916106540465e-07, "loss": 0.0218, "reward": -0.005526546388864517, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.005526546388864517, "reward_after_std": 0.6348244119435549, "reward_before_mean": 0.4011272483621724, "reward_before_std": 0.563131982460618, "reward_change_max": 0.000466369092464447, "reward_change_mean": -0.40665382659062743, "reward_change_min": -0.7384549304842949, "reward_change_std": 0.2714037476107478, "reward_std": 0.6348244175314903, "rewards/cosine_scaled_reward": -0.08068636374082416, "rewards/format_reward": 0.562500013038516, "step": 119 }, { "advantage_max": 1.5117480978369713, "advantage_mean": 1.1796752574788627e-08, "advantage_min": -0.5524156130850315, "advantage_std": 0.7942009922116995, "completion_length": 2476.3541831970215, "epoch": 0.13714285714285715, "grad_norm": 0.13974298536777496, "kl": 0.0076751708984375, "lambda_div_used": 0.6, "learning_rate": 9.473264167865171e-07, "loss": 0.0168, "reward": 0.022801415994763374, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.022801415994763374, "reward_after_std": 0.7942009922116995, "reward_before_mean": 0.41292600706219673, "reward_before_std": 0.7128852717578411, "reward_change_max": 0.0, "reward_change_mean": -0.3901246301829815, "reward_change_min": -0.759309895336628, "reward_change_std": 0.2864347733557224, "reward_std": 0.7942009996622801, "rewards/cosine_scaled_reward": -0.043536994606256485, "rewards/format_reward": 0.5000000018626451, "step": 120 }, { "advantage_max": 1.3017460890114307, "advantage_mean": -3.4148494976182775e-08, "advantage_min": -0.7192301824688911, "advantage_std": 0.7437176704406738, "completion_length": 2102.958366394043, "epoch": 0.1382857142857143, "grad_norm": 0.16582335531711578, "kl": 0.013330459594726562, "lambda_div_used": 0.6, "learning_rate": 9.458418577899774e-07, "loss": 0.0745, "reward": 0.16585742961615324, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.16585742961615324, "reward_after_std": 0.7437176592648029, "reward_before_mean": 0.6575608756393194, "reward_before_std": 0.7263416051864624, "reward_change_max": 0.0008957311511039734, "reward_change_mean": -0.4917034823447466, "reward_change_min": -0.8339627794921398, "reward_change_std": 0.3542322674766183, "reward_std": 0.7437177151441574, "rewards/cosine_scaled_reward": -0.004552898928523064, "rewards/format_reward": 0.6666666772216558, "step": 121 }, { "advantage_max": 1.445691742002964, "advantage_mean": -1.3038516155639002e-08, "advantage_min": -0.8504197970032692, "advantage_std": 0.8637564219534397, "completion_length": 2935.18758392334, "epoch": 0.13942857142857143, "grad_norm": 0.19797678291797638, "kl": 0.0050258636474609375, "lambda_div_used": 0.6, "learning_rate": 9.443380060197385e-07, "loss": 0.0503, "reward": 0.11646575294435024, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.11646575294435024, "reward_after_std": 0.8637563772499561, "reward_before_mean": 0.5674029514193535, "reward_before_std": 0.9499269165098667, "reward_change_max": 0.0017427802085876465, "reward_change_mean": -0.450937207788229, "reward_change_min": -0.9292780607938766, "reward_change_std": 0.3989896886050701, "reward_std": 0.8637564107775688, "rewards/cosine_scaled_reward": 0.044118134304881096, "rewards/format_reward": 0.47916667722165585, "step": 122 }, { "advantage_max": 1.3585578612983227, "advantage_mean": 6.829699028543246e-09, "advantage_min": -0.5975468009710312, "advantage_std": 0.7217418989166617, "completion_length": 3088.791702270508, "epoch": 0.14057142857142857, "grad_norm": 0.11229279637336731, "kl": 0.0062198638916015625, "lambda_div_used": 0.6, "learning_rate": 9.428149347714143e-07, "loss": 0.0001, "reward": -0.20415015192702413, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.20415015192702413, "reward_after_std": 0.721741883084178, "reward_before_mean": 0.07706481404602528, "reward_before_std": 0.7037300141528249, "reward_change_max": 0.0008754059672355652, "reward_change_mean": -0.28121496737003326, "reward_change_min": -0.5276207067072392, "reward_change_std": 0.21633497811853886, "reward_std": 0.7217418989166617, "rewards/cosine_scaled_reward": -0.13855092599987984, "rewards/format_reward": 0.3541666716337204, "step": 123 }, { "advantage_max": 1.5761494934558868, "advantage_mean": -1.676380662063437e-08, "advantage_min": -0.8289666064083576, "advantage_std": 0.9093691222369671, "completion_length": 2505.1458892822266, "epoch": 0.1417142857142857, "grad_norm": 0.1675301343202591, "kl": 0.0081329345703125, "lambda_div_used": 0.6, "learning_rate": 9.412727182773486e-07, "loss": 0.0567, "reward": 0.19109472876880318, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.19109472876880318, "reward_after_std": 0.9093690924346447, "reward_before_mean": 0.6679605036042631, "reward_before_std": 0.9592476300895214, "reward_change_max": 0.00041870027780532837, "reward_change_mean": -0.4768657460808754, "reward_change_min": -0.989845547825098, "reward_change_std": 0.40578335523605347, "reward_std": 0.9093691147863865, "rewards/cosine_scaled_reward": 0.042313557118177414, "rewards/format_reward": 0.5833333395421505, "step": 124 }, { "advantage_max": 1.1822724342346191, "advantage_mean": 1.8626452047421083e-08, "advantage_min": -0.5536490008234978, "advantage_std": 0.6343522779643536, "completion_length": 2876.7291946411133, "epoch": 0.14285714285714285, "grad_norm": 0.07611420750617981, "kl": 0.00714874267578125, "lambda_div_used": 0.6, "learning_rate": 9.397114317029974e-07, "loss": 0.0114, "reward": -0.08220624923706055, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.08220624923706055, "reward_after_std": 0.6343522742390633, "reward_before_mean": 0.2853474132716656, "reward_before_std": 0.5844516009092331, "reward_change_max": 0.0006675645709037781, "reward_change_mean": -0.3675536550581455, "reward_change_min": -0.6255028396844864, "reward_change_std": 0.25118013471364975, "reward_std": 0.634352296590805, "rewards/cosine_scaled_reward": -0.0031596346525475383, "rewards/format_reward": 0.2916666679084301, "step": 125 }, { "advantage_max": 1.1261632405221462, "advantage_mean": 8.692344205529778e-09, "advantage_min": -0.5621048547327518, "advantage_std": 0.637621471658349, "completion_length": 2894.500045776367, "epoch": 0.144, "grad_norm": 0.12789134681224823, "kl": 0.00453948974609375, "lambda_div_used": 0.6, "learning_rate": 9.381311511432658e-07, "loss": 0.0183, "reward": -0.2368287304416299, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.2368287304416299, "reward_after_std": 0.6376214865595102, "reward_before_mean": 0.05406346544623375, "reward_before_std": 0.6669083181768656, "reward_change_max": 0.0019116774201393127, "reward_change_mean": -0.2908922014757991, "reward_change_min": -0.6447681747376919, "reward_change_std": 0.2572120614349842, "reward_std": 0.6376215238124132, "rewards/cosine_scaled_reward": -0.191718271933496, "rewards/format_reward": 0.4375000037252903, "step": 126 }, { "advantage_max": 0.8779657371342182, "advantage_mean": 2.2662182880273107e-08, "advantage_min": -0.4594753198325634, "advantage_std": 0.4753657840192318, "completion_length": 3165.000015258789, "epoch": 0.14514285714285713, "grad_norm": 0.06951512396335602, "kl": 0.00635528564453125, "lambda_div_used": 0.6, "learning_rate": 9.36531953618799e-07, "loss": 0.0316, "reward": -0.44154438376426697, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.44154438376426697, "reward_after_std": 0.4753657877445221, "reward_before_mean": -0.23894503759220243, "reward_before_std": 0.4717731177806854, "reward_change_max": 0.0, "reward_change_mean": -0.2025993438437581, "reward_change_min": -0.4216236099600792, "reward_change_std": 0.16746219526976347, "reward_std": 0.4753657951951027, "rewards/cosine_scaled_reward": -0.24447251576930285, "rewards/format_reward": 0.2500000074505806, "step": 127 }, { "advantage_max": 1.191404215991497, "advantage_mean": -1.490116174895917e-08, "advantage_min": -0.836443617939949, "advantage_std": 0.7149866968393326, "completion_length": 2884.8958587646484, "epoch": 0.1462857142857143, "grad_norm": 0.10610302537679672, "kl": 0.006505012512207031, "lambda_div_used": 0.6, "learning_rate": 9.34913917072228e-07, "loss": 0.0369, "reward": 0.10793188214302063, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.10793188214302063, "reward_after_std": 0.7149866968393326, "reward_before_mean": 0.5834506526589394, "reward_before_std": 0.7736126147210598, "reward_change_max": 0.0012885704636573792, "reward_change_mean": -0.4755187965929508, "reward_change_min": -0.8745973594486713, "reward_change_std": 0.3618140686303377, "reward_std": 0.7149867117404938, "rewards/cosine_scaled_reward": 0.09380867145955563, "rewards/format_reward": 0.3958333432674408, "step": 128 }, { "advantage_max": 1.582452967762947, "advantage_mean": 1.117587122845265e-08, "advantage_min": -0.6322049722075462, "advantage_std": 0.8493271358311176, "completion_length": 3381.8541870117188, "epoch": 0.14742857142857144, "grad_norm": 0.1458180546760559, "kl": 0.0063571929931640625, "lambda_div_used": 0.6, "learning_rate": 9.332771203643714e-07, "loss": 0.0135, "reward": -0.2717202575877309, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.2717202575877309, "reward_after_std": 0.8493271190673113, "reward_before_mean": -0.05207864195108414, "reward_before_std": 0.8756519798189402, "reward_change_max": 0.0013925209641456604, "reward_change_mean": -0.2196416319347918, "reward_change_min": -0.5734671168029308, "reward_change_std": 0.22998599475249648, "reward_std": 0.8493271507322788, "rewards/cosine_scaled_reward": -0.15103933541104198, "rewards/format_reward": 0.25000000558793545, "step": 129 }, { "advantage_max": 1.3907492086291313, "advantage_mean": 6.8296989730320945e-09, "advantage_min": -0.6126744374632835, "advantage_std": 0.7462209239602089, "completion_length": 3162.6250915527344, "epoch": 0.14857142857142858, "grad_norm": 0.13115747272968292, "kl": 0.006969451904296875, "lambda_div_used": 0.6, "learning_rate": 9.316216432703916e-07, "loss": 0.0978, "reward": -0.29135762667283416, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.29135762667283416, "reward_after_std": 0.7462209612131119, "reward_before_mean": -0.0620481688529253, "reward_before_std": 0.7572699598968029, "reward_change_max": 0.0009041875600814819, "reward_change_mean": -0.2293094601482153, "reward_change_min": -0.5258476063609123, "reward_change_std": 0.20949362684041262, "reward_std": 0.7462209723889828, "rewards/cosine_scaled_reward": -0.15602409280836582, "rewards/format_reward": 0.25000000558793545, "step": 130 }, { "advantage_max": 1.4393389448523521, "advantage_mean": 1.0554989549049765e-08, "advantage_min": -0.8425508216023445, "advantage_std": 0.821558766067028, "completion_length": 2863.6250228881836, "epoch": 0.14971428571428572, "grad_norm": 0.11954847723245621, "kl": 0.008882522583007812, "lambda_div_used": 0.6, "learning_rate": 9.299475664759068e-07, "loss": 0.0408, "reward": 0.10050448589026928, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.10050448589026928, "reward_after_std": 0.8215587623417377, "reward_before_mean": 0.5430004335939884, "reward_before_std": 0.8587308675050735, "reward_change_max": 0.0007584765553474426, "reward_change_mean": -0.4424959532916546, "reward_change_min": -0.8454087376594543, "reward_change_std": 0.3576465295627713, "reward_std": 0.8215587809681892, "rewards/cosine_scaled_reward": 0.07358355727046728, "rewards/format_reward": 0.3958333432674408, "step": 131 }, { "advantage_max": 1.5376378148794174, "advantage_mean": 6.208821234920947e-10, "advantage_min": -0.5504110679030418, "advantage_std": 0.8050966262817383, "completion_length": 2799.312530517578, "epoch": 0.15085714285714286, "grad_norm": 0.12419920414686203, "kl": 0.0067138671875, "lambda_div_used": 0.6, "learning_rate": 9.282549715730579e-07, "loss": 0.0195, "reward": -0.19808600842952728, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.19808600842952728, "reward_after_std": 0.8050966449081898, "reward_before_mean": 0.06645407644100487, "reward_before_std": 0.7814781591296196, "reward_change_max": 0.0012718439102172852, "reward_change_mean": -0.2645400739274919, "reward_change_min": -0.5789077877998352, "reward_change_std": 0.2121976970229298, "reward_std": 0.8050966635346413, "rewards/cosine_scaled_reward": -0.1334396367892623, "rewards/format_reward": 0.33333334140479565, "step": 132 }, { "advantage_max": 1.221783734858036, "advantage_mean": 1.924733383784627e-08, "advantage_min": -0.5688916221261024, "advantage_std": 0.6676540970802307, "completion_length": 3213.8959045410156, "epoch": 0.152, "grad_norm": 0.14848573505878448, "kl": 0.008680343627929688, "lambda_div_used": 0.6, "learning_rate": 9.265439410565328e-07, "loss": 0.0429, "reward": -0.2992970459163189, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.2992970459163189, "reward_after_std": 0.6676540970802307, "reward_before_mean": -0.05562702799215913, "reward_before_std": 0.6827064771205187, "reward_change_max": 0.0013222172856330872, "reward_change_mean": -0.24367000348865986, "reward_change_min": -0.5533005259931087, "reward_change_std": 0.2156898146495223, "reward_std": 0.6676541119813919, "rewards/cosine_scaled_reward": -0.17364685283973813, "rewards/format_reward": 0.2916666716337204, "step": 133 }, { "advantage_max": 1.0847848951816559, "advantage_mean": 1.1796752852344383e-08, "advantage_min": -0.6085241474211216, "advantage_std": 0.6005263235419989, "completion_length": 2500.0833740234375, "epoch": 0.15314285714285714, "grad_norm": 0.11016670614480972, "kl": 0.00949859619140625, "lambda_div_used": 0.6, "learning_rate": 9.248145583195447e-07, "loss": -0.0017, "reward": 0.05842417012900114, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.05842417012900114, "reward_after_std": 0.6005263309925795, "reward_before_mean": 0.5181667357683182, "reward_before_std": 0.5441521927714348, "reward_change_max": 0.0009170100092887878, "reward_change_mean": -0.4597425349056721, "reward_change_min": -0.7528548240661621, "reward_change_std": 0.2978973565623164, "reward_std": 0.6005263365805149, "rewards/cosine_scaled_reward": -0.032583314925432205, "rewards/format_reward": 0.5833333432674408, "step": 134 }, { "advantage_max": 1.4369381442666054, "advantage_mean": -1.6142925440831846e-08, "advantage_min": -0.8640343248844147, "advantage_std": 0.8443533703684807, "completion_length": 2017.4166946411133, "epoch": 0.15428571428571428, "grad_norm": 0.20822252333164215, "kl": 0.007670402526855469, "lambda_div_used": 0.6, "learning_rate": 9.230669076497687e-07, "loss": 0.0579, "reward": 0.5198374316096306, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.5198374316096306, "reward_after_std": 0.8443533480167389, "reward_before_mean": 1.1967530995607376, "reward_before_std": 0.8075528107583523, "reward_change_max": 0.0, "reward_change_mean": -0.6769156903028488, "reward_change_min": -1.1189482659101486, "reward_change_std": 0.4737038407474756, "reward_std": 0.8443533927202225, "rewards/cosine_scaled_reward": 0.2650432363152504, "rewards/format_reward": 0.666666679084301, "step": 135 }, { "advantage_max": 1.6212140694260597, "advantage_mean": -3.7252899653950067e-09, "advantage_min": -0.9679715968668461, "advantage_std": 0.9402196696028113, "completion_length": 2955.5208740234375, "epoch": 0.15542857142857142, "grad_norm": 0.15110325813293457, "kl": 0.009227752685546875, "lambda_div_used": 0.6, "learning_rate": 9.213010742252327e-07, "loss": 0.0592, "reward": 0.1325805252417922, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.1325805252417922, "reward_after_std": 0.9402196817100048, "reward_before_mean": 0.5687055559828877, "reward_before_std": 1.0172571474686265, "reward_change_max": 0.0006020441651344299, "reward_change_mean": -0.4361250060610473, "reward_change_min": -0.97487448528409, "reward_change_std": 0.39161210390739143, "reward_std": 0.940219696611166, "rewards/cosine_scaled_reward": 0.0656027642544359, "rewards/format_reward": 0.43750001676380634, "step": 136 }, { "advantage_max": 1.3895707875490189, "advantage_mean": 3.1044086745701804e-09, "advantage_min": -0.5488235056400299, "advantage_std": 0.7214406877756119, "completion_length": 2908.4792098999023, "epoch": 0.15657142857142858, "grad_norm": 0.12555868923664093, "kl": 0.007396697998046875, "lambda_div_used": 0.6, "learning_rate": 9.195171441101668e-07, "loss": 0.041, "reward": -0.27986384340329096, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.27986384340329096, "reward_after_std": 0.7214406877756119, "reward_before_mean": -0.0438942676410079, "reward_before_std": 0.6932445093989372, "reward_change_max": 0.00017392635345458984, "reward_change_mean": -0.23596958350390196, "reward_change_min": -0.4737580604851246, "reward_change_std": 0.18140669167041779, "reward_std": 0.7214406877756119, "rewards/cosine_scaled_reward": -0.23028047289699316, "rewards/format_reward": 0.4166666716337204, "step": 137 }, { "advantage_max": 1.1234722062945366, "advantage_mean": -4.9670538238011375e-09, "advantage_min": -0.522578340023756, "advantage_std": 0.5962201803922653, "completion_length": 2548.2709197998047, "epoch": 0.15771428571428572, "grad_norm": 0.07525104284286499, "kl": 0.0075054168701171875, "lambda_div_used": 0.6, "learning_rate": 9.177152042508077e-07, "loss": 0.0393, "reward": -0.03399332519620657, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.03399332519620657, "reward_after_std": 0.5962201841175556, "reward_before_mean": 0.36882060393691063, "reward_before_std": 0.5129054486751556, "reward_change_max": 0.0, "reward_change_mean": -0.402813920751214, "reward_change_min": -0.652053989470005, "reward_change_std": 0.2527422411367297, "reward_std": 0.5962201952934265, "rewards/cosine_scaled_reward": -0.10725637432187796, "rewards/format_reward": 0.5833333432674408, "step": 138 }, { "advantage_max": 1.276498556137085, "advantage_mean": 9.313225912688239e-09, "advantage_min": -0.7927935421466827, "advantage_std": 0.754135251045227, "completion_length": 3237.8958740234375, "epoch": 0.15885714285714286, "grad_norm": 0.15498626232147217, "kl": 0.0106353759765625, "lambda_div_used": 0.6, "learning_rate": 9.158953424711624e-07, "loss": 0.0157, "reward": 0.017847408074885607, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.017847408074885607, "reward_after_std": 0.7541352808475494, "reward_before_mean": 0.4330185279250145, "reward_before_std": 0.8115034103393555, "reward_change_max": 0.00047279149293899536, "reward_change_mean": -0.415171118453145, "reward_change_min": -0.8247731365263462, "reward_change_std": 0.3465948710218072, "reward_std": 0.7541352920234203, "rewards/cosine_scaled_reward": -0.002240734174847603, "rewards/format_reward": 0.4375000149011612, "step": 139 }, { "advantage_max": 1.0571947619318962, "advantage_mean": 1.800557003495129e-08, "advantage_min": -0.43688103556632996, "advantage_std": 0.5635790098458529, "completion_length": 3024.5209045410156, "epoch": 0.16, "grad_norm": 0.12986275553703308, "kl": 0.01467132568359375, "lambda_div_used": 0.6, "learning_rate": 9.140576474687263e-07, "loss": 0.0497, "reward": -0.24928517825901508, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.24928517825901508, "reward_after_std": 0.563579011708498, "reward_before_mean": 0.0354112945497036, "reward_before_std": 0.5041212420910597, "reward_change_max": 0.0016125962138175964, "reward_change_mean": -0.2846964537166059, "reward_change_min": -0.482378251850605, "reward_change_std": 0.19983111508190632, "reward_std": 0.5635790284723043, "rewards/cosine_scaled_reward": -0.12812769412994385, "rewards/format_reward": 0.2916666753590107, "step": 140 }, { "advantage_max": 1.9671744778752327, "advantage_mean": -6.208817349140361e-09, "advantage_min": -0.89418925344944, "advantage_std": 1.0909193567931652, "completion_length": 2874.104232788086, "epoch": 0.16114285714285714, "grad_norm": 0.16195128858089447, "kl": 0.014141082763671875, "lambda_div_used": 0.6, "learning_rate": 9.122022088101613e-07, "loss": 0.0402, "reward": 0.2075315216789022, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.2075315216789022, "reward_after_std": 1.0909193120896816, "reward_before_mean": 0.6503730427939445, "reward_before_std": 1.1303999871015549, "reward_change_max": 0.0007979348301887512, "reward_change_mean": -0.4428415335714817, "reward_change_min": -0.9751441776752472, "reward_change_std": 0.3947232998907566, "reward_std": 1.090919341892004, "rewards/cosine_scaled_reward": 0.033519853837788105, "rewards/format_reward": 0.5833333432674408, "step": 141 }, { "advantage_max": 1.5644628405570984, "advantage_mean": -7.140139812733537e-09, "advantage_min": -0.9668753743171692, "advantage_std": 0.8950626216828823, "completion_length": 2810.8334197998047, "epoch": 0.16228571428571428, "grad_norm": 0.1719711571931839, "kl": 0.01021575927734375, "lambda_div_used": 0.6, "learning_rate": 9.103291169269299e-07, "loss": 0.1232, "reward": 0.15125958062708378, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.15125958062708378, "reward_after_std": 0.8950626142323017, "reward_before_mean": 0.6069081444293261, "reward_before_std": 0.9450072459876537, "reward_change_max": 0.0012427493929862976, "reward_change_mean": -0.4556485563516617, "reward_change_min": -0.8686264418065548, "reward_change_std": 0.37062836065888405, "reward_std": 0.8950626626610756, "rewards/cosine_scaled_reward": 0.0013707317411899567, "rewards/format_reward": 0.6041666846722364, "step": 142 }, { "advantage_max": 1.0218571051955223, "advantage_mean": 2.483526928553914e-08, "advantage_min": -0.4859629459679127, "advantage_std": 0.5451667793095112, "completion_length": 2720.604202270508, "epoch": 0.16342857142857142, "grad_norm": 0.19208702445030212, "kl": 0.013195037841796875, "lambda_div_used": 0.6, "learning_rate": 9.084384631108882e-07, "loss": 0.0667, "reward": -0.3548990674316883, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.3548990674316883, "reward_after_std": 0.5451667718589306, "reward_before_mean": -0.12133161188103259, "reward_before_std": 0.5252682417631149, "reward_change_max": 0.0, "reward_change_mean": -0.23356744460761547, "reward_change_min": -0.471222460269928, "reward_change_std": 0.18014120031148195, "reward_std": 0.5451667830348015, "rewards/cosine_scaled_reward": -0.2377491444349289, "rewards/format_reward": 0.35416666977107525, "step": 143 }, { "advantage_max": 1.4134826138615608, "advantage_mean": 6.208817349140361e-09, "advantage_min": -0.6104103177785873, "advantage_std": 0.7537709623575211, "completion_length": 2884.5833740234375, "epoch": 0.16457142857142856, "grad_norm": 0.11878510564565659, "kl": 0.0088043212890625, "lambda_div_used": 0.6, "learning_rate": 9.065303395098358e-07, "loss": 0.0401, "reward": -0.055001866072416306, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.055001866072416306, "reward_after_std": 0.7537709586322308, "reward_before_mean": 0.3030646312981844, "reward_before_std": 0.7174690030515194, "reward_change_max": 0.0010922551155090332, "reward_change_mean": -0.3580665346235037, "reward_change_min": -0.6770821623504162, "reward_change_std": 0.2684730626642704, "reward_std": 0.7537709660828114, "rewards/cosine_scaled_reward": -0.03596767038106918, "rewards/format_reward": 0.37500000186264515, "step": 144 }, { "advantage_max": 1.1855008229613304, "advantage_mean": -3.104408619059029e-09, "advantage_min": -0.6026959903538227, "advantage_std": 0.6411144360899925, "completion_length": 2345.3125381469727, "epoch": 0.1657142857142857, "grad_norm": 0.11563977599143982, "kl": 0.01104736328125, "lambda_div_used": 0.6, "learning_rate": 9.046048391230247e-07, "loss": 0.0254, "reward": 0.12009605206549168, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.12009605206549168, "reward_after_std": 0.6411144360899925, "reward_before_mean": 0.5996555439196527, "reward_before_std": 0.5601776875555515, "reward_change_max": 0.0, "reward_change_mean": -0.47955949790775776, "reward_change_min": -0.8012478165328503, "reward_change_std": 0.30504490062594414, "reward_std": 0.6411144435405731, "rewards/cosine_scaled_reward": -0.0022555720061063766, "rewards/format_reward": 0.6041666753590107, "step": 145 }, { "advantage_max": 1.2408719435334206, "advantage_mean": 1.8936893220189432e-08, "advantage_min": -0.7066081799566746, "advantage_std": 0.7005535922944546, "completion_length": 2367.187545776367, "epoch": 0.16685714285714287, "grad_norm": 0.19457805156707764, "kl": 0.0077648162841796875, "lambda_div_used": 0.6, "learning_rate": 9.026620557966279e-07, "loss": 0.0806, "reward": -0.08792629465460777, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.08792629465460777, "reward_after_std": 0.7005535922944546, "reward_before_mean": 0.2676168754696846, "reward_before_std": 0.7161291688680649, "reward_change_max": 0.0023860037326812744, "reward_change_mean": -0.35554315708577633, "reward_change_min": -0.7299299165606499, "reward_change_std": 0.2929388973861933, "reward_std": 0.7005536258220673, "rewards/cosine_scaled_reward": -0.15785823203623295, "rewards/format_reward": 0.5833333414047956, "step": 146 }, { "advantage_max": 1.4107197225093842, "advantage_mean": 6.208817571184966e-09, "advantage_min": -0.657046489417553, "advantage_std": 0.7767920307815075, "completion_length": 2868.625015258789, "epoch": 0.168, "grad_norm": 0.14225320518016815, "kl": 0.0139923095703125, "lambda_div_used": 0.6, "learning_rate": 9.007020842191634e-07, "loss": 0.0227, "reward": -0.1404053345322609, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.1404053345322609, "reward_after_std": 0.7767920270562172, "reward_before_mean": 0.1694311499595642, "reward_before_std": 0.7943050377070904, "reward_change_max": 0.00027482956647872925, "reward_change_mean": -0.3098364733159542, "reward_change_min": -0.7106362171471119, "reward_change_std": 0.2728282855823636, "reward_std": 0.7767920419573784, "rewards/cosine_scaled_reward": -0.09236776456236839, "rewards/format_reward": 0.3541666753590107, "step": 147 }, { "advantage_max": 1.4902683272957802, "advantage_mean": -8.692344288796505e-09, "advantage_min": -0.6846123561263084, "advantage_std": 0.8118891194462776, "completion_length": 2573.291732788086, "epoch": 0.16914285714285715, "grad_norm": 0.12192974984645844, "kl": 0.012142181396484375, "lambda_div_used": 0.6, "learning_rate": 8.987250199168808e-07, "loss": 0.0225, "reward": 0.1568167768418789, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.1568167768418789, "reward_after_std": 0.8118891008198261, "reward_before_mean": 0.6254363059997559, "reward_before_std": 0.7636519968509674, "reward_change_max": 0.0007330328226089478, "reward_change_mean": -0.4686195347458124, "reward_change_min": -0.8460842408239841, "reward_change_std": 0.32862727902829647, "reward_std": 0.8118891268968582, "rewards/cosine_scaled_reward": 0.021051467396318913, "rewards/format_reward": 0.5833333414047956, "step": 148 }, { "advantage_max": 1.4633513800799847, "advantage_mean": 1.0554989549049765e-08, "advantage_min": -0.6385081559419632, "advantage_std": 0.7907122951000929, "completion_length": 2929.1458740234375, "epoch": 0.1702857142857143, "grad_norm": 0.13244077563285828, "kl": 0.01035308837890625, "lambda_div_used": 0.6, "learning_rate": 8.967309592491052e-07, "loss": 0.0284, "reward": -0.1310775459278375, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.1310775459278375, "reward_after_std": 0.7907122801989317, "reward_before_mean": 0.18053598888218403, "reward_before_std": 0.7945210449397564, "reward_change_max": 0.0018289387226104736, "reward_change_mean": -0.31161350570619106, "reward_change_min": -0.6495312862098217, "reward_change_std": 0.26187097700312734, "reward_std": 0.7907122857868671, "rewards/cosine_scaled_reward": -0.12848201533779502, "rewards/format_reward": 0.4375000074505806, "step": 149 }, { "advantage_max": 1.4026121124625206, "advantage_mean": -3.725290298461914e-09, "advantage_min": -0.9239209443330765, "advantage_std": 0.8353138975799084, "completion_length": 2977.812515258789, "epoch": 0.17142857142857143, "grad_norm": 0.17636874318122864, "kl": 0.0157928466796875, "lambda_div_used": 0.6, "learning_rate": 8.9471999940354e-07, "loss": 0.0718, "reward": 0.16062995791435242, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.16062995791435242, "reward_after_std": 0.8353139087557793, "reward_before_mean": 0.6395070925354958, "reward_before_std": 0.9086871258914471, "reward_change_max": 0.0, "reward_change_mean": -0.47887711599469185, "reward_change_min": -0.9224202036857605, "reward_change_std": 0.38951124995946884, "reward_std": 0.8353139348328114, "rewards/cosine_scaled_reward": 0.11142019368708134, "rewards/format_reward": 0.4166666753590107, "step": 150 }, { "advantage_max": 1.6562565043568611, "advantage_mean": -1.2417634698280722e-09, "advantage_min": -0.8369650840759277, "advantage_std": 0.9075136668980122, "completion_length": 2601.812545776367, "epoch": 0.17257142857142857, "grad_norm": 0.20402175188064575, "kl": 0.01294708251953125, "lambda_div_used": 0.6, "learning_rate": 8.926922383915315e-07, "loss": 0.0533, "reward": 0.19714290462434292, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.19714290462434292, "reward_after_std": 0.9075136668980122, "reward_before_mean": 0.6664761131396517, "reward_before_std": 0.8824147135019302, "reward_change_max": 0.00037372857332229614, "reward_change_mean": -0.4693332202732563, "reward_change_min": -0.8574397899210453, "reward_change_std": 0.35109512601047754, "reward_std": 0.9075136780738831, "rewards/cosine_scaled_reward": 0.05198805220425129, "rewards/format_reward": 0.5625000149011612, "step": 151 }, { "advantage_max": 1.0910377725958824, "advantage_mean": 1.2417634698280722e-08, "advantage_min": -0.5008981414139271, "advantage_std": 0.5850908644497395, "completion_length": 2868.7916984558105, "epoch": 0.1737142857142857, "grad_norm": 0.1328096240758896, "kl": 0.016147613525390625, "lambda_div_used": 0.6, "learning_rate": 8.906477750432903e-07, "loss": 0.0549, "reward": -0.28469391725957394, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.28469391725957394, "reward_after_std": 0.5850908793509007, "reward_before_mean": -0.020222843624651432, "reward_before_std": 0.563703091815114, "reward_change_max": 0.0006050914525985718, "reward_change_mean": -0.2644710736349225, "reward_change_min": -0.48153096437454224, "reward_change_std": 0.19808495230972767, "reward_std": 0.5850908905267715, "rewards/cosine_scaled_reward": -0.16636142708011903, "rewards/format_reward": 0.31250000186264515, "step": 152 }, { "advantage_max": 1.010260485112667, "advantage_mean": 9.313225690643634e-09, "advantage_min": -0.5362804085016251, "advantage_std": 0.5542087778449059, "completion_length": 2930.8750610351562, "epoch": 0.17485714285714285, "grad_norm": 0.10720986872911453, "kl": 0.021121978759765625, "lambda_div_used": 0.6, "learning_rate": 8.88586709003076e-07, "loss": 0.043, "reward": -0.3041149973869324, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.3041149973869324, "reward_after_std": 0.5542087741196156, "reward_before_mean": -0.0405734870582819, "reward_before_std": 0.5564620681107044, "reward_change_max": 0.001681201159954071, "reward_change_mean": -0.2635415093973279, "reward_change_min": -0.5015093982219696, "reward_change_std": 0.20337554719299078, "reward_std": 0.5542087815701962, "rewards/cosine_scaled_reward": -0.18695341609418392, "rewards/format_reward": 0.33333334140479565, "step": 153 }, { "advantage_max": 1.5810632705688477, "advantage_mean": -3.1044086745701804e-09, "advantage_min": -0.8457796424627304, "advantage_std": 0.9236548617482185, "completion_length": 3452.8333740234375, "epoch": 0.176, "grad_norm": 0.1580066680908203, "kl": 0.012538909912109375, "lambda_div_used": 0.6, "learning_rate": 8.865091407243394e-07, "loss": 0.0571, "reward": -0.070727514103055, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.070727514103055, "reward_after_std": 0.9236548617482185, "reward_before_mean": 0.2621636036783457, "reward_before_std": 1.036940049380064, "reward_change_max": 0.001263946294784546, "reward_change_mean": -0.3328911308199167, "reward_change_min": -0.8280980698764324, "reward_change_std": 0.36182656791061163, "reward_std": 0.9236549139022827, "rewards/cosine_scaled_reward": -0.004334868863224983, "rewards/format_reward": 0.2708333395421505, "step": 154 }, { "advantage_max": 1.1857907101511955, "advantage_mean": 8.071462720415923e-09, "advantage_min": -0.5334465727210045, "advantage_std": 0.6514654830098152, "completion_length": 2667.937545776367, "epoch": 0.17714285714285713, "grad_norm": 0.10593610256910324, "kl": 0.012603759765625, "lambda_div_used": 0.6, "learning_rate": 8.844151714648274e-07, "loss": 0.0209, "reward": 0.15297628194093704, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.15297628194093704, "reward_after_std": 0.6514654830098152, "reward_before_mean": 0.647703853668645, "reward_before_std": 0.570495992898941, "reward_change_max": 0.0, "reward_change_mean": -0.4947275333106518, "reward_change_min": -0.8316370770335197, "reward_change_std": 0.32937124744057655, "reward_std": 0.6514654979109764, "rewards/cosine_scaled_reward": 0.07385189644992352, "rewards/format_reward": 0.5000000018626451, "step": 155 }, { "advantage_max": 1.4448847994208336, "advantage_mean": 9.934107980669182e-09, "advantage_min": -0.723850317299366, "advantage_std": 0.7949776016175747, "completion_length": 3009.9583587646484, "epoch": 0.1782857142857143, "grad_norm": 0.12247823923826218, "kl": 0.013317108154296875, "lambda_div_used": 0.6, "learning_rate": 8.823049032816478e-07, "loss": 0.0451, "reward": -0.10736257396638393, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.10736257396638393, "reward_after_std": 0.7949775978922844, "reward_before_mean": 0.2161726988852024, "reward_before_std": 0.8142803534865379, "reward_change_max": 0.001227453351020813, "reward_change_mean": -0.3235352849587798, "reward_change_min": -0.6421325244009495, "reward_change_std": 0.2678525187075138, "reward_std": 0.7949776314198971, "rewards/cosine_scaled_reward": -0.05858031287789345, "rewards/format_reward": 0.3333333469927311, "step": 156 }, { "advantage_max": 0.8361185044050217, "advantage_mean": 1.1175871117430347e-08, "advantage_min": -0.5154761485755444, "advantage_std": 0.48633952997624874, "completion_length": 3123.854248046875, "epoch": 0.17942857142857144, "grad_norm": 0.09848199039697647, "kl": 0.017620086669921875, "lambda_div_used": 0.6, "learning_rate": 8.801784390262943e-07, "loss": 0.037, "reward": -0.2843034298857674, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.2843034298857674, "reward_after_std": 0.48633952997624874, "reward_before_mean": 0.007725675590336323, "reward_before_std": 0.5010611899197102, "reward_change_max": 0.0008352473378181458, "reward_change_mean": -0.2920291116461158, "reward_change_min": -0.5376311354339123, "reward_change_std": 0.22594552859663963, "reward_std": 0.48633954487740993, "rewards/cosine_scaled_reward": -0.18363716267049313, "rewards/format_reward": 0.37500000558793545, "step": 157 }, { "advantage_max": 1.601955957710743, "advantage_mean": -1.3659398390153399e-08, "advantage_min": -0.7452918365597725, "advantage_std": 0.8648084662854671, "completion_length": 3046.4583892822266, "epoch": 0.18057142857142858, "grad_norm": 0.18506969511508942, "kl": 0.013607025146484375, "lambda_div_used": 0.6, "learning_rate": 8.780358823396352e-07, "loss": 0.0621, "reward": 0.21542098745703697, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.21542098745703697, "reward_after_std": 0.8648084886372089, "reward_before_mean": 0.7011261153966188, "reward_before_std": 0.824807170778513, "reward_change_max": 0.0014419779181480408, "reward_change_mean": -0.4857051018625498, "reward_change_min": -0.9043072685599327, "reward_change_std": 0.3465930465608835, "reward_std": 0.8648085445165634, "rewards/cosine_scaled_reward": 0.11097969580441713, "rewards/format_reward": 0.47916667722165585, "step": 158 }, { "advantage_max": 1.0933760181069374, "advantage_mean": 2.359350542713301e-08, "advantage_min": -0.519902590662241, "advantage_std": 0.5967991352081299, "completion_length": 3206.4166870117188, "epoch": 0.18171428571428572, "grad_norm": 0.09166071563959122, "kl": 0.0195465087890625, "lambda_div_used": 0.6, "learning_rate": 8.758773376468604e-07, "loss": 0.0258, "reward": -0.2743441015481949, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.2743441015481949, "reward_after_std": 0.5967991352081299, "reward_before_mean": -0.003233685391023755, "reward_before_std": 0.5897813513875008, "reward_change_max": 0.0, "reward_change_mean": -0.27111040614545345, "reward_change_min": -0.5754702016711235, "reward_change_std": 0.21552940551191568, "reward_std": 0.5967991426587105, "rewards/cosine_scaled_reward": -0.13703351188451052, "rewards/format_reward": 0.2708333432674408, "step": 159 }, { "advantage_max": 1.1551932729780674, "advantage_mean": 2.5456150964942026e-08, "advantage_min": -0.7090373113751411, "advantage_std": 0.635726273059845, "completion_length": 2818.000015258789, "epoch": 0.18285714285714286, "grad_norm": 0.10902596265077591, "kl": 0.021236419677734375, "lambda_div_used": 0.6, "learning_rate": 8.737029101523929e-07, "loss": -0.0041, "reward": 0.001515369862318039, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.001515369862318039, "reward_after_std": 0.6357262693345547, "reward_before_mean": 0.4176196716725826, "reward_before_std": 0.6051103845238686, "reward_change_max": 0.00033104419708251953, "reward_change_mean": -0.4161042859777808, "reward_change_min": -0.6843418888747692, "reward_change_std": 0.27768101543188095, "reward_std": 0.635726273059845, "rewards/cosine_scaled_reward": -0.009940192103385925, "rewards/format_reward": 0.4375000074505806, "step": 160 }, { "advantage_max": 1.213698647916317, "advantage_mean": -3.7252901874396116e-09, "advantage_min": -0.552368201315403, "advantage_std": 0.6527365148067474, "completion_length": 2969.9166870117188, "epoch": 0.184, "grad_norm": 0.17307765781879425, "kl": 0.02019500732421875, "lambda_div_used": 0.6, "learning_rate": 8.715127058347614e-07, "loss": 0.0628, "reward": -0.10974358767271042, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.10974358767271042, "reward_after_std": 0.652736522257328, "reward_before_mean": 0.23834371659904718, "reward_before_std": 0.6073960103094578, "reward_change_max": 0.000466175377368927, "reward_change_mean": -0.34808731684461236, "reward_change_min": -0.6483714245259762, "reward_change_std": 0.25496606389060616, "reward_std": 0.6527365408837795, "rewards/cosine_scaled_reward": -0.05791149102151394, "rewards/format_reward": 0.3541666753590107, "step": 161 }, { "advantage_max": 1.5892940908670425, "advantage_mean": 8.692344621863413e-09, "advantage_min": -0.5379619300365448, "advantage_std": 0.8178718462586403, "completion_length": 3311.2083435058594, "epoch": 0.18514285714285714, "grad_norm": 0.16470083594322205, "kl": 0.02435302734375, "lambda_div_used": 0.6, "learning_rate": 8.693068314414344e-07, "loss": 0.012, "reward": -0.2619167543016374, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.2619167543016374, "reward_after_std": 0.8178718611598015, "reward_before_mean": -0.0391325494274497, "reward_before_std": 0.788720153272152, "reward_change_max": 0.0004187747836112976, "reward_change_mean": -0.2227841846179217, "reward_change_min": -0.5147033594548702, "reward_change_std": 0.18670457205735147, "reward_std": 0.8178718686103821, "rewards/cosine_scaled_reward": -0.12373294867575169, "rewards/format_reward": 0.2083333358168602, "step": 162 }, { "advantage_max": 1.5184885412454605, "advantage_mean": -5.277494746769307e-09, "advantage_min": -0.7798839248716831, "advantage_std": 0.8345548771321774, "completion_length": 2602.625045776367, "epoch": 0.18628571428571428, "grad_norm": 0.15418756008148193, "kl": 0.018619537353515625, "lambda_div_used": 0.6, "learning_rate": 8.670853944836176e-07, "loss": 0.0306, "reward": 0.09741232171654701, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.09741232171654701, "reward_after_std": 0.8345548585057259, "reward_before_mean": 0.5306345459539443, "reward_before_std": 0.8378158137202263, "reward_change_max": 0.0007125288248062134, "reward_change_mean": -0.43322221748530865, "reward_change_min": -0.800242580473423, "reward_change_std": 0.31650349078699946, "reward_std": 0.8345548957586288, "rewards/cosine_scaled_reward": -0.005516069009900093, "rewards/format_reward": 0.5416666772216558, "step": 163 }, { "advantage_max": 1.150291495025158, "advantage_mean": -5.587935503204022e-09, "advantage_min": -0.7518020123243332, "advantage_std": 0.6527398601174355, "completion_length": 2719.7708892822266, "epoch": 0.18742857142857142, "grad_norm": 0.1282048225402832, "kl": 0.020420074462890625, "lambda_div_used": 0.6, "learning_rate": 8.648485032310144e-07, "loss": 0.0287, "reward": 0.05534203629940748, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.05534203629940748, "reward_after_std": 0.6527398601174355, "reward_before_mean": 0.503211950417608, "reward_before_std": 0.6376418210566044, "reward_change_max": 0.0012490972876548767, "reward_change_mean": -0.44786988385021687, "reward_change_min": -0.7799138650298119, "reward_change_std": 0.3177740080282092, "reward_std": 0.6527398899197578, "rewards/cosine_scaled_reward": 0.001605958677828312, "rewards/format_reward": 0.5000000167638063, "step": 164 }, { "advantage_max": 1.3831028826534748, "advantage_mean": 7.45058070794613e-09, "advantage_min": -0.6921538636088371, "advantage_std": 0.7546810247004032, "completion_length": 3058.7291870117188, "epoch": 0.18857142857142858, "grad_norm": 0.15882042050361633, "kl": 0.0272674560546875, "lambda_div_used": 0.6, "learning_rate": 8.625962667065487e-07, "loss": 0.0616, "reward": -0.1579680386930704, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.1579680386930704, "reward_after_std": 0.754681009799242, "reward_before_mean": 0.14193058013916016, "reward_before_std": 0.762186162173748, "reward_change_max": 0.0010264962911605835, "reward_change_mean": -0.2998986216261983, "reward_change_min": -0.621304091066122, "reward_change_std": 0.25608566775918007, "reward_std": 0.7546810209751129, "rewards/cosine_scaled_reward": -0.10611804574728012, "rewards/format_reward": 0.35416668094694614, "step": 165 }, { "advantage_max": 1.441053494811058, "advantage_mean": 2.4835269396561444e-09, "advantage_min": -0.8313321396708488, "advantage_std": 0.8061891719698906, "completion_length": 3003.104217529297, "epoch": 0.18971428571428572, "grad_norm": 0.13318245112895966, "kl": 0.01674652099609375, "lambda_div_used": 0.6, "learning_rate": 8.603287946810513e-07, "loss": 0.0304, "reward": -0.012070178985595703, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.012070178985595703, "reward_after_std": 0.8061891719698906, "reward_before_mean": 0.3683839440345764, "reward_before_std": 0.8240198995918036, "reward_change_max": 0.002330496907234192, "reward_change_mean": -0.38045413699001074, "reward_change_min": -0.7002588920295238, "reward_change_std": 0.3043457716703415, "reward_std": 0.8061891756951809, "rewards/cosine_scaled_reward": -0.03455802670214325, "rewards/format_reward": 0.43750001303851604, "step": 166 }, { "advantage_max": 1.423269435763359, "advantage_mean": -3.104408619059029e-09, "advantage_min": -0.7583351470530033, "advantage_std": 0.8086444661021233, "completion_length": 2296.2709045410156, "epoch": 0.19085714285714286, "grad_norm": 0.1586252748966217, "kl": 0.01633453369140625, "lambda_div_used": 0.6, "learning_rate": 8.580461976679099e-07, "loss": 0.0376, "reward": 0.12070143315941095, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.12070143315941095, "reward_after_std": 0.8086444847285748, "reward_before_mean": 0.5716481516137719, "reward_before_std": 0.8213043101131916, "reward_change_max": 0.004665866494178772, "reward_change_mean": -0.45094671472907066, "reward_change_min": -0.8644659072160721, "reward_change_std": 0.349481089040637, "reward_std": 0.808644488453865, "rewards/cosine_scaled_reward": -0.0995926121249795, "rewards/format_reward": 0.7708333432674408, "step": 167 }, { "advantage_max": 1.5459800511598587, "advantage_mean": -1.4901161193847656e-08, "advantage_min": -0.823444951325655, "advantage_std": 0.8288298845291138, "completion_length": 3093.0834045410156, "epoch": 0.192, "grad_norm": 0.15506742894649506, "kl": 0.0215911865234375, "lambda_div_used": 0.6, "learning_rate": 8.557485869176825e-07, "loss": 0.0362, "reward": 0.10060895385686308, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.10060895385686308, "reward_after_std": 0.828829899430275, "reward_before_mean": 0.5286179166287184, "reward_before_std": 0.7910416908562183, "reward_change_max": 0.0007976368069648743, "reward_change_mean": -0.4280089922249317, "reward_change_min": -0.7011307924985886, "reward_change_std": 0.29120912682265043, "reward_std": 0.8288299404084682, "rewards/cosine_scaled_reward": 0.02472563646733761, "rewards/format_reward": 0.47916668094694614, "step": 168 }, { "advantage_max": 1.312043160200119, "advantage_mean": -3.2285850604107935e-08, "advantage_min": -0.5426613911986351, "advantage_std": 0.7060335651040077, "completion_length": 2461.9792404174805, "epoch": 0.19314285714285714, "grad_norm": 0.12502697110176086, "kl": 0.02597808837890625, "lambda_div_used": 0.6, "learning_rate": 8.534360744126753e-07, "loss": 0.0444, "reward": 0.4141941964626312, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.4141941964626312, "reward_after_std": 0.7060335800051689, "reward_before_mean": 1.0458204597234726, "reward_before_std": 0.5466500017791986, "reward_change_max": 0.0015628039836883545, "reward_change_mean": -0.6316262576729059, "reward_change_min": -1.0269163250923157, "reward_change_std": 0.3907633125782013, "reward_std": 0.7060335874557495, "rewards/cosine_scaled_reward": 0.21041020873235539, "rewards/format_reward": 0.6250000037252903, "step": 169 }, { "advantage_max": 1.2720028311014175, "advantage_mean": 2.173086055545781e-08, "advantage_min": -0.5567828081548214, "advantage_std": 0.6729303523898125, "completion_length": 2670.395866394043, "epoch": 0.19428571428571428, "grad_norm": 0.10678170621395111, "kl": 0.018474578857421875, "lambda_div_used": 0.6, "learning_rate": 8.511087728614862e-07, "loss": 0.0553, "reward": -0.009291424183174968, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.009291424183174968, "reward_after_std": 0.6729303747415543, "reward_before_mean": 0.3892105184495449, "reward_before_std": 0.599676800891757, "reward_change_max": 0.0010467469692230225, "reward_change_mean": -0.3985019223764539, "reward_change_min": -0.6365118287503719, "reward_change_std": 0.2544218208640814, "reward_std": 0.6729303784668446, "rewards/cosine_scaled_reward": -0.013728078454732895, "rewards/format_reward": 0.41666666977107525, "step": 170 }, { "advantage_max": 1.4900328442454338, "advantage_mean": -7.450581041013038e-09, "advantage_min": -0.8795114979147911, "advantage_std": 0.8389566726982594, "completion_length": 2690.375030517578, "epoch": 0.19542857142857142, "grad_norm": 0.16565613448619843, "kl": 0.01981353759765625, "lambda_div_used": 0.6, "learning_rate": 8.487667956935087e-07, "loss": 0.0466, "reward": 0.06884616613388062, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.06884616613388062, "reward_after_std": 0.8389566875994205, "reward_before_mean": 0.48622291162610054, "reward_before_std": 0.8658607117831707, "reward_change_max": 0.00112181156873703, "reward_change_mean": -0.4173767250031233, "reward_change_min": -0.739656388759613, "reward_change_std": 0.32057770155370235, "reward_std": 0.8389566913247108, "rewards/cosine_scaled_reward": 0.02436142461374402, "rewards/format_reward": 0.43750001303851604, "step": 171 }, { "advantage_max": 1.3270131349563599, "advantage_mean": 1.1796752796833232e-08, "advantage_min": -0.7321847081184387, "advantage_std": 0.7408189624547958, "completion_length": 2971.708335876465, "epoch": 0.19657142857142856, "grad_norm": 0.15071731805801392, "kl": 0.0283050537109375, "lambda_div_used": 0.6, "learning_rate": 8.464102570534061e-07, "loss": 0.0236, "reward": -0.14135470986366272, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.14135470986366272, "reward_after_std": 0.7408189624547958, "reward_before_mean": 0.1792165581136942, "reward_before_std": 0.7741873655468225, "reward_change_max": 0.001958996057510376, "reward_change_mean": -0.32057126238942146, "reward_change_min": -0.6587401293218136, "reward_change_std": 0.26787589583545923, "reward_std": 0.7408190034329891, "rewards/cosine_scaled_reward": -0.06664173398166895, "rewards/format_reward": 0.31250000558793545, "step": 172 }, { "advantage_max": 1.5405456945300102, "advantage_mean": 6.208817682207268e-09, "advantage_min": -0.763243917375803, "advantage_std": 0.868730153888464, "completion_length": 2413.8333854675293, "epoch": 0.1977142857142857, "grad_norm": 0.21439699828624725, "kl": 0.024854660034179688, "lambda_div_used": 0.6, "learning_rate": 8.440392717955475e-07, "loss": 0.0856, "reward": 0.009342290461063385, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.009342290461063385, "reward_after_std": 0.8687301687896252, "reward_before_mean": 0.38826479855924845, "reward_before_std": 0.9071060288697481, "reward_change_max": 0.000700242817401886, "reward_change_mean": -0.37892252765595913, "reward_change_min": -0.8705897815525532, "reward_change_std": 0.34363609459251165, "reward_std": 0.868730217218399, "rewards/cosine_scaled_reward": -0.08711758349090815, "rewards/format_reward": 0.5625000074505806, "step": 173 }, { "advantage_max": 1.3192416802048683, "advantage_mean": -1.179675312990014e-08, "advantage_min": -0.6599837280809879, "advantage_std": 0.7320027984678745, "completion_length": 2793.7083892822266, "epoch": 0.19885714285714284, "grad_norm": 0.1302064061164856, "kl": 0.02942657470703125, "lambda_div_used": 0.6, "learning_rate": 8.416539554784089e-07, "loss": 0.0173, "reward": -0.11204979429021478, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.11204979429021478, "reward_after_std": 0.7320028059184551, "reward_before_mean": 0.22473372798413038, "reward_before_std": 0.7440117113292217, "reward_change_max": 0.0006036907434463501, "reward_change_mean": -0.3367835180833936, "reward_change_min": -0.6897301971912384, "reward_change_std": 0.27322917617857456, "reward_std": 0.7320028245449066, "rewards/cosine_scaled_reward": -0.14804981462657452, "rewards/format_reward": 0.5208333469927311, "step": 174 }, { "advantage_max": 1.2248894348740578, "advantage_mean": -1.117587122845265e-08, "advantage_min": -0.7270784452557564, "advantage_std": 0.6971467100083828, "completion_length": 2822.6458740234375, "epoch": 0.2, "grad_norm": 0.10310303419828415, "kl": 0.0259552001953125, "lambda_div_used": 0.6, "learning_rate": 8.392544243589427e-07, "loss": 0.0073, "reward": 0.09740146715193987, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.09740146715193987, "reward_after_std": 0.6971467137336731, "reward_before_mean": 0.5614458620548248, "reward_before_std": 0.6951047461479902, "reward_change_max": 0.0006377026438713074, "reward_change_mean": -0.4640443716198206, "reward_change_min": -0.8764088936150074, "reward_change_std": 0.335317213088274, "reward_std": 0.6971467286348343, "rewards/cosine_scaled_reward": 0.03072291426360607, "rewards/format_reward": 0.5000000111758709, "step": 175 }, { "advantage_max": 1.6224799752235413, "advantage_mean": 6.208817349140361e-10, "advantage_min": -0.9095878675580025, "advantage_std": 0.9159829206764698, "completion_length": 2468.979263305664, "epoch": 0.20114285714285715, "grad_norm": 0.1898711621761322, "kl": 0.0229339599609375, "lambda_div_used": 0.6, "learning_rate": 8.368407953869103e-07, "loss": 0.0686, "reward": 0.14530806988477707, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.14530806988477707, "reward_after_std": 0.9159829206764698, "reward_before_mean": 0.5826609404757619, "reward_before_std": 0.9521896243095398, "reward_change_max": 0.00018244236707687378, "reward_change_mean": -0.4373528528958559, "reward_change_min": -0.8462734147906303, "reward_change_std": 0.35245630517601967, "reward_std": 0.9159829281270504, "rewards/cosine_scaled_reward": -0.03158620372414589, "rewards/format_reward": 0.6458333432674408, "step": 176 }, { "advantage_max": 1.484945572912693, "advantage_mean": 6.208817238118058e-09, "advantage_min": -0.7248594909906387, "advantage_std": 0.8175345957279205, "completion_length": 3197.666778564453, "epoch": 0.2022857142857143, "grad_norm": 0.19520598649978638, "kl": 0.031982421875, "lambda_div_used": 0.6, "learning_rate": 8.344131861991828e-07, "loss": 0.035, "reward": -0.06862121913582087, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.06862121913582087, "reward_after_std": 0.8175345845520496, "reward_before_mean": 0.27500755805522203, "reward_before_std": 0.8386104181408882, "reward_change_max": 0.00022308528423309326, "reward_change_mean": -0.34362876880913973, "reward_change_min": -0.7244202829897404, "reward_change_std": 0.28399574756622314, "reward_std": 0.8175345994532108, "rewards/cosine_scaled_reward": -0.10207957029342651, "rewards/format_reward": 0.4791666753590107, "step": 177 }, { "advantage_max": 1.3649575412273407, "advantage_mean": 2.483526828633842e-09, "advantage_min": -0.6267384588718414, "advantage_std": 0.7353874500840902, "completion_length": 2747.250015258789, "epoch": 0.20342857142857143, "grad_norm": 0.14075183868408203, "kl": 0.038970947265625, "lambda_div_used": 0.6, "learning_rate": 8.319717151140072e-07, "loss": 0.0136, "reward": -0.08781477063894272, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.08781477063894272, "reward_after_std": 0.7353874556720257, "reward_before_mean": 0.2575212549418211, "reward_before_std": 0.7124766483902931, "reward_change_max": 0.00017629563808441162, "reward_change_mean": -0.34533602371811867, "reward_change_min": -0.6885601654648781, "reward_change_std": 0.26829767785966396, "reward_std": 0.7353874780237675, "rewards/cosine_scaled_reward": -0.10040604881942272, "rewards/format_reward": 0.4583333358168602, "step": 178 }, { "advantage_max": 0.8987370580434799, "advantage_mean": -2.4835269396561444e-09, "advantage_min": -0.35452789813280106, "advantage_std": 0.471599405631423, "completion_length": 3075.7916870117188, "epoch": 0.20457142857142857, "grad_norm": 0.11661714315414429, "kl": 0.03214263916015625, "lambda_div_used": 0.6, "learning_rate": 8.295165011252396e-07, "loss": 0.0306, "reward": -0.35421258118003607, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.35421258118003607, "reward_after_std": 0.471599405631423, "reward_before_mean": -0.10331118968315423, "reward_before_std": 0.42847144044935703, "reward_change_max": 0.001559227705001831, "reward_change_mean": -0.25090140476822853, "reward_change_min": -0.44767534360289574, "reward_change_std": 0.17294680699706078, "reward_std": 0.4715994130820036, "rewards/cosine_scaled_reward": -0.19748893287032843, "rewards/format_reward": 0.2916666679084301, "step": 179 }, { "advantage_max": 1.2968221679329872, "advantage_mean": -8.692344621863413e-09, "advantage_min": -0.5777449980378151, "advantage_std": 0.6874905861914158, "completion_length": 2111.4166870117188, "epoch": 0.2057142857142857, "grad_norm": 0.12424161285161972, "kl": 0.02796173095703125, "lambda_div_used": 0.6, "learning_rate": 8.270476638965461e-07, "loss": -0.0256, "reward": 0.23284974694252014, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.23284974694252014, "reward_after_std": 0.6874905936419964, "reward_before_mean": 0.7600893080234528, "reward_before_std": 0.5672389660030603, "reward_change_max": 0.0007742047309875488, "reward_change_mean": -0.5272395350039005, "reward_change_min": -0.8209475874900818, "reward_change_std": 0.31187237333506346, "reward_std": 0.6874906159937382, "rewards/cosine_scaled_reward": 0.06754461862146854, "rewards/format_reward": 0.625, "step": 180 }, { "advantage_max": 1.6038127765059471, "advantage_mean": -1.3659398168108794e-08, "advantage_min": -0.7976420260965824, "advantage_std": 0.8899489752948284, "completion_length": 3054.7500762939453, "epoch": 0.20685714285714285, "grad_norm": 0.19416145980358124, "kl": 0.033782958984375, "lambda_div_used": 0.6, "learning_rate": 8.245653237555705e-07, "loss": 0.0392, "reward": 0.1256917817518115, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.1256917817518115, "reward_after_std": 0.8899489343166351, "reward_before_mean": 0.5621891398914158, "reward_before_std": 0.9076622389256954, "reward_change_max": 0.0, "reward_change_mean": -0.4364973697811365, "reward_change_min": -0.8991809338331223, "reward_change_std": 0.34503397159278393, "reward_std": 0.8899489492177963, "rewards/cosine_scaled_reward": 0.05192789062857628, "rewards/format_reward": 0.4583333432674408, "step": 181 }, { "advantage_max": 1.5165415294468403, "advantage_mean": -1.862645593320167e-09, "advantage_min": -0.7919049225747585, "advantage_std": 0.8274138532578945, "completion_length": 2300.3125228881836, "epoch": 0.208, "grad_norm": 0.13749708235263824, "kl": 0.02083587646484375, "lambda_div_used": 0.6, "learning_rate": 8.220696016880687e-07, "loss": 0.0187, "reward": 0.12074684211984277, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.12074684211984277, "reward_after_std": 0.8274138383567333, "reward_before_mean": 0.5635696525569074, "reward_before_std": 0.8129143454134464, "reward_change_max": 0.0022213757038116455, "reward_change_mean": -0.442822827026248, "reward_change_min": -0.7646378092467785, "reward_change_std": 0.31069572921842337, "reward_std": 0.8274138383567333, "rewards/cosine_scaled_reward": -0.009881848469376564, "rewards/format_reward": 0.5833333414047956, "step": 182 }, { "advantage_max": 1.3605138957500458, "advantage_mean": -1.6142924996742636e-08, "advantage_min": -0.7550958581268787, "advantage_std": 0.7596499547362328, "completion_length": 2450.2083892822266, "epoch": 0.20914285714285713, "grad_norm": 0.15661555528640747, "kl": 0.04094696044921875, "lambda_div_used": 0.6, "learning_rate": 8.195606193320136e-07, "loss": 0.0469, "reward": 0.10832425020635128, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.10832425020635128, "reward_after_std": 0.7596499472856522, "reward_before_mean": 0.5617624840233475, "reward_before_std": 0.7516155876219273, "reward_change_max": 0.0, "reward_change_mean": -0.4534382503479719, "reward_change_min": -0.8723913095891476, "reward_change_std": 0.33325557969510555, "reward_std": 0.7596499547362328, "rewards/cosine_scaled_reward": -0.021202084608376026, "rewards/format_reward": 0.6041666679084301, "step": 183 }, { "advantage_max": 1.1743084266781807, "advantage_mean": 6.519258299864106e-09, "advantage_min": -0.45616957545280457, "advantage_std": 0.6174315102398396, "completion_length": 2967.000030517578, "epoch": 0.2102857142857143, "grad_norm": 0.089842788875103, "kl": 0.0363616943359375, "lambda_div_used": 0.6, "learning_rate": 8.170384989716657e-07, "loss": 0.0062, "reward": -0.3365051681175828, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.3365051681175828, "reward_after_std": 0.6174314990639687, "reward_before_mean": -0.1119849169626832, "reward_before_std": 0.5883787106722593, "reward_change_max": 0.0007164105772972107, "reward_change_mean": -0.22452027909457684, "reward_change_min": -0.4067869149148464, "reward_change_std": 0.16255799168720841, "reward_std": 0.6174315363168716, "rewards/cosine_scaled_reward": -0.22265912871807814, "rewards/format_reward": 0.33333333395421505, "step": 184 }, { "advantage_max": 1.1450467370450497, "advantage_mean": -1.3038516155639002e-08, "advantage_min": -0.5757462717592716, "advantage_std": 0.6306551937013865, "completion_length": 2638.062530517578, "epoch": 0.21142857142857144, "grad_norm": 0.1604258418083191, "kl": 0.03490447998046875, "lambda_div_used": 0.6, "learning_rate": 8.145033635316128e-07, "loss": 0.0312, "reward": -0.12965551391243935, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.12965551391243935, "reward_after_std": 0.6306551937013865, "reward_before_mean": 0.2147392723709345, "reward_before_std": 0.6197530413046479, "reward_change_max": 0.00013187527656555176, "reward_change_mean": -0.34439481515437365, "reward_change_min": -0.6087727546691895, "reward_change_std": 0.24250369798392057, "reward_std": 0.6306551974266768, "rewards/cosine_scaled_reward": -0.11138035543262959, "rewards/format_reward": 0.43750000558793545, "step": 185 }, { "advantage_max": 1.08181943744421, "advantage_mean": 1.2417633032946185e-09, "advantage_min": -0.5867179185152054, "advantage_std": 0.6249041147530079, "completion_length": 3160.625045776367, "epoch": 0.21257142857142858, "grad_norm": 0.15922769904136658, "kl": 0.04046630859375, "lambda_div_used": 0.6, "learning_rate": 8.119553365707802e-07, "loss": 0.0482, "reward": -0.06183060258626938, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.06183060258626938, "reward_after_std": 0.6249041259288788, "reward_before_mean": 0.33093083277344704, "reward_before_std": 0.6239690035581589, "reward_change_max": 0.0009515807032585144, "reward_change_mean": -0.3927614507265389, "reward_change_min": -0.7076075598597527, "reward_change_std": 0.3012672569602728, "reward_std": 0.62490414083004, "rewards/cosine_scaled_reward": 0.009215403348207474, "rewards/format_reward": 0.31250000558793545, "step": 186 }, { "advantage_max": 1.072705127298832, "advantage_mean": 3.7252901874396116e-09, "advantage_min": -0.6445136219263077, "advantage_std": 0.603652723133564, "completion_length": 2336.2084045410156, "epoch": 0.21371428571428572, "grad_norm": 0.24794240295886993, "kl": 0.03890228271484375, "lambda_div_used": 0.6, "learning_rate": 8.093945422764069e-07, "loss": 0.0822, "reward": -0.029120448976755142, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.029120448976755142, "reward_after_std": 0.6036527305841446, "reward_before_mean": 0.37975111696869135, "reward_before_std": 0.5896169766783714, "reward_change_max": 0.0017495378851890564, "reward_change_mean": -0.408871547318995, "reward_change_min": -0.7276890836656094, "reward_change_std": 0.28704762924462557, "reward_std": 0.6036527529358864, "rewards/cosine_scaled_reward": -0.12262445967644453, "rewards/format_reward": 0.625000013038516, "step": 187 }, { "advantage_max": 1.2339450418949127, "advantage_mean": 1.6142924996742636e-08, "advantage_min": -0.5492425635457039, "advantage_std": 0.6971351355314255, "completion_length": 3499.0416870117188, "epoch": 0.21485714285714286, "grad_norm": 0.17811760306358337, "kl": 0.046234130859375, "lambda_div_used": 0.6, "learning_rate": 8.068211054579943e-07, "loss": 0.0231, "reward": -0.4334766957908869, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.4334766957908869, "reward_after_std": 0.6971351690590382, "reward_before_mean": -0.264811088796705, "reward_before_std": 0.7778226062655449, "reward_change_max": 0.002253696322441101, "reward_change_mean": -0.16866560466587543, "reward_change_min": -0.6171305365860462, "reward_change_std": 0.24329949263483286, "reward_std": 0.697135217487812, "rewards/cosine_scaled_reward": -0.20532220881432295, "rewards/format_reward": 0.14583333767950535, "step": 188 }, { "advantage_max": 1.3194897770881653, "advantage_mean": 1.490116141589226e-08, "advantage_min": -0.5810928530991077, "advantage_std": 0.6943271197378635, "completion_length": 2505.7292251586914, "epoch": 0.216, "grad_norm": 0.1453278362751007, "kl": 0.039276123046875, "lambda_div_used": 0.6, "learning_rate": 8.04235151541222e-07, "loss": 0.0234, "reward": 0.0326367899106117, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.0326367899106117, "reward_after_std": 0.6943271197378635, "reward_before_mean": 0.4496917873620987, "reward_before_std": 0.6022991053760052, "reward_change_max": 0.0003833100199699402, "reward_change_mean": -0.41705494094640017, "reward_change_min": -0.7159639894962311, "reward_change_std": 0.2671029521152377, "reward_std": 0.6943271309137344, "rewards/cosine_scaled_reward": -0.06682079844176769, "rewards/format_reward": 0.5833333395421505, "step": 189 }, { "advantage_max": 1.042035847902298, "advantage_mean": 7.45058070794613e-09, "advantage_min": -0.4305584356188774, "advantage_std": 0.5585471391677856, "completion_length": 2627.1458587646484, "epoch": 0.21714285714285714, "grad_norm": 0.14222775399684906, "kl": 0.0413818359375, "lambda_div_used": 0.6, "learning_rate": 8.01636806561836e-07, "loss": 0.0329, "reward": -0.0020353831350803375, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.0020353831350803375, "reward_after_std": 0.5585471503436565, "reward_before_mean": 0.4232803890481591, "reward_before_std": 0.45277058705687523, "reward_change_max": 0.0002883821725845337, "reward_change_mean": -0.4253157516941428, "reward_change_min": -0.6929913498461246, "reward_change_std": 0.27376699820160866, "reward_std": 0.5585471540689468, "rewards/cosine_scaled_reward": -0.04877648875117302, "rewards/format_reward": 0.520833333954215, "step": 190 }, { "advantage_max": 1.2923248931765556, "advantage_mean": 1.8005570368018198e-08, "advantage_min": -0.6255582571029663, "advantage_std": 0.7107645235955715, "completion_length": 2654.1041717529297, "epoch": 0.21828571428571428, "grad_norm": 0.1821720153093338, "kl": 0.0473175048828125, "lambda_div_used": 0.6, "learning_rate": 7.990261971595048e-07, "loss": 0.0433, "reward": -0.10828890092670918, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.10828890092670918, "reward_after_std": 0.7107645533978939, "reward_before_mean": 0.23151674802647904, "reward_before_std": 0.7050754111260176, "reward_change_max": 0.0015554353594779968, "reward_change_mean": -0.3398056421428919, "reward_change_min": -0.6694670245051384, "reward_change_std": 0.2771302107721567, "reward_std": 0.7107645682990551, "rewards/cosine_scaled_reward": -0.07174161821603775, "rewards/format_reward": 0.37500000931322575, "step": 191 }, { "advantage_max": 1.1991769075393677, "advantage_mean": -2.2972624191819335e-08, "advantage_min": -0.5654369220137596, "advantage_std": 0.6564156897366047, "completion_length": 3291.812530517578, "epoch": 0.21942857142857142, "grad_norm": 0.22897185385227203, "kl": 0.046600341796875, "lambda_div_used": 0.6, "learning_rate": 7.964034505716476e-07, "loss": 0.0558, "reward": -0.2808880601078272, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.2808880601078272, "reward_after_std": 0.6564157009124756, "reward_before_mean": -0.024348472245037556, "reward_before_std": 0.6607515625655651, "reward_change_max": 0.000874444842338562, "reward_change_mean": -0.25653960881754756, "reward_change_min": -0.5323520973324776, "reward_change_std": 0.2226111926138401, "reward_std": 0.6564157158136368, "rewards/cosine_scaled_reward": -0.1788409072905779, "rewards/format_reward": 0.33333333767950535, "step": 192 }, { "advantage_max": 1.4964174404740334, "advantage_mean": 4.346171478264438e-09, "advantage_min": -0.7282641679048538, "advantage_std": 0.8228538744151592, "completion_length": 3145.250030517578, "epoch": 0.22057142857142858, "grad_norm": 0.2540678381919861, "kl": 0.045440673828125, "lambda_div_used": 0.6, "learning_rate": 7.93768694627233e-07, "loss": 0.0548, "reward": -0.13096075784415007, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.13096075784415007, "reward_after_std": 0.8228538781404495, "reward_before_mean": 0.1752599613973871, "reward_before_std": 0.848308339715004, "reward_change_max": 0.0002414211630821228, "reward_change_mean": -0.3062206953763962, "reward_change_min": -0.6018543504178524, "reward_change_std": 0.2552442867308855, "reward_std": 0.8228538781404495, "rewards/cosine_scaled_reward": -0.06862002797424793, "rewards/format_reward": 0.3125000074505806, "step": 193 }, { "advantage_max": 1.5948434360325336, "advantage_mean": -2.421438738409165e-08, "advantage_min": -0.9445933252573013, "advantage_std": 0.8900459110736847, "completion_length": 2825.041732788086, "epoch": 0.22171428571428572, "grad_norm": 0.17521539330482483, "kl": 0.04534912109375, "lambda_div_used": 0.6, "learning_rate": 7.911220577405484e-07, "loss": 0.0347, "reward": 0.3043134193867445, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.3043134193867445, "reward_after_std": 0.8900458980351686, "reward_before_mean": 0.8403315991163254, "reward_before_std": 0.8644460886716843, "reward_change_max": 0.0024929121136665344, "reward_change_mean": -0.5360182207077742, "reward_change_min": -0.9033717587590218, "reward_change_std": 0.38277727644890547, "reward_std": 0.890045927837491, "rewards/cosine_scaled_reward": 0.17016580794006586, "rewards/format_reward": 0.5000000149011612, "step": 194 }, { "advantage_max": 1.6198503710329533, "advantage_mean": 1.3659398501175701e-08, "advantage_min": -0.7073510959744453, "advantage_std": 0.8945747967809439, "completion_length": 3019.354232788086, "epoch": 0.22285714285714286, "grad_norm": 0.27445510029792786, "kl": 0.04925537109375, "lambda_div_used": 0.6, "learning_rate": 7.884636689049422e-07, "loss": 0.0478, "reward": 0.024609943851828575, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.024609943851828575, "reward_after_std": 0.8945747967809439, "reward_before_mean": 0.40517749823629856, "reward_before_std": 0.9060044959187508, "reward_change_max": 0.00021410733461380005, "reward_change_mean": -0.38056755252182484, "reward_change_min": -0.9209701716899872, "reward_change_std": 0.3530062697827816, "reward_std": 0.8945748265832663, "rewards/cosine_scaled_reward": 0.015088742948137224, "rewards/format_reward": 0.3750000074505806, "step": 195 }, { "advantage_max": 1.3014200776815414, "advantage_mean": 2.23517424569053e-08, "advantage_min": -0.6922896057367325, "advantage_std": 0.7207038998603821, "completion_length": 3295.3750610351562, "epoch": 0.224, "grad_norm": 0.1790717989206314, "kl": 0.0604248046875, "lambda_div_used": 0.6, "learning_rate": 7.857936576865356e-07, "loss": 0.0255, "reward": -0.20712384395301342, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.20712384395301342, "reward_after_std": 0.720703911036253, "reward_before_mean": 0.08003285154700279, "reward_before_std": 0.755801547318697, "reward_change_max": 0.0016879886388778687, "reward_change_mean": -0.28715668758377433, "reward_change_min": -0.610984530299902, "reward_change_std": 0.25006009358912706, "reward_std": 0.720703911036253, "rewards/cosine_scaled_reward": -0.06415024306625128, "rewards/format_reward": 0.20833334140479565, "step": 196 }, { "advantage_max": 1.5312520191073418, "advantage_mean": 3.1044084525255755e-09, "advantage_min": -1.0301610231399536, "advantage_std": 0.8983987234532833, "completion_length": 2006.6458740234375, "epoch": 0.22514285714285714, "grad_norm": 0.24713905155658722, "kl": 0.0618438720703125, "lambda_div_used": 0.6, "learning_rate": 7.831121542179086e-07, "loss": 0.0451, "reward": 0.3392133894376457, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.3392133894376457, "reward_after_std": 0.8983987085521221, "reward_before_mean": 0.8995099812746048, "reward_before_std": 0.9543585330247879, "reward_change_max": 0.0011051148176193237, "reward_change_mean": -0.5602965541183949, "reward_change_min": -1.0811551474034786, "reward_change_std": 0.4375645313411951, "reward_std": 0.8983987122774124, "rewards/cosine_scaled_reward": 0.1893383078277111, "rewards/format_reward": 0.5208333432674408, "step": 197 }, { "advantage_max": 1.2494731955230236, "advantage_mean": -6.2088175156738146e-09, "advantage_min": -0.5514694266021252, "advantage_std": 0.6800418961793184, "completion_length": 2836.562530517578, "epoch": 0.22628571428571428, "grad_norm": 0.16534265875816345, "kl": 0.0721435546875, "lambda_div_used": 0.6, "learning_rate": 7.804192891917571e-07, "loss": 0.035, "reward": -0.16476232931017876, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.16476232931017876, "reward_after_std": 0.6800418831408024, "reward_before_mean": 0.14605081919580698, "reward_before_std": 0.6505880728363991, "reward_change_max": 0.005474165081977844, "reward_change_mean": -0.3108131578192115, "reward_change_min": -0.6044091917574406, "reward_change_std": 0.2463654656894505, "reward_std": 0.6800418980419636, "rewards/cosine_scaled_reward": -0.10405792016535997, "rewards/format_reward": 0.35416666977107525, "step": 198 }, { "advantage_max": 1.151242271065712, "advantage_mean": 6.208817127095756e-09, "advantage_min": -0.5328905023634434, "advantage_std": 0.6158375293016434, "completion_length": 2393.7500610351562, "epoch": 0.22742857142857142, "grad_norm": 0.165126234292984, "kl": 0.075531005859375, "lambda_div_used": 0.6, "learning_rate": 7.777151938545235e-07, "loss": 0.0263, "reward": -0.34041555039584637, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.34041555039584637, "reward_after_std": 0.615837536752224, "reward_before_mean": -0.11647915467619896, "reward_before_std": 0.6054133623838425, "reward_change_max": 0.0021468177437782288, "reward_change_mean": -0.22393639385700226, "reward_change_min": -0.47274230420589447, "reward_change_std": 0.1846913443878293, "reward_std": 0.6158375553786755, "rewards/cosine_scaled_reward": -0.17282291005176376, "rewards/format_reward": 0.22916666977107525, "step": 199 }, { "advantage_max": 1.5527655258774757, "advantage_mean": 6.208818459363386e-10, "advantage_min": -0.8078677505254745, "advantage_std": 0.8602624572813511, "completion_length": 2471.104202270508, "epoch": 0.22857142857142856, "grad_norm": 0.17417526245117188, "kl": 0.070556640625, "lambda_div_used": 0.6, "learning_rate": 7.75e-07, "loss": 0.0125, "reward": 0.07807190343737602, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.07807190343737602, "reward_after_std": 0.8602624572813511, "reward_before_mean": 0.4883092427626252, "reward_before_std": 0.8743391893804073, "reward_change_max": 0.0013150349259376526, "reward_change_mean": -0.4102373067289591, "reward_change_min": -0.8249180130660534, "reward_change_std": 0.3349352069199085, "reward_std": 0.860262505710125, "rewards/cosine_scaled_reward": 0.02540461253374815, "rewards/format_reward": 0.4375000074505806, "step": 200 }, { "advantage_max": 1.4167628586292267, "advantage_mean": -1.1102230246251565e-16, "advantage_min": -0.7990463078022003, "advantage_std": 0.7999892868101597, "completion_length": 2076.2083587646484, "epoch": 0.2297142857142857, "grad_norm": 0.18805286288261414, "kl": 0.068634033203125, "lambda_div_used": 0.6, "learning_rate": 7.72273839962904e-07, "loss": 0.0186, "reward": 0.40698003210127354, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.40698003210127354, "reward_after_std": 0.7999892979860306, "reward_before_mean": 1.0221742703579366, "reward_before_std": 0.7416293583810329, "reward_change_max": 0.0007732957601547241, "reward_change_mean": -0.615194228477776, "reward_change_min": -0.9900593794882298, "reward_change_std": 0.41214887611567974, "reward_std": 0.7999893128871918, "rewards/cosine_scaled_reward": 0.24025380797684193, "rewards/format_reward": 0.5416666772216558, "step": 201 }, { "advantage_max": 1.3395941704511642, "advantage_mean": -2.3593506148777976e-08, "advantage_min": -0.6523739323019981, "advantage_std": 0.718066219240427, "completion_length": 2476.250015258789, "epoch": 0.23085714285714284, "grad_norm": 0.28353244066238403, "kl": 0.095794677734375, "lambda_div_used": 0.6, "learning_rate": 7.695368466124296e-07, "loss": -0.0281, "reward": 0.2649739682674408, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.2649739682674408, "reward_after_std": 0.7180662304162979, "reward_before_mean": 0.808698242995888, "reward_before_std": 0.6214682869613171, "reward_change_max": 0.00035375356674194336, "reward_change_mean": -0.5437242835760117, "reward_change_min": -0.8757462315261364, "reward_change_std": 0.3358838642016053, "reward_std": 0.7180662527680397, "rewards/cosine_scaled_reward": 0.15434912405908108, "rewards/format_reward": 0.5000000037252903, "step": 202 }, { "advantage_max": 1.223877239972353, "advantage_mean": -3.1044085080367267e-09, "advantage_min": -0.620630256831646, "advantage_std": 0.6635248996317387, "completion_length": 2952.5833587646484, "epoch": 0.232, "grad_norm": 0.2589733600616455, "kl": 0.09796142578125, "lambda_div_used": 0.6, "learning_rate": 7.667891533457718e-07, "loss": 0.0524, "reward": -0.05526367016136646, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.05526367016136646, "reward_after_std": 0.6635248959064484, "reward_before_mean": 0.3243457265198231, "reward_before_std": 0.6201132908463478, "reward_change_max": 0.0011578574776649475, "reward_change_mean": -0.3796093687415123, "reward_change_min": -0.6677472069859505, "reward_change_std": 0.2712924610823393, "reward_std": 0.6635249182581902, "rewards/cosine_scaled_reward": 0.026756178587675095, "rewards/format_reward": 0.2708333395421505, "step": 203 }, { "advantage_max": 1.1059998497366905, "advantage_mean": 7.140140145800444e-09, "advantage_min": -0.591538067907095, "advantage_std": 0.6298722177743912, "completion_length": 2211.312545776367, "epoch": 0.23314285714285715, "grad_norm": 0.191162571310997, "kl": 0.09075927734375, "lambda_div_used": 0.6, "learning_rate": 7.640308940816239e-07, "loss": 0.0189, "reward": 0.08096916507929564, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.08096916507929564, "reward_after_std": 0.6298722326755524, "reward_before_mean": 0.5430826265364885, "reward_before_std": 0.589900765568018, "reward_change_max": 0.0, "reward_change_mean": -0.46211343444883823, "reward_change_min": -0.8006623312830925, "reward_change_std": 0.31877398304641247, "reward_std": 0.6298722624778748, "rewards/cosine_scaled_reward": -0.0826253816485405, "rewards/format_reward": 0.7083333395421505, "step": 204 }, { "advantage_max": 1.4411217793822289, "advantage_mean": -1.6763806842678974e-08, "advantage_min": -0.9071615114808083, "advantage_std": 0.8484151288866997, "completion_length": 2679.2708740234375, "epoch": 0.2342857142857143, "grad_norm": 0.2707881033420563, "kl": 0.08770751953125, "lambda_div_used": 0.6, "learning_rate": 7.612622032536507e-07, "loss": 0.0232, "reward": 0.11488889902830124, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.11488889902830124, "reward_after_std": 0.8484151251614094, "reward_before_mean": 0.564969852566719, "reward_before_std": 0.9211336225271225, "reward_change_max": 0.0, "reward_change_mean": -0.450080968439579, "reward_change_min": -0.8733410649001598, "reward_change_std": 0.380622087046504, "reward_std": 0.8484151512384415, "rewards/cosine_scaled_reward": 0.10540158860385418, "rewards/format_reward": 0.354166679084301, "step": 205 }, { "advantage_max": 1.9362114071846008, "advantage_mean": 6.208817904251873e-10, "advantage_min": -0.6879184618592262, "advantage_std": 1.0034943111240864, "completion_length": 3153.7500228881836, "epoch": 0.23542857142857143, "grad_norm": 0.40357130765914917, "kl": 0.1099853515625, "lambda_div_used": 0.6, "learning_rate": 7.584832158039378e-07, "loss": 0.0197, "reward": -0.1742401469964534, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.1742401469964534, "reward_after_std": 1.0034943148493767, "reward_before_mean": 0.05573885515332222, "reward_before_std": 0.9908393323421478, "reward_change_max": 0.00036820024251937866, "reward_change_mean": -0.22997901123017073, "reward_change_min": -0.5386604145169258, "reward_change_std": 0.21234728395938873, "reward_std": 1.0034943595528603, "rewards/cosine_scaled_reward": -0.10754724405705929, "rewards/format_reward": 0.2708333358168602, "step": 206 }, { "advantage_max": 1.4247949346899986, "advantage_mean": -1.1102230246251565e-16, "advantage_min": -0.6534974798560143, "advantage_std": 0.7754468694329262, "completion_length": 2983.0625610351562, "epoch": 0.23657142857142857, "grad_norm": 0.3701333701610565, "kl": 0.12939453125, "lambda_div_used": 0.6, "learning_rate": 7.556940671764124e-07, "loss": -0.0036, "reward": -0.2597499608527869, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.2597499608527869, "reward_after_std": 0.775446891784668, "reward_before_mean": -0.024273216724395752, "reward_before_std": 0.8059664107859135, "reward_change_max": 0.005939692258834839, "reward_change_mean": -0.23547676112502813, "reward_change_min": -0.5890285782516003, "reward_change_std": 0.24587653204798698, "reward_std": 0.7754469364881516, "rewards/cosine_scaled_reward": -0.17880328325554729, "rewards/format_reward": 0.33333333767950535, "step": 207 }, { "advantage_max": 1.4248832762241364, "advantage_mean": 6.829699306099002e-09, "advantage_min": -0.7578137814998627, "advantage_std": 0.7782737948000431, "completion_length": 2421.3125762939453, "epoch": 0.2377142857142857, "grad_norm": 0.2831918001174927, "kl": 0.097442626953125, "lambda_div_used": 0.6, "learning_rate": 7.528948933102438e-07, "loss": 0.0003, "reward": 0.18712701415643096, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.18712701415643096, "reward_after_std": 0.7782737948000431, "reward_before_mean": 0.6776773352175951, "reward_before_std": 0.7428862564265728, "reward_change_max": 0.0, "reward_change_mean": -0.49055031593889, "reward_change_min": -0.8148558661341667, "reward_change_std": 0.32612331211566925, "reward_std": 0.778273805975914, "rewards/cosine_scaled_reward": 0.057588656432926655, "rewards/format_reward": 0.5625000055879354, "step": 208 }, { "advantage_max": 1.187922965735197, "advantage_mean": 1.8626449271863521e-09, "advantage_min": -0.47378795593976974, "advantage_std": 0.631999060511589, "completion_length": 2732.4375228881836, "epoch": 0.23885714285714285, "grad_norm": 0.2125997692346573, "kl": 0.128662109375, "lambda_div_used": 0.6, "learning_rate": 7.500858306332172e-07, "loss": 0.0075, "reward": 0.10858845058828592, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.10858845058828592, "reward_after_std": 0.6319990754127502, "reward_before_mean": 0.5761509407311678, "reward_before_std": 0.5213535856455564, "reward_change_max": 0.002324998378753662, "reward_change_mean": -0.467562448233366, "reward_change_min": -0.7992461994290352, "reward_change_std": 0.29136813152581453, "reward_std": 0.6319991014897823, "rewards/cosine_scaled_reward": 0.058908781968057156, "rewards/format_reward": 0.4583333358168602, "step": 209 }, { "advantage_max": 1.3186798729002476, "advantage_mean": -3.1044086745701804e-10, "advantage_min": -0.5938226599246264, "advantage_std": 0.7046267911791801, "completion_length": 2671.250045776367, "epoch": 0.24, "grad_norm": 0.25899404287338257, "kl": 0.10009765625, "lambda_div_used": 0.6, "learning_rate": 7.472670160550848e-07, "loss": 0.0317, "reward": 0.018894458189606667, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.018894458189606667, "reward_after_std": 0.7046268098056316, "reward_before_mean": 0.4229349633678794, "reward_before_std": 0.6473653120920062, "reward_change_max": 0.0013660937547683716, "reward_change_mean": -0.40404045954346657, "reward_change_min": -0.6701996028423309, "reward_change_std": 0.27499571815133095, "reward_std": 0.7046268302947283, "rewards/cosine_scaled_reward": 0.013550772797316313, "rewards/format_reward": 0.39583333395421505, "step": 210 }, { "advantage_max": 1.372512400150299, "advantage_mean": -2.110997859849917e-08, "advantage_min": -0.6729023642838001, "advantage_std": 0.7465317733585835, "completion_length": 2482.0833740234375, "epoch": 0.24114285714285713, "grad_norm": 0.31670844554901123, "kl": 0.12445068359375, "lambda_div_used": 0.6, "learning_rate": 7.444385869608921e-07, "loss": 0.0257, "reward": -0.019909057766199112, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.019909057766199112, "reward_after_std": 0.7465317510068417, "reward_before_mean": 0.35514324717223644, "reward_before_std": 0.7346069552004337, "reward_change_max": 0.001021161675453186, "reward_change_mean": -0.37505233788397163, "reward_change_min": -0.6400614865124226, "reward_change_std": 0.26454370305873454, "reward_std": 0.7465317659080029, "rewards/cosine_scaled_reward": -0.06201172433793545, "rewards/format_reward": 0.479166679084301, "step": 211 }, { "advantage_max": 1.4231358543038368, "advantage_mean": -8.071462720415923e-09, "advantage_min": -0.8575638234615326, "advantage_std": 0.8098030164837837, "completion_length": 2606.6250762939453, "epoch": 0.2422857142857143, "grad_norm": 0.40781182050704956, "kl": 0.128204345703125, "lambda_div_used": 0.6, "learning_rate": 7.416006812042827e-07, "loss": 0.0841, "reward": 0.09332763217389584, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.09332763217389584, "reward_after_std": 0.8098030164837837, "reward_before_mean": 0.5334172658622265, "reward_before_std": 0.840537752956152, "reward_change_max": 0.0007770806550979614, "reward_change_mean": -0.44008962251245975, "reward_change_min": -0.8539436981081963, "reward_change_std": 0.3552567921578884, "reward_std": 0.8098030164837837, "rewards/cosine_scaled_reward": 0.016708621755242348, "rewards/format_reward": 0.5000000074505806, "step": 212 }, { "advantage_max": 1.6412783414125443, "advantage_mean": -1.1175871450497255e-08, "advantage_min": -0.8117294907569885, "advantage_std": 0.8886349983513355, "completion_length": 2581.3541870117188, "epoch": 0.24342857142857144, "grad_norm": 0.286513090133667, "kl": 0.1539306640625, "lambda_div_used": 0.6, "learning_rate": 7.387534371007797e-07, "loss": 0.0005, "reward": 0.2363802082836628, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.2363802082836628, "reward_after_std": 0.8886349834501743, "reward_before_mean": 0.724748283624649, "reward_before_std": 0.8444754183292389, "reward_change_max": 0.0027549341320991516, "reward_change_mean": -0.48836808931082487, "reward_change_min": -0.9352367371320724, "reward_change_std": 0.35737268533557653, "reward_std": 0.8886350020766258, "rewards/cosine_scaled_reward": 0.07070746086537838, "rewards/format_reward": 0.5833333395421505, "step": 213 }, { "advantage_max": 1.603540975600481, "advantage_mean": 5.587935447692871e-09, "advantage_min": -0.8505863398313522, "advantage_std": 0.9302494525909424, "completion_length": 2973.0625610351562, "epoch": 0.24457142857142858, "grad_norm": 0.411465585231781, "kl": 0.144775390625, "lambda_div_used": 0.6, "learning_rate": 7.358969934210438e-07, "loss": 0.0687, "reward": 0.009196583181619644, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.009196583181619644, "reward_after_std": 0.9302494451403618, "reward_before_mean": 0.37914169020950794, "reward_before_std": 1.0218745321035385, "reward_change_max": 6.617605686187744e-05, "reward_change_mean": -0.36994508653879166, "reward_change_min": -0.9427313208580017, "reward_change_std": 0.3748389510437846, "reward_std": 0.930249460041523, "rewards/cosine_scaled_reward": -0.039595833979547024, "rewards/format_reward": 0.4583333395421505, "step": 214 }, { "advantage_max": 0.8581725731492043, "advantage_mean": 9.313226634333205e-09, "advantage_min": -0.40347205474972725, "advantage_std": 0.46334000304341316, "completion_length": 2574.8541946411133, "epoch": 0.24571428571428572, "grad_norm": 0.13390909135341644, "kl": 0.13763427734375, "lambda_div_used": 0.6, "learning_rate": 7.330314893841101e-07, "loss": 0.0165, "reward": -0.2104439791291952, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.2104439791291952, "reward_after_std": 0.46334001794457436, "reward_before_mean": 0.11830021068453789, "reward_before_std": 0.4097316600382328, "reward_change_max": 0.001212291419506073, "reward_change_mean": -0.3287441972643137, "reward_change_min": -0.5551458112895489, "reward_change_std": 0.20913261640816927, "reward_std": 0.46334002912044525, "rewards/cosine_scaled_reward": -0.1804332360625267, "rewards/format_reward": 0.47916668094694614, "step": 215 }, { "advantage_max": 1.5061615630984306, "advantage_mean": -2.4835269951672956e-09, "advantage_min": -0.8598283156752586, "advantage_std": 0.8617348670959473, "completion_length": 2333.916702270508, "epoch": 0.24685714285714286, "grad_norm": 0.6332868933677673, "kl": 0.17523193359375, "lambda_div_used": 0.6, "learning_rate": 7.301570646506027e-07, "loss": 0.0876, "reward": 0.04702269285917282, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.04702269285917282, "reward_after_std": 0.8617348782718182, "reward_before_mean": 0.44580378383398056, "reward_before_std": 0.9096739925444126, "reward_change_max": 0.0006563737988471985, "reward_change_mean": -0.39878107607364655, "reward_change_min": -0.8786581009626389, "reward_change_std": 0.35184421949088573, "reward_std": 0.8617349080741405, "rewards/cosine_scaled_reward": -0.07918145949952304, "rewards/format_reward": 0.6041666772216558, "step": 216 }, { "advantage_max": 1.6048918068408966, "advantage_mean": -4.967053990334591e-09, "advantage_min": -0.7554399445652962, "advantage_std": 0.891699094325304, "completion_length": 2728.770851135254, "epoch": 0.248, "grad_norm": 0.47174057364463806, "kl": 0.17547607421875, "lambda_div_used": 0.6, "learning_rate": 7.27273859315928e-07, "loss": 0.0602, "reward": 0.09491589106619358, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.09491589106619358, "reward_after_std": 0.8916990831494331, "reward_before_mean": 0.5138254193589091, "reward_before_std": 0.9023692905902863, "reward_change_max": 0.0, "reward_change_mean": -0.418909530621022, "reward_change_min": -0.8360408432781696, "reward_change_std": 0.3293918455019593, "reward_std": 0.8916990980505943, "rewards/cosine_scaled_reward": 0.06941270176321268, "rewards/format_reward": 0.3750000074505806, "step": 217 }, { "advantage_max": 1.4350545406341553, "advantage_mean": 2.4835269007983385e-08, "advantage_min": -0.6967899017035961, "advantage_std": 0.7781109362840652, "completion_length": 2717.8750762939453, "epoch": 0.24914285714285714, "grad_norm": 0.4780595600605011, "kl": 0.17071533203125, "lambda_div_used": 0.6, "learning_rate": 7.243820139034464e-07, "loss": 0.0052, "reward": -0.06596539542078972, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.06596539542078972, "reward_after_std": 0.7781109698116779, "reward_before_mean": 0.27800588123500347, "reward_before_std": 0.7713658884167671, "reward_change_max": 0.0009939447045326233, "reward_change_mean": -0.3439712468534708, "reward_change_min": -0.6784108616411686, "reward_change_std": 0.27120585925877094, "reward_std": 0.7781109921634197, "rewards/cosine_scaled_reward": -0.09016373474150896, "rewards/format_reward": 0.45833334513008595, "step": 218 }, { "advantage_max": 1.515500433743, "advantage_mean": -9.934107203513065e-09, "advantage_min": -0.6553824432194233, "advantage_std": 0.8291011936962605, "completion_length": 2393.9583892822266, "epoch": 0.2502857142857143, "grad_norm": 0.35546576976776123, "kl": 0.21331787109375, "lambda_div_used": 0.6, "learning_rate": 7.214816693576234e-07, "loss": 0.0379, "reward": 0.06899490812793374, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.06899490812793374, "reward_after_std": 0.829101212322712, "reward_before_mean": 0.479990987572819, "reward_before_std": 0.8050318211317062, "reward_change_max": 0.0008395984768867493, "reward_change_mean": -0.4109960775822401, "reward_change_min": -0.8359643630683422, "reward_change_std": 0.32415905967354774, "reward_std": 0.8291012309491634, "rewards/cosine_scaled_reward": -0.01000452577136457, "rewards/format_reward": 0.5000000111758709, "step": 219 }, { "advantage_max": 0.7503951676189899, "advantage_mean": 1.2417634420724966e-08, "advantage_min": -0.37694836407899857, "advantage_std": 0.4157340805977583, "completion_length": 2755.1250228881836, "epoch": 0.25142857142857145, "grad_norm": 0.26302239298820496, "kl": 0.22412109375, "lambda_div_used": 0.6, "learning_rate": 7.185729670371604e-07, "loss": 0.03, "reward": -0.4798909847741015, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.4798909847741015, "reward_after_std": 0.41573409363627434, "reward_before_mean": -0.28886884544044733, "reward_before_std": 0.4156207535415888, "reward_change_max": 0.0006025433540344238, "reward_change_mean": -0.19102213997393847, "reward_change_min": -0.3795706331729889, "reward_change_std": 0.15820336434990168, "reward_std": 0.41573410853743553, "rewards/cosine_scaled_reward": -0.2798510938882828, "rewards/format_reward": 0.27083333395421505, "step": 220 }, { "advantage_max": 1.3563694432377815, "advantage_mean": -1.8626449826975033e-09, "advantage_min": -0.6945677511394024, "advantage_std": 0.7419977821409702, "completion_length": 1854.3958587646484, "epoch": 0.25257142857142856, "grad_norm": 0.2989541292190552, "kl": 0.149749755859375, "lambda_div_used": 0.6, "learning_rate": 7.156560487081051e-07, "loss": -0.0129, "reward": 0.10412659542635083, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.10412659542635083, "reward_after_std": 0.7419977821409702, "reward_before_mean": 0.5544167058542371, "reward_before_std": 0.696466002613306, "reward_change_max": 0.00045955926179885864, "reward_change_mean": -0.4502900801599026, "reward_change_min": -0.8158979564905167, "reward_change_std": 0.3151663765311241, "reward_std": 0.7419977933168411, "rewards/cosine_scaled_reward": 0.01679166965186596, "rewards/format_reward": 0.520833345130086, "step": 221 }, { "advantage_max": 1.2615111097693443, "advantage_mean": -3.7485734682984884e-08, "advantage_min": -0.8112612888216972, "advantage_std": 0.7181127965450287, "completion_length": 2456.375015258789, "epoch": 0.2537142857142857, "grad_norm": 0.4287218153476715, "kl": 0.2139892578125, "lambda_div_used": 0.6, "learning_rate": 7.127310565369415e-07, "loss": 0.0029, "reward": 0.12237783218733966, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.12237783218733966, "reward_after_std": 0.7181127928197384, "reward_before_mean": 0.5944197215139866, "reward_before_std": 0.7205285355448723, "reward_change_max": 0.00046860426664352417, "reward_change_mean": -0.47204189747571945, "reward_change_min": -0.8188545815646648, "reward_change_std": 0.3446765150874853, "reward_std": 0.7181127965450287, "rewards/cosine_scaled_reward": 0.005543181672692299, "rewards/format_reward": 0.5833333395421505, "step": 222 }, { "advantage_max": 1.2473274320363998, "advantage_mean": 9.934108091691485e-09, "advantage_min": -0.5829054936766624, "advantage_std": 0.6753346417099237, "completion_length": 2532.208381652832, "epoch": 0.25485714285714284, "grad_norm": 0.23588241636753082, "kl": 0.20941162109375, "lambda_div_used": 0.6, "learning_rate": 7.097981330836616e-07, "loss": 0.0158, "reward": -0.045638011768460274, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.045638011768460274, "reward_after_std": 0.6753346417099237, "reward_before_mean": 0.3341361992061138, "reward_before_std": 0.6450802572071552, "reward_change_max": 0.0019773244857788086, "reward_change_mean": -0.3797741485759616, "reward_change_min": -0.6561138592660427, "reward_change_std": 0.25462134182453156, "reward_std": 0.6753346435725689, "rewards/cosine_scaled_reward": -0.08293193019926548, "rewards/format_reward": 0.5000000093132257, "step": 223 }, { "advantage_max": 1.8461582660675049, "advantage_mean": -1.2417632477834672e-09, "advantage_min": -0.8299898952245712, "advantage_std": 1.0023904666304588, "completion_length": 2865.1250762939453, "epoch": 0.256, "grad_norm": 0.8151166439056396, "kl": 0.2145233154296875, "lambda_div_used": 0.6, "learning_rate": 7.068574212948169e-07, "loss": 0.0747, "reward": 0.04341099318116903, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.04341099318116903, "reward_after_std": 1.002390455454588, "reward_before_mean": 0.40595949813723564, "reward_before_std": 1.0213181041181087, "reward_change_max": 0.001161724328994751, "reward_change_mean": -0.3625485133379698, "reward_change_min": -0.7576224133372307, "reward_change_std": 0.3075553746894002, "reward_std": 1.0023904591798782, "rewards/cosine_scaled_reward": 0.046729736030101776, "rewards/format_reward": 0.31250000931322575, "step": 224 }, { "advantage_max": 1.160877875983715, "advantage_mean": -1.241763414316921e-09, "advantage_min": -0.5418887920677662, "advantage_std": 0.6181628629565239, "completion_length": 2925.6667098999023, "epoch": 0.2571428571428571, "grad_norm": 0.48488402366638184, "kl": 0.2423095703125, "lambda_div_used": 0.6, "learning_rate": 7.039090644965509e-07, "loss": 0.0407, "reward": -0.09848776552826166, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.09848776552826166, "reward_after_std": 0.6181628704071045, "reward_before_mean": 0.2601957079023123, "reward_before_std": 0.5574781373143196, "reward_change_max": 0.0013304725289344788, "reward_change_mean": -0.3586834678426385, "reward_change_min": -0.6151060201227665, "reward_change_std": 0.24173601809889078, "reward_std": 0.6181629002094269, "rewards/cosine_scaled_reward": -0.08865214767865837, "rewards/format_reward": 0.43750000558793545, "step": 225 }, { "advantage_max": 1.6093599423766136, "advantage_mean": 4.3461718668424965e-09, "advantage_min": -0.77116759121418, "advantage_std": 0.8485618270933628, "completion_length": 2593.7083740234375, "epoch": 0.2582857142857143, "grad_norm": 0.4360467195510864, "kl": 0.2156982421875, "lambda_div_used": 0.6, "learning_rate": 7.009532063876148e-07, "loss": 0.0137, "reward": 0.37242993898689747, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.37242993898689747, "reward_after_std": 0.8485618270933628, "reward_before_mean": 0.9451485552563099, "reward_before_std": 0.7240049056708813, "reward_change_max": 0.0004467219114303589, "reward_change_mean": -0.5727185849100351, "reward_change_min": -0.9093927554786205, "reward_change_std": 0.35986475832760334, "reward_std": 0.8485618606209755, "rewards/cosine_scaled_reward": 0.20174093917012215, "rewards/format_reward": 0.5416666697710752, "step": 226 }, { "advantage_max": 1.6921913027763367, "advantage_mean": -2.2351741790771484e-08, "advantage_min": -0.8459499292075634, "advantage_std": 0.9304548464715481, "completion_length": 2420.333381652832, "epoch": 0.25942857142857145, "grad_norm": 0.5740240812301636, "kl": 0.211822509765625, "lambda_div_used": 0.6, "learning_rate": 6.979899910323624e-07, "loss": 0.0551, "reward": 0.050073117949068546, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.050073117949068546, "reward_after_std": 0.9304548241198063, "reward_before_mean": 0.43043838534504175, "reward_before_std": 0.9527036100625992, "reward_change_max": 0.0013343244791030884, "reward_change_mean": -0.38036529161036015, "reward_change_min": -0.8623692579567432, "reward_change_std": 0.32892103493213654, "reward_std": 0.9304548390209675, "rewards/cosine_scaled_reward": -0.04519747570157051, "rewards/format_reward": 0.5208333469927311, "step": 227 }, { "advantage_max": 1.1782990470528603, "advantage_mean": -1.2417634698280722e-08, "advantage_min": -0.599372997879982, "advantage_std": 0.631151270121336, "completion_length": 2588.812545776367, "epoch": 0.26057142857142856, "grad_norm": 0.1961567997932434, "kl": 0.232666015625, "lambda_div_used": 0.6, "learning_rate": 6.950195628537299e-07, "loss": 0.0311, "reward": 0.004836801439523697, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.004836801439523697, "reward_after_std": 0.6311512663960457, "reward_before_mean": 0.4207380283623934, "reward_before_std": 0.5706529878079891, "reward_change_max": 0.0011245310306549072, "reward_change_mean": -0.415901237167418, "reward_change_min": -0.6850415766239166, "reward_change_std": 0.2639042199589312, "reward_std": 0.6311512775719166, "rewards/cosine_scaled_reward": 0.002035675570368767, "rewards/format_reward": 0.41666667349636555, "step": 228 }, { "advantage_max": 1.0126456022262573, "advantage_mean": -1.241763414316921e-09, "advantage_min": -0.4646928757429123, "advantage_std": 0.549139428883791, "completion_length": 2947.729217529297, "epoch": 0.26171428571428573, "grad_norm": 0.32021525502204895, "kl": 0.3187255859375, "lambda_div_used": 0.6, "learning_rate": 6.920420666261961e-07, "loss": 0.0374, "reward": -0.21300336718559265, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.21300336718559265, "reward_after_std": 0.549139428883791, "reward_before_mean": 0.09695982187986374, "reward_before_std": 0.5107403621077538, "reward_change_max": 0.002090074121952057, "reward_change_mean": -0.3099631778895855, "reward_change_min": -0.5851683095097542, "reward_change_std": 0.2205530758947134, "reward_std": 0.5491394437849522, "rewards/cosine_scaled_reward": -0.055686766281723976, "rewards/format_reward": 0.2083333358168602, "step": 229 }, { "advantage_max": 1.4498926997184753, "advantage_mean": 1.2417634254191512e-08, "advantage_min": -0.5673705451190472, "advantage_std": 0.7717844881117344, "completion_length": 3185.7708740234375, "epoch": 0.26285714285714284, "grad_norm": 0.4693439304828644, "kl": 0.3175048828125, "lambda_div_used": 0.6, "learning_rate": 6.890576474687263e-07, "loss": 0.0575, "reward": -0.2647488545626402, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.2647488545626402, "reward_after_std": 0.7717844769358635, "reward_before_mean": -0.028830239549279213, "reward_before_std": 0.7798666022717953, "reward_change_max": 0.0021414458751678467, "reward_change_mean": -0.2359186140820384, "reward_change_min": -0.6099616996943951, "reward_change_std": 0.22370487917214632, "reward_std": 0.771784495562315, "rewards/cosine_scaled_reward": -0.16024844953790307, "rewards/format_reward": 0.29166667349636555, "step": 230 }, { "advantage_max": 1.1504193618893623, "advantage_mean": 3.1044091186593903e-09, "advantage_min": -0.4140487276017666, "advantage_std": 0.5968481115996838, "completion_length": 2979.041717529297, "epoch": 0.264, "grad_norm": 0.42601606249809265, "kl": 0.343231201171875, "lambda_div_used": 0.6, "learning_rate": 6.860664508377001e-07, "loss": 0.0293, "reward": -0.21346711833029985, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.21346711833029985, "reward_after_std": 0.5968481153249741, "reward_before_mean": 0.08721326105296612, "reward_before_std": 0.5268659126013517, "reward_change_max": 0.0014610588550567627, "reward_change_mean": -0.3006803933531046, "reward_change_min": -0.5531118325889111, "reward_change_std": 0.20393710862845182, "reward_std": 0.5968481171876192, "rewards/cosine_scaled_reward": -0.13347670319490135, "rewards/format_reward": 0.35416666977107525, "step": 231 }, { "advantage_max": 1.2305606603622437, "advantage_mean": 9.934108036180334e-09, "advantage_min": -0.46694882586598396, "advantage_std": 0.6468341052532196, "completion_length": 3071.7708587646484, "epoch": 0.2651428571428571, "grad_norm": 0.272504061460495, "kl": 0.3377685546875, "lambda_div_used": 0.6, "learning_rate": 6.83068622519821e-07, "loss": 0.0473, "reward": -0.31791983102448285, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.31791983102448285, "reward_after_std": 0.6468341238796711, "reward_before_mean": -0.08504897356033325, "reward_before_std": 0.6210133656859398, "reward_change_max": 0.002808481454849243, "reward_change_mean": -0.23287085350602865, "reward_change_min": -0.5016565397381783, "reward_change_std": 0.1863990006968379, "reward_std": 0.6468341499567032, "rewards/cosine_scaled_reward": -0.2300244935322553, "rewards/format_reward": 0.3750000074505806, "step": 232 }, { "advantage_max": 1.2980494424700737, "advantage_mean": 7.45058070794613e-09, "advantage_min": -0.6965583972632885, "advantage_std": 0.7367055304348469, "completion_length": 2740.0209045410156, "epoch": 0.2662857142857143, "grad_norm": 0.794073760509491, "kl": 0.32879638671875, "lambda_div_used": 0.6, "learning_rate": 6.800643086250121e-07, "loss": 0.0914, "reward": -0.07508878409862518, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.07508878409862518, "reward_after_std": 0.7367055304348469, "reward_before_mean": 0.28439231449738145, "reward_before_std": 0.7708801850676537, "reward_change_max": 0.0, "reward_change_mean": -0.35948106786236167, "reward_change_min": -0.7288849875330925, "reward_change_std": 0.29315769439563155, "reward_std": 0.7367055453360081, "rewards/cosine_scaled_reward": -0.07655386440455914, "rewards/format_reward": 0.43750000931322575, "step": 233 }, { "advantage_max": 1.110601656138897, "advantage_mean": 1.7695129667094633e-08, "advantage_min": -0.6541510969400406, "advantage_std": 0.6425810307264328, "completion_length": 2760.6459007263184, "epoch": 0.2674285714285714, "grad_norm": 0.41915634274482727, "kl": 0.32171630859375, "lambda_div_used": 0.6, "learning_rate": 6.770536555792944e-07, "loss": 0.0248, "reward": -0.13608171977102757, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.13608171977102757, "reward_after_std": 0.6425810270011425, "reward_before_mean": 0.2089826725423336, "reward_before_std": 0.676423005759716, "reward_change_max": 0.0001988634467124939, "reward_change_mean": -0.34506439371034503, "reward_change_min": -0.6399761959910393, "reward_change_std": 0.2727101487107575, "reward_std": 0.642581045627594, "rewards/cosine_scaled_reward": -0.1038419995456934, "rewards/format_reward": 0.41666666977107525, "step": 234 }, { "advantage_max": 1.3879477083683014, "advantage_mean": -1.2417634809303024e-08, "advantage_min": -0.702332578599453, "advantage_std": 0.7649212591350079, "completion_length": 2257.4791717529297, "epoch": 0.26857142857142857, "grad_norm": 0.5285465121269226, "kl": 0.29833984375, "lambda_div_used": 0.6, "learning_rate": 6.740368101176495e-07, "loss": 0.0536, "reward": 0.12406621873378754, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.12406621873378754, "reward_after_std": 0.7649212367832661, "reward_before_mean": 0.5831809509545565, "reward_before_std": 0.7322921007871628, "reward_change_max": 0.0016368404030799866, "reward_change_mean": -0.45911472756415606, "reward_change_min": -0.8515269458293915, "reward_change_std": 0.32818734738975763, "reward_std": 0.764921247959137, "rewards/cosine_scaled_reward": 0.02075712662190199, "rewards/format_reward": 0.5416666716337204, "step": 235 }, { "advantage_max": 1.4723257198929787, "advantage_mean": 1.6142925107764938e-08, "advantage_min": -0.6119029596447945, "advantage_std": 0.7948046922683716, "completion_length": 2917.666748046875, "epoch": 0.26971428571428574, "grad_norm": 0.39669105410575867, "kl": 0.313720703125, "lambda_div_used": 0.6, "learning_rate": 6.710139192768694e-07, "loss": 0.0282, "reward": -0.14454367384314537, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.14454367384314537, "reward_after_std": 0.7948046959936619, "reward_before_mean": 0.15732058137655258, "reward_before_std": 0.7951191272586584, "reward_change_max": 0.0004043877124786377, "reward_change_mean": -0.30186421098187566, "reward_change_min": -0.7042200863361359, "reward_change_std": 0.26512502413243055, "reward_std": 0.7948047444224358, "rewards/cosine_scaled_reward": -0.14008972607553005, "rewards/format_reward": 0.4375000037252903, "step": 236 }, { "advantage_max": 1.273007720708847, "advantage_mean": -1.9868215128671096e-08, "advantage_min": -0.7582220807671547, "advantage_std": 0.7382465023547411, "completion_length": 2680.7083587646484, "epoch": 0.27085714285714285, "grad_norm": 0.6276447772979736, "kl": 0.2998046875, "lambda_div_used": 0.6, "learning_rate": 6.679851303883891e-07, "loss": 0.068, "reward": 0.12813042849302292, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.12813042849302292, "reward_after_std": 0.7382464874535799, "reward_before_mean": 0.5997425927780569, "reward_before_std": 0.7308066599071026, "reward_change_max": 0.0017982348799705505, "reward_change_mean": -0.4716122280806303, "reward_change_min": -0.8231159038841724, "reward_change_std": 0.35379940923303366, "reward_std": 0.7382465451955795, "rewards/cosine_scaled_reward": 0.029037967324256897, "rewards/format_reward": 0.5416666772216558, "step": 237 }, { "advantage_max": 1.7318901792168617, "advantage_mean": -1.9247333560290514e-08, "advantage_min": -0.9657696727663279, "advantage_std": 0.9731025137007236, "completion_length": 2463.729232788086, "epoch": 0.272, "grad_norm": 0.9471742510795593, "kl": 0.32684326171875, "lambda_div_used": 0.6, "learning_rate": 6.649505910711058e-07, "loss": 0.0699, "reward": 0.355283772572875, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.355283772572875, "reward_after_std": 0.9731024987995625, "reward_before_mean": 0.9031340063083917, "reward_before_std": 0.9712144713848829, "reward_change_max": 0.0004456937313079834, "reward_change_mean": -0.5478502493351698, "reward_change_min": -1.0639617405831814, "reward_change_std": 0.41797661781311035, "reward_std": 0.9731025658547878, "rewards/cosine_scaled_reward": 0.08698366861790419, "rewards/format_reward": 0.7291666846722364, "step": 238 }, { "advantage_max": 1.5018570870161057, "advantage_mean": -2.0489097085629737e-08, "advantage_min": -0.7982526607811451, "advantage_std": 0.8127573877573013, "completion_length": 2166.1666984558105, "epoch": 0.27314285714285713, "grad_norm": 0.36692744493484497, "kl": 0.2796630859375, "lambda_div_used": 0.6, "learning_rate": 6.619104492241847e-07, "loss": 0.03, "reward": 0.2998898196965456, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.2998898196965456, "reward_after_std": 0.8127573877573013, "reward_before_mean": 0.8436064720153809, "reward_before_std": 0.738035224378109, "reward_change_max": 0.0012630298733711243, "reward_change_mean": -0.5437166802585125, "reward_change_min": -0.9020101800560951, "reward_change_std": 0.3636026941239834, "reward_std": 0.8127574101090431, "rewards/cosine_scaled_reward": 0.1613865476101637, "rewards/format_reward": 0.5208333376795053, "step": 239 }, { "advantage_max": 0.97071772813797, "advantage_mean": 1.7384688466570708e-08, "advantage_min": -0.4119010157883167, "advantage_std": 0.5121559863910079, "completion_length": 3031.812515258789, "epoch": 0.2742857142857143, "grad_norm": 0.821390688419342, "kl": 0.65576171875, "lambda_div_used": 0.6, "learning_rate": 6.588648530198504e-07, "loss": 0.0514, "reward": -0.32652486581355333, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.32652486581355333, "reward_after_std": 0.5121559603139758, "reward_before_mean": -0.07132751506287605, "reward_before_std": 0.4690159521996975, "reward_change_max": 0.009721644222736359, "reward_change_mean": -0.25519734993577003, "reward_change_min": -0.4614217281341553, "reward_change_std": 0.1805514907464385, "reward_std": 0.512155975215137, "rewards/cosine_scaled_reward": -0.22316376119852066, "rewards/format_reward": 0.3750000037252903, "step": 240 }, { "advantage_max": 1.0398504100739956, "advantage_mean": 2.7939678126642775e-08, "advantage_min": -0.5276342928409576, "advantage_std": 0.5748845934867859, "completion_length": 3315.875, "epoch": 0.2754285714285714, "grad_norm": 0.49699658155441284, "kl": 0.58642578125, "lambda_div_used": 0.6, "learning_rate": 6.558139508961654e-07, "loss": 0.062, "reward": -0.4127648015273735, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.4127648015273735, "reward_after_std": 0.5748845934867859, "reward_before_mean": -0.21113576227799058, "reward_before_std": 0.6055608876049519, "reward_change_max": 0.0006798282265663147, "reward_change_mean": -0.20162902306765318, "reward_change_min": -0.4717538245022297, "reward_change_std": 0.19875234365463257, "reward_std": 0.5748846232891083, "rewards/cosine_scaled_reward": -0.22015121672302485, "rewards/format_reward": 0.22916667349636555, "step": 241 }, { "advantage_max": 1.3193005844950676, "advantage_mean": -9.934107481068821e-09, "advantage_min": -0.48087476566433907, "advantage_std": 0.671697337180376, "completion_length": 2253.812545776367, "epoch": 0.2765714285714286, "grad_norm": 0.6630571484565735, "kl": 0.50335693359375, "lambda_div_used": 0.6, "learning_rate": 6.527578915497951e-07, "loss": 0.0349, "reward": 0.1387800257652998, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.1387800257652998, "reward_after_std": 0.6716973129659891, "reward_before_mean": 0.6152276992797852, "reward_before_std": 0.5239123981446028, "reward_change_max": 0.0019446909427642822, "reward_change_mean": -0.4764476642012596, "reward_change_min": -0.7161272764205933, "reward_change_std": 0.27618725039064884, "reward_std": 0.6716973222792149, "rewards/cosine_scaled_reward": -0.046552833169698715, "rewards/format_reward": 0.7083333395421505, "step": 242 }, { "advantage_max": 1.5651024878025055, "advantage_mean": -2.6697914767837005e-08, "advantage_min": -0.8467311263084412, "advantage_std": 0.8558754622936249, "completion_length": 2888.9376068115234, "epoch": 0.2777142857142857, "grad_norm": 0.5963757038116455, "kl": 0.505859375, "lambda_div_used": 0.6, "learning_rate": 6.496968239287603e-07, "loss": 0.0682, "reward": 0.2457761913537979, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.2457761913537979, "reward_after_std": 0.8558754920959473, "reward_before_mean": 0.7540382880251855, "reward_before_std": 0.8090751152485609, "reward_change_max": 0.0006944984197616577, "reward_change_mean": -0.5082621518522501, "reward_change_min": -0.9171033464372158, "reward_change_std": 0.3666645511984825, "reward_std": 0.8558755144476891, "rewards/cosine_scaled_reward": 0.11660249065607786, "rewards/format_reward": 0.520833345130086, "step": 243 }, { "advantage_max": 1.2792055383324623, "advantage_mean": 9.934107758624577e-09, "advantage_min": -0.4830026626586914, "advantage_std": 0.663687277585268, "completion_length": 2899.166717529297, "epoch": 0.27885714285714286, "grad_norm": 0.6811074614524841, "kl": 0.5177001953125, "lambda_div_used": 0.6, "learning_rate": 6.466308972251785e-07, "loss": 0.0415, "reward": 0.08414949290454388, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.08414949290454388, "reward_after_std": 0.663687277585268, "reward_before_mean": 0.5329303331673145, "reward_before_std": 0.5415982883423567, "reward_change_max": 0.0, "reward_change_mean": -0.4487808058038354, "reward_change_min": -0.6938077807426453, "reward_change_std": 0.2719251224771142, "reward_std": 0.6636872962117195, "rewards/cosine_scaled_reward": 0.08938181702978909, "rewards/format_reward": 0.3541666679084301, "step": 244 }, { "advantage_max": 1.0939532667398453, "advantage_mean": 5.551115123125783e-17, "advantage_min": -0.47427595779299736, "advantage_std": 0.6055382005870342, "completion_length": 3353.0625610351562, "epoch": 0.28, "grad_norm": 0.5311272740364075, "kl": 0.607177734375, "lambda_div_used": 0.6, "learning_rate": 6.435602608679916e-07, "loss": 0.0564, "reward": -0.3409585952758789, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.3409585952758789, "reward_after_std": 0.605538222938776, "reward_before_mean": -0.10389573313295841, "reward_before_std": 0.6280820369720459, "reward_change_max": 1.564621925354004e-07, "reward_change_mean": -0.2370628654025495, "reward_change_min": -0.6031081043183804, "reward_change_std": 0.2207367429509759, "reward_std": 0.6055382266640663, "rewards/cosine_scaled_reward": -0.17694786563515663, "rewards/format_reward": 0.25000000931322575, "step": 245 }, { "advantage_max": 1.2697142958641052, "advantage_mean": -1.6763806509612067e-08, "advantage_min": -0.5935459956526756, "advantage_std": 0.6914196014404297, "completion_length": 3002.0834045410156, "epoch": 0.28114285714285714, "grad_norm": 0.4564787745475769, "kl": 0.54296875, "lambda_div_used": 0.6, "learning_rate": 6.404850645156841e-07, "loss": 0.0736, "reward": 0.003791339695453644, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.003791339695453644, "reward_after_std": 0.6914196237921715, "reward_before_mean": 0.4101965553127229, "reward_before_std": 0.6638990417122841, "reward_change_max": 0.0008903145790100098, "reward_change_mean": -0.4064051969908178, "reward_change_min": -0.7757539190351963, "reward_change_std": 0.2804739885032177, "reward_std": 0.6914196386933327, "rewards/cosine_scaled_reward": -0.03448507562279701, "rewards/format_reward": 0.47916667349636555, "step": 246 }, { "advantage_max": 1.4313408359885216, "advantage_mean": 9.002785295031401e-09, "advantage_min": -0.6096308752894402, "advantage_std": 0.7550412714481354, "completion_length": 3272.1875610351562, "epoch": 0.2822857142857143, "grad_norm": 0.43419909477233887, "kl": 0.541259765625, "lambda_div_used": 0.6, "learning_rate": 6.374054580489873e-07, "loss": 0.0428, "reward": -0.21265746047720313, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.21265746047720313, "reward_after_std": 0.7550412565469742, "reward_before_mean": 0.052691347897052765, "reward_before_std": 0.7353229857981205, "reward_change_max": 0.001304030418395996, "reward_change_mean": -0.26534880325198174, "reward_change_min": -0.5918116085231304, "reward_change_std": 0.22167464904487133, "reward_std": 0.755041278898716, "rewards/cosine_scaled_reward": -0.17157100839540362, "rewards/format_reward": 0.3958333469927311, "step": 247 }, { "advantage_max": 1.2454994097352028, "advantage_mean": -4.967053435223079e-09, "advantage_min": -0.6300271227955818, "advantage_std": 0.6934917159378529, "completion_length": 2788.979217529297, "epoch": 0.2834285714285714, "grad_norm": 0.46329057216644287, "kl": 0.4290771484375, "lambda_div_used": 0.6, "learning_rate": 6.343215915635761e-07, "loss": 0.0367, "reward": 0.034394118934869766, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.034394118934869766, "reward_after_std": 0.693491704761982, "reward_before_mean": 0.46039974445011467, "reward_before_std": 0.6687365509569645, "reward_change_max": 0.0004130154848098755, "reward_change_mean": -0.4260056307539344, "reward_change_min": -0.7904873788356781, "reward_change_std": 0.3156164027750492, "reward_std": 0.6934917382895947, "rewards/cosine_scaled_reward": 0.021866535767912865, "rewards/format_reward": 0.4166666716337204, "step": 248 }, { "advantage_max": 1.774680495262146, "advantage_mean": -2.359350537162186e-08, "advantage_min": -0.7601935900747776, "advantage_std": 0.9341864809393883, "completion_length": 2269.8750381469727, "epoch": 0.2845714285714286, "grad_norm": 0.7354264855384827, "kl": 0.310791015625, "lambda_div_used": 0.6, "learning_rate": 6.31233615362752e-07, "loss": 0.0632, "reward": 0.21180204581469297, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.21180204581469297, "reward_after_std": 0.9341864800080657, "reward_before_mean": 0.677543967962265, "reward_before_std": 0.8651815243065357, "reward_change_max": 0.0007881596684455872, "reward_change_mean": -0.4657419379800558, "reward_change_min": -0.765058133751154, "reward_change_std": 0.31003841245546937, "reward_std": 0.934186520986259, "rewards/cosine_scaled_reward": 0.047105309553444386, "rewards/format_reward": 0.5833333414047956, "step": 249 }, { "advantage_max": 1.3317992761731148, "advantage_mean": 8.071462387349015e-09, "advantage_min": -0.6089577861130238, "advantage_std": 0.7215257622301579, "completion_length": 2671.8959045410156, "epoch": 0.2857142857142857, "grad_norm": 0.6826204061508179, "kl": 0.4267578125, "lambda_div_used": 0.6, "learning_rate": 6.281416799501187e-07, "loss": 0.0164, "reward": -0.07462250138632953, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.07462250138632953, "reward_after_std": 0.7215257547795773, "reward_before_mean": 0.27664079144597054, "reward_before_std": 0.7012166492640972, "reward_change_max": 0.0021498724818229675, "reward_change_mean": -0.3512632828205824, "reward_change_min": -0.7547120712697506, "reward_change_std": 0.27203070372343063, "reward_std": 0.7215257957577705, "rewards/cosine_scaled_reward": -0.17417962139006704, "rewards/format_reward": 0.6250000074505806, "step": 250 }, { "advantage_max": 1.5387679040431976, "advantage_mean": -9.93410786964688e-09, "advantage_min": -0.7006419822573662, "advantage_std": 0.8206590004265308, "completion_length": 2208.729202270508, "epoch": 0.28685714285714287, "grad_norm": 0.6821022033691406, "kl": 0.33001708984375, "lambda_div_used": 0.6, "learning_rate": 6.25045936022246e-07, "loss": 0.0084, "reward": 0.1401051990687847, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.1401051990687847, "reward_after_std": 0.8206590078771114, "reward_before_mean": 0.5900087499758229, "reward_before_std": 0.7465699054300785, "reward_change_max": 0.0010773837566375732, "reward_change_mean": -0.4499035747721791, "reward_change_min": -0.8058441616594791, "reward_change_std": 0.3140787845477462, "reward_std": 0.820659015327692, "rewards/cosine_scaled_reward": -0.027912288904190063, "rewards/format_reward": 0.6458333395421505, "step": 251 }, { "advantage_max": 1.1559683978557587, "advantage_mean": -1.2417644135176431e-09, "advantage_min": -0.5106577202677727, "advantage_std": 0.6102561987936497, "completion_length": 3031.6041870117188, "epoch": 0.288, "grad_norm": 0.283564954996109, "kl": 0.33856201171875, "lambda_div_used": 0.6, "learning_rate": 6.219465344613258e-07, "loss": 0.0426, "reward": -0.11627742386190221, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.11627742386190221, "reward_after_std": 0.6102561987936497, "reward_before_mean": 0.23478730767965317, "reward_before_std": 0.5429081320762634, "reward_change_max": 0.000623457133769989, "reward_change_mean": -0.35106473602354527, "reward_change_min": -0.6218222491443157, "reward_change_std": 0.2379716858267784, "reward_std": 0.6102562211453915, "rewards/cosine_scaled_reward": -0.09093968477100134, "rewards/format_reward": 0.41666667349636555, "step": 252 }, { "advantage_max": 1.329535834491253, "advantage_mean": -1.6763806454100916e-08, "advantage_min": -0.8195891305804253, "advantage_std": 0.7661233134567738, "completion_length": 2583.166732788086, "epoch": 0.28914285714285715, "grad_norm": 0.5934932231903076, "kl": 0.27423095703125, "lambda_div_used": 0.6, "learning_rate": 6.188436263278172e-07, "loss": 0.0548, "reward": 0.16420308127999306, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.16420308127999306, "reward_after_std": 0.7661233209073544, "reward_before_mean": 0.652702329447493, "reward_before_std": 0.7853073924779892, "reward_change_max": 0.0004984140396118164, "reward_change_mean": -0.4884992679581046, "reward_change_min": -0.8899687603116035, "reward_change_std": 0.35754732973873615, "reward_std": 0.7661233320832253, "rewards/cosine_scaled_reward": -0.017398834694176912, "rewards/format_reward": 0.6875000111758709, "step": 253 }, { "advantage_max": 1.16770701110363, "advantage_mean": 1.6142925107764938e-08, "advantage_min": -0.5569769516587257, "advantage_std": 0.6323312036693096, "completion_length": 3085.729217529297, "epoch": 0.29028571428571426, "grad_norm": 0.36022013425827026, "kl": 0.3563232421875, "lambda_div_used": 0.6, "learning_rate": 6.157373628530852e-07, "loss": 0.0417, "reward": -0.24699397385120392, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.24699397385120392, "reward_after_std": 0.6323312278836966, "reward_before_mean": 0.030835852958261967, "reward_before_std": 0.621703639626503, "reward_change_max": 0.003024943172931671, "reward_change_mean": -0.2778298296034336, "reward_change_min": -0.5625698752701283, "reward_change_std": 0.2203113967552781, "reward_std": 0.6323312316089869, "rewards/cosine_scaled_reward": -0.16166540794074535, "rewards/format_reward": 0.35416667349636555, "step": 254 }, { "advantage_max": 1.2296145930886269, "advantage_mean": -6.208814573582799e-10, "advantage_min": -0.6068962588906288, "advantage_std": 0.6709359288215637, "completion_length": 3051.7083892822266, "epoch": 0.2914285714285714, "grad_norm": 0.3220326006412506, "kl": 0.35479736328125, "lambda_div_used": 0.6, "learning_rate": 6.126278954320294e-07, "loss": 0.0296, "reward": -0.29262065328657627, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.29262065328657627, "reward_after_std": 0.6709359437227249, "reward_before_mean": -0.0455952831543982, "reward_before_std": 0.6920541971921921, "reward_change_max": 0.0018450617790222168, "reward_change_mean": -0.24702537804841995, "reward_change_min": -0.5441051162779331, "reward_change_std": 0.21536783035844564, "reward_std": 0.6709359511733055, "rewards/cosine_scaled_reward": -0.16863098926842213, "rewards/format_reward": 0.2916666716337204, "step": 255 }, { "advantage_max": 1.2679320387542248, "advantage_mean": 1.7384688910659918e-08, "advantage_min": -0.7408056110143661, "advantage_std": 0.7108086459338665, "completion_length": 3014.812530517578, "epoch": 0.2925714285714286, "grad_norm": 0.47123652696609497, "kl": 0.3238525390625, "lambda_div_used": 0.6, "learning_rate": 6.095153756157051e-07, "loss": 0.0183, "reward": -0.056694136932492256, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.056694136932492256, "reward_after_std": 0.710808627307415, "reward_before_mean": 0.31453731283545494, "reward_before_std": 0.7238197699189186, "reward_change_max": 0.0021554455161094666, "reward_change_mean": -0.3712314344011247, "reward_change_min": -0.6968014165759087, "reward_change_std": 0.28223771415650845, "reward_std": 0.7108086608350277, "rewards/cosine_scaled_reward": -0.06148135010153055, "rewards/format_reward": 0.43750002048909664, "step": 256 }, { "advantage_max": 1.7293520271778107, "advantage_mean": -1.4280279847511679e-08, "advantage_min": -0.8434292376041412, "advantage_std": 0.9644281379878521, "completion_length": 3112.6875610351562, "epoch": 0.2937142857142857, "grad_norm": 1.1420844793319702, "kl": 0.31341552734375, "lambda_div_used": 0.6, "learning_rate": 6.06399955103937e-07, "loss": 0.0405, "reward": 0.11980674788355827, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.11980674788355827, "reward_after_std": 0.9644281603395939, "reward_before_mean": 0.541266412474215, "reward_before_std": 0.9871671870350838, "reward_change_max": 0.0004280358552932739, "reward_change_mean": -0.4214596524834633, "reward_change_min": -0.8742838054895401, "reward_change_std": 0.36773448437452316, "reward_std": 0.9644281938672066, "rewards/cosine_scaled_reward": 0.041466531343758106, "rewards/format_reward": 0.4583333469927311, "step": 257 }, { "advantage_max": 1.308477059006691, "advantage_mean": -8.071462664904772e-09, "advantage_min": -0.6776211000978947, "advantage_std": 0.7201198823750019, "completion_length": 3318.041717529297, "epoch": 0.2948571428571429, "grad_norm": 0.5514835715293884, "kl": 0.3533935546875, "lambda_div_used": 0.6, "learning_rate": 6.032817857379256e-07, "loss": 0.0511, "reward": -0.1533919759094715, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.1533919759094715, "reward_after_std": 0.7201199196279049, "reward_before_mean": 0.16029761778190732, "reward_before_std": 0.7385069392621517, "reward_change_max": 0.0006575882434844971, "reward_change_mean": -0.31368961185216904, "reward_change_min": -0.6525617055594921, "reward_change_std": 0.2573896599933505, "reward_std": 0.7201199345290661, "rewards/cosine_scaled_reward": -0.09693453786894679, "rewards/format_reward": 0.354166679084301, "step": 258 }, { "advantage_max": 1.3717726543545723, "advantage_mean": -1.6142924996742636e-08, "advantage_min": -0.6455200538039207, "advantage_std": 0.7451611235737801, "completion_length": 2742.0833892822266, "epoch": 0.296, "grad_norm": 0.6136261224746704, "kl": 0.32659912109375, "lambda_div_used": 0.6, "learning_rate": 6.001610194928464e-07, "loss": 0.049, "reward": 0.19095646031200886, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.19095646031200886, "reward_after_std": 0.7451611235737801, "reward_before_mean": 0.6913169464096427, "reward_before_std": 0.6776915155351162, "reward_change_max": 0.0005735307931900024, "reward_change_mean": -0.500360487960279, "reward_change_min": -0.9127090983092785, "reward_change_std": 0.34858130011707544, "reward_std": 0.7451611422002316, "rewards/cosine_scaled_reward": 0.06440844899043441, "rewards/format_reward": 0.5625000074505806, "step": 259 }, { "advantage_max": 1.6590567827224731, "advantage_mean": 7.45058070794613e-09, "advantage_min": -0.7553113959729671, "advantage_std": 0.9088802076876163, "completion_length": 2397.645881652832, "epoch": 0.29714285714285715, "grad_norm": 0.7651255130767822, "kl": 0.28924560546875, "lambda_div_used": 0.6, "learning_rate": 5.97037808470444e-07, "loss": 0.0515, "reward": 0.3306399695575237, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.3306399695575237, "reward_after_std": 0.908880215138197, "reward_before_mean": 0.8759937360882759, "reward_before_std": 0.861339095979929, "reward_change_max": 0.0006400644779205322, "reward_change_mean": -0.545353771187365, "reward_change_min": -1.0057422816753387, "reward_change_std": 0.39418873470276594, "reward_std": 0.9088802374899387, "rewards/cosine_scaled_reward": 0.1567468661814928, "rewards/format_reward": 0.5625000037252903, "step": 260 }, { "advantage_max": 1.0520955845713615, "advantage_mean": 1.490116219304838e-08, "advantage_min": -0.5371618010103703, "advantage_std": 0.5718340016901493, "completion_length": 3145.166717529297, "epoch": 0.29828571428571427, "grad_norm": 0.48807358741760254, "kl": 0.3935546875, "lambda_div_used": 0.6, "learning_rate": 5.939123048916173e-07, "loss": 0.0241, "reward": -0.15376926213502884, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.15376926213502884, "reward_after_std": 0.5718340091407299, "reward_before_mean": 0.1892886906862259, "reward_before_std": 0.5439359992742538, "reward_change_max": 0.000623844563961029, "reward_change_mean": -0.3430579248815775, "reward_change_min": -0.5786691717803478, "reward_change_std": 0.2355566336773336, "reward_std": 0.5718340240418911, "rewards/cosine_scaled_reward": -0.1032723356038332, "rewards/format_reward": 0.3958333358168602, "step": 261 }, { "advantage_max": 1.3386315703392029, "advantage_mean": 1.1796753240922442e-08, "advantage_min": -0.6328083500266075, "advantage_std": 0.7234783992171288, "completion_length": 3038.437545776367, "epoch": 0.29942857142857143, "grad_norm": 0.9701263904571533, "kl": 0.442626953125, "lambda_div_used": 0.6, "learning_rate": 5.907846610890011e-07, "loss": 0.096, "reward": -0.25120984390378, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.25120984390378, "reward_after_std": 0.7234784215688705, "reward_before_mean": 0.006802541669458151, "reward_before_std": 0.7268307991325855, "reward_change_max": 0.0003486126661300659, "reward_change_mean": -0.2580123767256737, "reward_change_min": -0.5682838223874569, "reward_change_std": 0.23459640704095364, "reward_std": 0.7234784476459026, "rewards/cosine_scaled_reward": -0.2257654066197574, "rewards/format_reward": 0.45833333767950535, "step": 262 }, { "advantage_max": 1.17459299787879, "advantage_mean": -1.2417633588057697e-09, "advantage_min": -0.5453278385102749, "advantage_std": 0.6335834860801697, "completion_length": 2903.5625534057617, "epoch": 0.30057142857142854, "grad_norm": 0.4829193949699402, "kl": 0.52587890625, "lambda_div_used": 0.6, "learning_rate": 5.87655029499542e-07, "loss": 0.0646, "reward": -0.29326684278203174, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.29326684278203174, "reward_after_std": 0.6335834860801697, "reward_before_mean": -0.041429828852415085, "reward_before_std": 0.6316493209451437, "reward_change_max": 0.00026198476552963257, "reward_change_mean": -0.2518370160833001, "reward_change_min": -0.5603325888514519, "reward_change_std": 0.21581022161990404, "reward_std": 0.6335835084319115, "rewards/cosine_scaled_reward": -0.18738158675841987, "rewards/format_reward": 0.33333333767950535, "step": 263 }, { "advantage_max": 1.300993226468563, "advantage_mean": 4.346172144398253e-09, "advantage_min": -0.6545535698533058, "advantage_std": 0.724092248827219, "completion_length": 3088.1250610351562, "epoch": 0.3017142857142857, "grad_norm": 0.46063876152038574, "kl": 0.520751953125, "lambda_div_used": 0.6, "learning_rate": 5.845235626570683e-07, "loss": 0.0537, "reward": -0.05797311244532466, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.05797311244532466, "reward_after_std": 0.7240922451019287, "reward_before_mean": 0.31046564131975174, "reward_before_std": 0.7186985239386559, "reward_change_max": 0.0, "reward_change_mean": -0.3684387691318989, "reward_change_min": -0.723139800131321, "reward_change_std": 0.2936291787773371, "reward_std": 0.724092248827219, "rewards/cosine_scaled_reward": -0.13643384957686067, "rewards/format_reward": 0.5833333469927311, "step": 264 }, { "advantage_max": 1.6109931096434593, "advantage_mean": -1.552204331733975e-08, "advantage_min": -0.7705529257655144, "advantage_std": 0.86626897752285, "completion_length": 2894.708450317383, "epoch": 0.3028571428571429, "grad_norm": 0.9522861838340759, "kl": 0.5997314453125, "lambda_div_used": 0.6, "learning_rate": 5.813904131848564e-07, "loss": 0.0947, "reward": 0.16067335568368435, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.16067335568368435, "reward_after_std": 0.86626897752285, "reward_before_mean": 0.6178323356434703, "reward_before_std": 0.8223806396126747, "reward_change_max": 0.0, "reward_change_mean": -0.4571589883416891, "reward_change_min": -0.8531933054327965, "reward_change_std": 0.3247545287013054, "reward_std": 0.8662689924240112, "rewards/cosine_scaled_reward": 0.027666167356073856, "rewards/format_reward": 0.5625000186264515, "step": 265 }, { "advantage_max": 1.2452225089073181, "advantage_mean": 1.1175871117430347e-08, "advantage_min": -0.6117013469338417, "advantage_std": 0.6951426491141319, "completion_length": 3169.5833892822266, "epoch": 0.304, "grad_norm": 1.1230192184448242, "kl": 0.6510009765625, "lambda_div_used": 0.6, "learning_rate": 5.78255733788191e-07, "loss": 0.0369, "reward": -0.2725262697786093, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.2725262697786093, "reward_after_std": 0.6951426453888416, "reward_before_mean": -0.015931952744722366, "reward_before_std": 0.7379988580942154, "reward_change_max": 0.0018127188086509705, "reward_change_mean": -0.25659431144595146, "reward_change_min": -0.6191375590860844, "reward_change_std": 0.25174527056515217, "reward_std": 0.6951426677405834, "rewards/cosine_scaled_reward": -0.22671598196029663, "rewards/format_reward": 0.43750000558793545, "step": 266 }, { "advantage_max": 0.9989327527582645, "advantage_mean": 6.208818459363386e-10, "advantage_min": -0.480878084897995, "advantage_std": 0.5396589785814285, "completion_length": 3455.2500610351562, "epoch": 0.30514285714285716, "grad_norm": 0.9501419067382812, "kl": 0.77880859375, "lambda_div_used": 0.6, "learning_rate": 5.751196772469237e-07, "loss": 0.0558, "reward": -0.4211684428155422, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.4211684428155422, "reward_after_std": 0.5396589823067188, "reward_before_mean": -0.21912843873724341, "reward_before_std": 0.5446715801954269, "reward_change_max": 0.0012646690011024475, "reward_change_mean": -0.2020400082692504, "reward_change_min": -0.45151586830616, "reward_change_std": 0.17916888557374477, "reward_std": 0.5396590121090412, "rewards/cosine_scaled_reward": -0.23456422612071037, "rewards/format_reward": 0.2500000037252903, "step": 267 }, { "advantage_max": 1.4540027901530266, "advantage_mean": 1.2728075704515618e-08, "advantage_min": -0.6779943779110909, "advantage_std": 0.7723269909620285, "completion_length": 2636.6458740234375, "epoch": 0.3062857142857143, "grad_norm": 0.7164831757545471, "kl": 0.58544921875, "lambda_div_used": 0.6, "learning_rate": 5.71982396408026e-07, "loss": 0.0633, "reward": -0.14749359339475632, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.14749359339475632, "reward_after_std": 0.7723269797861576, "reward_before_mean": 0.15347140841186047, "reward_before_std": 0.7567279189825058, "reward_change_max": 0.0001808255910873413, "reward_change_mean": -0.30096501484513283, "reward_change_min": -0.5972777009010315, "reward_change_std": 0.23437961423769593, "reward_std": 0.7723270021378994, "rewards/cosine_scaled_reward": -0.16284763207659125, "rewards/format_reward": 0.47916668467223644, "step": 268 }, { "advantage_max": 1.5683661699295044, "advantage_mean": -1.117587122845265e-08, "advantage_min": -0.6702335737645626, "advantage_std": 0.8368349559605122, "completion_length": 2962.2709197998047, "epoch": 0.30742857142857144, "grad_norm": 0.7401025295257568, "kl": 0.635498046875, "lambda_div_used": 0.6, "learning_rate": 5.688440441781398e-07, "loss": 0.079, "reward": -0.021501684561371803, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.021501684561371803, "reward_after_std": 0.8368349559605122, "reward_before_mean": 0.3379681808874011, "reward_before_std": 0.8070851732045412, "reward_change_max": 0.0007351338863372803, "reward_change_mean": -0.359469898045063, "reward_change_min": -0.7419305182993412, "reward_change_std": 0.2838026713579893, "reward_std": 0.8368349634110928, "rewards/cosine_scaled_reward": -0.1435159114189446, "rewards/format_reward": 0.6250000074505806, "step": 269 }, { "advantage_max": 1.7115082815289497, "advantage_mean": 2.483527050678447e-09, "advantage_min": -0.7837255597114563, "advantage_std": 0.9412933625280857, "completion_length": 2904.6459045410156, "epoch": 0.30857142857142855, "grad_norm": 0.807169497013092, "kl": 0.6158447265625, "lambda_div_used": 0.6, "learning_rate": 5.657047735161255e-07, "loss": 0.0492, "reward": 0.009002413600683212, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.009002413600683212, "reward_after_std": 0.9412933364510536, "reward_before_mean": 0.3679821668192744, "reward_before_std": 0.9656250663101673, "reward_change_max": 0.0015122592449188232, "reward_change_mean": -0.35897977463901043, "reward_change_min": -0.8852769210934639, "reward_change_std": 0.3282699631527066, "reward_std": 0.9412933550775051, "rewards/cosine_scaled_reward": -0.07642558356747031, "rewards/format_reward": 0.5208333414047956, "step": 270 }, { "advantage_max": 1.5734351687133312, "advantage_mean": -2.2972623636707823e-08, "advantage_min": -0.7996720634400845, "advantage_std": 0.8608956038951874, "completion_length": 2784.729217529297, "epoch": 0.3097142857142857, "grad_norm": 1.0408498048782349, "kl": 0.579833984375, "lambda_div_used": 0.6, "learning_rate": 5.625647374256061e-07, "loss": 0.0876, "reward": 0.2030478809028864, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.2030478809028864, "reward_after_std": 0.8608955889940262, "reward_before_mean": 0.6846325844526291, "reward_before_std": 0.835495188832283, "reward_change_max": 0.0, "reward_change_mean": -0.48158473148941994, "reward_change_min": -0.8558024764060974, "reward_change_std": 0.33973479084670544, "reward_std": 0.8608956038951874, "rewards/cosine_scaled_reward": 0.019399608485400677, "rewards/format_reward": 0.6458333432674408, "step": 271 }, { "advantage_max": 1.3714451864361763, "advantage_mean": 1.2417633588057697e-09, "advantage_min": -0.7535846754908562, "advantage_std": 0.7635341472923756, "completion_length": 3308.250030517578, "epoch": 0.31085714285714283, "grad_norm": 0.6424413323402405, "kl": 0.76318359375, "lambda_div_used": 0.6, "learning_rate": 5.594240889475106e-07, "loss": 0.0771, "reward": -0.20335895102471113, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.20335895102471113, "reward_after_std": 0.7635341547429562, "reward_before_mean": 0.07921543845441192, "reward_before_std": 0.8059300296008587, "reward_change_max": 0.0013331100344657898, "reward_change_mean": -0.28257441613823175, "reward_change_min": -0.6223890446126461, "reward_change_std": 0.2657231818884611, "reward_std": 0.7635341696441174, "rewards/cosine_scaled_reward": -0.14789227582514286, "rewards/format_reward": 0.3750000111758709, "step": 272 }, { "advantage_max": 1.3589064106345177, "advantage_mean": 4.346172699509765e-09, "advantage_min": -0.7836330458521843, "advantage_std": 0.7793318666517735, "completion_length": 3080.5209045410156, "epoch": 0.312, "grad_norm": 0.7329338192939758, "kl": 0.748291015625, "lambda_div_used": 0.6, "learning_rate": 5.562829811526154e-07, "loss": 0.0901, "reward": -0.043059684336185455, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.043059684336185455, "reward_after_std": 0.7793318778276443, "reward_before_mean": 0.328045935370028, "reward_before_std": 0.8345182836055756, "reward_change_max": 0.0015919134020805359, "reward_change_mean": -0.3711056038737297, "reward_change_min": -0.7348584309220314, "reward_change_std": 0.31267216615378857, "reward_std": 0.7793319001793861, "rewards/cosine_scaled_reward": -0.07556037977337837, "rewards/format_reward": 0.47916667722165585, "step": 273 }, { "advantage_max": 1.5300347954034805, "advantage_mean": -2.0489097418696645e-08, "advantage_min": -0.8000783994793892, "advantage_std": 0.8445150479674339, "completion_length": 2011.8333587646484, "epoch": 0.31314285714285717, "grad_norm": 1.2785873413085938, "kl": 0.658935546875, "lambda_div_used": 0.6, "learning_rate": 5.531415671340826e-07, "loss": -0.0116, "reward": 0.510652432218194, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.510652432218194, "reward_after_std": 0.8445150516927242, "reward_before_mean": 1.165109277702868, "reward_before_std": 0.757109772413969, "reward_change_max": 0.000974707305431366, "reward_change_mean": -0.6544568724930286, "reward_change_min": -1.0643592663109303, "reward_change_std": 0.4129680562764406, "reward_std": 0.8445150889456272, "rewards/cosine_scaled_reward": 0.21797129698097706, "rewards/format_reward": 0.7291666679084301, "step": 274 }, { "advantage_max": 1.4885611981153488, "advantage_mean": -9.934107647602275e-09, "advantage_min": -0.7650426514446735, "advantage_std": 0.8269946090877056, "completion_length": 2577.875030517578, "epoch": 0.3142857142857143, "grad_norm": 0.6695863604545593, "kl": 0.6025390625, "lambda_div_used": 0.6, "learning_rate": 5.5e-07, "loss": 0.0346, "reward": 0.1038482214207761, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.1038482214207761, "reward_after_std": 0.8269945941865444, "reward_before_mean": 0.5431328900158405, "reward_before_std": 0.8103073462843895, "reward_change_max": 0.0015599504113197327, "reward_change_mean": -0.4392846738919616, "reward_change_min": -0.8281485103070736, "reward_change_std": 0.3419040869921446, "reward_std": 0.8269945941865444, "rewards/cosine_scaled_reward": -0.009683551266789436, "rewards/format_reward": 0.5625000093132257, "step": 275 }, { "advantage_max": 1.7513000518083572, "advantage_mean": -1.8626451658843024e-08, "advantage_min": -0.7792819105088711, "advantage_std": 0.9374289289116859, "completion_length": 2389.416717529297, "epoch": 0.31542857142857145, "grad_norm": 0.5462023019790649, "kl": 0.61199951171875, "lambda_div_used": 0.6, "learning_rate": 5.468584328659172e-07, "loss": 0.0499, "reward": 0.23297729715704918, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.23297729715704918, "reward_after_std": 0.9374288991093636, "reward_before_mean": 0.7080944953486323, "reward_before_std": 0.8812166787683964, "reward_change_max": 0.0, "reward_change_mean": -0.4751171786338091, "reward_change_min": -0.9459218531847, "reward_change_std": 0.3399658240377903, "reward_std": 0.9374289140105247, "rewards/cosine_scaled_reward": -0.00011943886056542397, "rewards/format_reward": 0.7083333488553762, "step": 276 }, { "advantage_max": 1.5042954310774803, "advantage_mean": 1.1796753018877837e-08, "advantage_min": -0.6724752858281136, "advantage_std": 0.8150804676115513, "completion_length": 2284.187568664551, "epoch": 0.31657142857142856, "grad_norm": 0.8358162045478821, "kl": 0.46759033203125, "lambda_div_used": 0.6, "learning_rate": 5.437170188473847e-07, "loss": 0.0198, "reward": 0.021968248765915632, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.021968248765915632, "reward_after_std": 0.8150804601609707, "reward_before_mean": 0.4101370070129633, "reward_before_std": 0.7939338013529778, "reward_change_max": 0.0005772784352302551, "reward_change_mean": -0.3881687559187412, "reward_change_min": -0.8049648813903332, "reward_change_std": 0.29894091188907623, "reward_std": 0.8150804750621319, "rewards/cosine_scaled_reward": -0.11784817464649677, "rewards/format_reward": 0.6458333376795053, "step": 277 }, { "advantage_max": 1.0878044962882996, "advantage_mean": 3.570070017389071e-09, "advantage_min": -0.5774106942117214, "advantage_std": 0.5889769718050957, "completion_length": 2305.0000915527344, "epoch": 0.3177142857142857, "grad_norm": 0.7688349485397339, "kl": 0.60107421875, "lambda_div_used": 0.6, "learning_rate": 5.405759110524894e-07, "loss": 0.0413, "reward": 0.218816798646003, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.218816798646003, "reward_after_std": 0.5889769680798054, "reward_before_mean": 0.7605785094201565, "reward_before_std": 0.4702807143330574, "reward_change_max": 0.0, "reward_change_mean": -0.5417617131024599, "reward_change_min": -0.8437193483114243, "reward_change_std": 0.3238454647362232, "reward_std": 0.588976975530386, "rewards/cosine_scaled_reward": 0.015705913305282593, "rewards/format_reward": 0.7291666734963655, "step": 278 }, { "advantage_max": 1.6663006246089935, "advantage_mean": 4.967053879312289e-09, "advantage_min": -0.7244021371006966, "advantage_std": 0.8816425837576389, "completion_length": 3248.0625915527344, "epoch": 0.31885714285714284, "grad_norm": 0.9396555423736572, "kl": 0.61865234375, "lambda_div_used": 0.6, "learning_rate": 5.37435262574394e-07, "loss": 0.0357, "reward": -0.010367986280471087, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.010367986280471087, "reward_after_std": 0.8816425688564777, "reward_before_mean": 0.3451116308569908, "reward_before_std": 0.8489609006792307, "reward_change_max": 0.0029679909348487854, "reward_change_mean": -0.3554796166718006, "reward_change_min": -0.7218036316335201, "reward_change_std": 0.28672708943486214, "reward_std": 0.8816425986588001, "rewards/cosine_scaled_reward": -0.13994419388473034, "rewards/format_reward": 0.6250000074505806, "step": 279 }, { "advantage_max": 1.741914540529251, "advantage_mean": -6.208817571184966e-09, "advantage_min": -0.8550667315721512, "advantage_std": 0.9599592238664627, "completion_length": 2562.9792442321777, "epoch": 0.32, "grad_norm": 0.6332762837409973, "kl": 0.53643798828125, "lambda_div_used": 0.6, "learning_rate": 5.342952264838747e-07, "loss": 0.0627, "reward": 0.3203533738851547, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.3203533738851547, "reward_after_std": 0.9599592164158821, "reward_before_mean": 0.8485358407488093, "reward_before_std": 0.9343424178659916, "reward_change_max": 0.0003201141953468323, "reward_change_mean": -0.5281824506819248, "reward_change_min": -0.9736247472465038, "reward_change_std": 0.38505643233656883, "reward_std": 0.9599592462182045, "rewards/cosine_scaled_reward": 0.10135123133659363, "rewards/format_reward": 0.6458333414047956, "step": 280 }, { "advantage_max": 0.9071239531040192, "advantage_mean": 1.1796752907855534e-08, "advantage_min": -0.46878183260560036, "advantage_std": 0.4929075203835964, "completion_length": 3433.9791870117188, "epoch": 0.3211428571428571, "grad_norm": 0.635045051574707, "kl": 0.5234375, "lambda_div_used": 0.6, "learning_rate": 5.311559558218603e-07, "loss": 0.0368, "reward": -0.4045607140287757, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.4045607140287757, "reward_after_std": 0.492907527834177, "reward_before_mean": -0.18301626667380333, "reward_before_std": 0.491706857457757, "reward_change_max": 0.0007172971963882446, "reward_change_mean": -0.2215444464236498, "reward_change_min": -0.4169187992811203, "reward_change_std": 0.17256073001772165, "reward_std": 0.4929075501859188, "rewards/cosine_scaled_reward": -0.19567479752004147, "rewards/format_reward": 0.20833334140479565, "step": 281 }, { "advantage_max": 1.310217224061489, "advantage_mean": -2.4835267176115394e-09, "advantage_min": -0.6598837561905384, "advantage_std": 0.7153121419250965, "completion_length": 2712.916732788086, "epoch": 0.3222857142857143, "grad_norm": 0.42539331316947937, "kl": 0.4349365234375, "lambda_div_used": 0.6, "learning_rate": 5.28017603591974e-07, "loss": 0.0524, "reward": 0.23194605857133865, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.23194605857133865, "reward_after_std": 0.7153121344745159, "reward_before_mean": 0.7622983139008284, "reward_before_std": 0.6276755481958389, "reward_change_max": 0.0, "reward_change_mean": -0.530352272093296, "reward_change_min": -0.9107529036700726, "reward_change_std": 0.3500578925013542, "reward_std": 0.7153121344745159, "rewards/cosine_scaled_reward": 0.026982491835951805, "rewards/format_reward": 0.7083333488553762, "step": 282 }, { "advantage_max": 1.7362473383545876, "advantage_mean": -2.4835267176115394e-09, "advantage_min": -0.780534915626049, "advantage_std": 0.9419512934982777, "completion_length": 3040.5833740234375, "epoch": 0.32342857142857145, "grad_norm": 1.8998653888702393, "kl": 0.46240234375, "lambda_div_used": 0.6, "learning_rate": 5.248803227530763e-07, "loss": 0.1153, "reward": -0.013534542173147202, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.013534542173147202, "reward_after_std": 0.9419512934982777, "reward_before_mean": 0.330111525952816, "reward_before_std": 0.9573567919433117, "reward_change_max": 0.0005117952823638916, "reward_change_mean": -0.34364606719464064, "reward_change_min": -0.7443192265927792, "reward_change_std": 0.2993211802095175, "reward_std": 0.9419513083994389, "rewards/cosine_scaled_reward": -0.043277584947645664, "rewards/format_reward": 0.4166666679084301, "step": 283 }, { "advantage_max": 1.6517476364970207, "advantage_mean": -5.587935614226325e-09, "advantage_min": -0.739282701164484, "advantage_std": 0.8730613552033901, "completion_length": 2619.3125915527344, "epoch": 0.32457142857142857, "grad_norm": 0.5986530184745789, "kl": 0.50341796875, "lambda_div_used": 0.6, "learning_rate": 5.21744266211809e-07, "loss": 0.0233, "reward": 0.06523705890867859, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.06523705890867859, "reward_after_std": 0.8730613254010677, "reward_before_mean": 0.46089828852564096, "reward_before_std": 0.8123955279588699, "reward_change_max": 0.0, "reward_change_mean": -0.39566121622920036, "reward_change_min": -0.7429598942399025, "reward_change_std": 0.28500969521701336, "reward_std": 0.8730613365769386, "rewards/cosine_scaled_reward": -0.11330086522502825, "rewards/format_reward": 0.6875000111758709, "step": 284 }, { "advantage_max": 1.032178670167923, "advantage_mean": 6.829699417121304e-09, "advantage_min": -0.5508750528097153, "advantage_std": 0.5619474165141582, "completion_length": 2122.6458587646484, "epoch": 0.32571428571428573, "grad_norm": 0.3667513132095337, "kl": 0.301513671875, "lambda_div_used": 0.6, "learning_rate": 5.186095868151436e-07, "loss": 0.0112, "reward": -0.03984304657205939, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.03984304657205939, "reward_after_std": 0.5619474314153194, "reward_before_mean": 0.36875259317457676, "reward_before_std": 0.5008511012420058, "reward_change_max": 0.0, "reward_change_mean": -0.40859564114362, "reward_change_min": -0.6677859574556351, "reward_change_std": 0.2631244119256735, "reward_std": 0.5619474649429321, "rewards/cosine_scaled_reward": -0.16979038482531905, "rewards/format_reward": 0.7083333414047956, "step": 285 }, { "advantage_max": 1.6204382330179214, "advantage_mean": 1.552204503818544e-09, "advantage_min": -0.7265680469572544, "advantage_std": 0.8686433807015419, "completion_length": 2619.8333587646484, "epoch": 0.32685714285714285, "grad_norm": 1.1987253427505493, "kl": 0.50897216796875, "lambda_div_used": 0.6, "learning_rate": 5.154764373429315e-07, "loss": 0.0563, "reward": -0.004736738046631217, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.004736738046631217, "reward_after_std": 0.8686433918774128, "reward_before_mean": 0.3524339944124222, "reward_before_std": 0.8589610904455185, "reward_change_max": 0.0024899691343307495, "reward_change_mean": -0.3571706861257553, "reward_change_min": -0.7682550400495529, "reward_change_std": 0.29713574796915054, "reward_std": 0.8686434105038643, "rewards/cosine_scaled_reward": -0.08419968781527132, "rewards/format_reward": 0.5208333414047956, "step": 286 }, { "advantage_max": 1.2582305893301964, "advantage_mean": -5.277494496969126e-09, "advantage_min": -0.5760709270834923, "advantage_std": 0.6693591959774494, "completion_length": 2054.291717529297, "epoch": 0.328, "grad_norm": 1.019945740699768, "kl": 0.637664794921875, "lambda_div_used": 0.6, "learning_rate": 5.123449705004581e-07, "loss": 0.004, "reward": 0.24294462392572314, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.24294462392572314, "reward_after_std": 0.6693591997027397, "reward_before_mean": 0.7795294325333089, "reward_before_std": 0.5335474237799644, "reward_change_max": 0.0, "reward_change_mean": -0.536584809422493, "reward_change_min": -0.8521362952888012, "reward_change_std": 0.32288376055657864, "reward_std": 0.6693592220544815, "rewards/cosine_scaled_reward": 0.05643137916922569, "rewards/format_reward": 0.6666666716337204, "step": 287 }, { "advantage_max": 1.6867050975561142, "advantage_mean": -1.738468857759301e-08, "advantage_min": -0.636960681527853, "advantage_std": 0.8619322814047337, "completion_length": 2832.062530517578, "epoch": 0.3291428571428571, "grad_norm": 0.4221585690975189, "kl": 0.447998046875, "lambda_div_used": 0.6, "learning_rate": 5.09215338910999e-07, "loss": 0.0462, "reward": 0.07961262296885252, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.07961262296885252, "reward_after_std": 0.8619322963058949, "reward_before_mean": 0.4838328785263002, "reward_before_std": 0.7499809451401234, "reward_change_max": 0.0010158568620681763, "reward_change_mean": -0.40422030352056026, "reward_change_min": -0.6797929219901562, "reward_change_std": 0.26082053780555725, "reward_std": 0.8619323261082172, "rewards/cosine_scaled_reward": -0.06016687932424247, "rewards/format_reward": 0.6041666828095913, "step": 288 }, { "advantage_max": 1.3328250646591187, "advantage_mean": 1.179675274132208e-08, "advantage_min": -0.6627430729568005, "advantage_std": 0.7312614023685455, "completion_length": 2030.6042098999023, "epoch": 0.3302857142857143, "grad_norm": 1.2222105264663696, "kl": 0.337738037109375, "lambda_div_used": 0.6, "learning_rate": 5.060876951083828e-07, "loss": -0.0151, "reward": 0.18689546827226877, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.18689546827226877, "reward_after_std": 0.7312614023685455, "reward_before_mean": 0.6872250400483608, "reward_before_std": 0.6786364018917084, "reward_change_max": 0.0, "reward_change_mean": -0.5003295410424471, "reward_change_min": -0.8750745318830013, "reward_change_std": 0.3380218371748924, "reward_std": 0.7312614247202873, "rewards/cosine_scaled_reward": 0.02069583162665367, "rewards/format_reward": 0.6458333376795053, "step": 289 }, { "advantage_max": 1.6091022714972496, "advantage_mean": 4.346171977864799e-09, "advantage_min": -0.7964055761694908, "advantage_std": 0.8811142966151237, "completion_length": 2952.041778564453, "epoch": 0.3314285714285714, "grad_norm": 1.1612440347671509, "kl": 0.669189453125, "lambda_div_used": 0.6, "learning_rate": 5.02962191529556e-07, "loss": 0.0857, "reward": -0.076158725656569, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.076158725656569, "reward_after_std": 0.8811142891645432, "reward_before_mean": 0.24741485621780157, "reward_before_std": 0.9051576480269432, "reward_change_max": 0.0009580254554748535, "reward_change_mean": -0.3235735837370157, "reward_change_min": -0.766759853810072, "reward_change_std": 0.29583541117608547, "reward_std": 0.8811143264174461, "rewards/cosine_scaled_reward": -0.16795924259349704, "rewards/format_reward": 0.5833333507180214, "step": 290 }, { "advantage_max": 1.2139404453337193, "advantage_mean": 1.241763414316921e-09, "advantage_min": -0.5720047876238823, "advantage_std": 0.6662472151219845, "completion_length": 2837.1876220703125, "epoch": 0.3325714285714286, "grad_norm": 1.2880405187606812, "kl": 0.488037109375, "lambda_div_used": 0.6, "learning_rate": 4.998389805071536e-07, "loss": 0.0033, "reward": -0.05876442790031433, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.05876442790031433, "reward_after_std": 0.6662472151219845, "reward_before_mean": 0.31654983200132847, "reward_before_std": 0.6410909574478865, "reward_change_max": 0.0004924982786178589, "reward_change_mean": -0.3753142673522234, "reward_change_min": -0.7709561586380005, "reward_change_std": 0.28246007952839136, "reward_std": 0.666247233748436, "rewards/cosine_scaled_reward": -0.12297508306801319, "rewards/format_reward": 0.5625000018626451, "step": 291 }, { "advantage_max": 1.1201823130249977, "advantage_mean": 4.96705393482344e-09, "advantage_min": -0.626116156578064, "advantage_std": 0.6261259578168392, "completion_length": 2984.6250610351562, "epoch": 0.33371428571428574, "grad_norm": 0.36917972564697266, "kl": 0.447021484375, "lambda_div_used": 0.6, "learning_rate": 4.967182142620745e-07, "loss": 0.0341, "reward": -0.17521192249841988, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.17521192249841988, "reward_after_std": 0.6261259503662586, "reward_before_mean": 0.1484000850468874, "reward_before_std": 0.6327078305184841, "reward_change_max": 0.0006989315152168274, "reward_change_mean": -0.3236120007932186, "reward_change_min": -0.611223328858614, "reward_change_std": 0.2541379611939192, "reward_std": 0.6261259652674198, "rewards/cosine_scaled_reward": -0.16538330353796482, "rewards/format_reward": 0.479166679084301, "step": 292 }, { "advantage_max": 1.234248362481594, "advantage_mean": -3.1044085080367267e-09, "advantage_min": -0.6834790408611298, "advantage_std": 0.679344616830349, "completion_length": 2224.791702270508, "epoch": 0.33485714285714285, "grad_norm": 0.8810564875602722, "kl": 0.2921142578125, "lambda_div_used": 0.6, "learning_rate": 4.93600044896063e-07, "loss": 0.0434, "reward": 0.047704005148261786, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.047704005148261786, "reward_after_std": 0.6793446019291878, "reward_before_mean": 0.4809760805219412, "reward_before_std": 0.6479660160839558, "reward_change_max": 0.0020105764269828796, "reward_change_mean": -0.4332720432430506, "reward_change_min": -0.7641118578612804, "reward_change_std": 0.29899533465504646, "reward_std": 0.6793446280062199, "rewards/cosine_scaled_reward": -0.09284532070159912, "rewards/format_reward": 0.6666666809469461, "step": 293 }, { "advantage_max": 1.6753001362085342, "advantage_mean": 5.587935614226325e-09, "advantage_min": -0.8242075219750404, "advantage_std": 0.9228464774787426, "completion_length": 2752.104232788086, "epoch": 0.336, "grad_norm": 0.8626604080200195, "kl": 0.436859130859375, "lambda_div_used": 0.6, "learning_rate": 4.904846243842949e-07, "loss": 0.0558, "reward": 0.042919889092445374, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.042919889092445374, "reward_after_std": 0.9228464849293232, "reward_before_mean": 0.419957107398659, "reward_before_std": 0.9347262866795063, "reward_change_max": 0.0004857778549194336, "reward_change_mean": -0.3770372150465846, "reward_change_min": -0.7841157354414463, "reward_change_std": 0.3177993157878518, "reward_std": 0.9228465035557747, "rewards/cosine_scaled_reward": 0.012061880202963948, "rewards/format_reward": 0.39583334140479565, "step": 294 }, { "advantage_max": 1.5172969661653042, "advantage_mean": -2.9802324275074454e-08, "advantage_min": -0.8133530095219612, "advantage_std": 0.8478995785117149, "completion_length": 2791.770896911621, "epoch": 0.33714285714285713, "grad_norm": 0.6044384837150574, "kl": 0.42376708984375, "lambda_div_used": 0.6, "learning_rate": 4.873721045679706e-07, "loss": 0.0297, "reward": 0.18310544453561306, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.18310544453561306, "reward_after_std": 0.8478995636105537, "reward_before_mean": 0.6625577192753553, "reward_before_std": 0.8396427370607853, "reward_change_max": 0.0008948817849159241, "reward_change_mean": -0.4794522263109684, "reward_change_min": -0.8777386695146561, "reward_change_std": 0.36423896066844463, "reward_std": 0.8478995896875858, "rewards/cosine_scaled_reward": 0.10211216658353806, "rewards/format_reward": 0.4583333395421505, "step": 295 }, { "advantage_max": 1.1875118091702461, "advantage_mean": 1.3969839091076963e-08, "advantage_min": -0.5539632812142372, "advantage_std": 0.6299429349601269, "completion_length": 3087.104217529297, "epoch": 0.3382857142857143, "grad_norm": 0.39692825078964233, "kl": 0.503662109375, "lambda_div_used": 0.6, "learning_rate": 4.842626371469149e-07, "loss": 0.042, "reward": -0.1963654124410823, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.1963654124410823, "reward_after_std": 0.6299429349601269, "reward_before_mean": 0.10445545858237892, "reward_before_std": 0.5950967706739902, "reward_change_max": 0.00122012197971344, "reward_change_mean": -0.3008208554238081, "reward_change_min": -0.5454224087297916, "reward_change_std": 0.2132130330428481, "reward_std": 0.6299429535865784, "rewards/cosine_scaled_reward": -0.2081889410619624, "rewards/format_reward": 0.5208333469927311, "step": 296 }, { "advantage_max": 1.7799878790974617, "advantage_mean": 1.6453366002977532e-08, "advantage_min": -0.6720851957798004, "advantage_std": 0.9102919586002827, "completion_length": 3144.8125762939453, "epoch": 0.3394285714285714, "grad_norm": 1.7664011716842651, "kl": 0.5186767578125, "lambda_div_used": 0.6, "learning_rate": 4.811563736721829e-07, "loss": 0.1237, "reward": -0.2473097420297563, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.2473097420297563, "reward_after_std": 0.9102919548749924, "reward_before_mean": -0.04021038330392912, "reward_before_std": 0.8842969015240669, "reward_change_max": 0.0011814385652542114, "reward_change_mean": -0.2070993361994624, "reward_change_min": -0.46454621106386185, "reward_change_std": 0.18554504588246346, "reward_std": 0.910291999578476, "rewards/cosine_scaled_reward": -0.17635520159092266, "rewards/format_reward": 0.31250000558793545, "step": 297 }, { "advantage_max": 1.4756926000118256, "advantage_mean": -1.117587078436344e-08, "advantage_min": -0.6293516084551811, "advantage_std": 0.7791335694491863, "completion_length": 2378.3333740234375, "epoch": 0.3405714285714286, "grad_norm": 0.26499053835868835, "kl": 0.27886962890625, "lambda_div_used": 0.6, "learning_rate": 4.780534655386743e-07, "loss": 0.0181, "reward": 0.03418944403529167, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.03418944403529167, "reward_after_std": 0.7791335843503475, "reward_before_mean": 0.4335096925497055, "reward_before_std": 0.7074716128408909, "reward_change_max": 0.0010121092200279236, "reward_change_mean": -0.3993202708661556, "reward_change_min": -0.686762023717165, "reward_change_std": 0.27223058976233006, "reward_std": 0.779133602976799, "rewards/cosine_scaled_reward": -0.10616182815283537, "rewards/format_reward": 0.6458333469927311, "step": 298 }, { "advantage_max": 1.579380787909031, "advantage_mean": 2.048909764074125e-08, "advantage_min": -0.9291949793696404, "advantage_std": 0.9145001582801342, "completion_length": 3036.6875610351562, "epoch": 0.3417142857142857, "grad_norm": 0.5541883707046509, "kl": 0.4439697265625, "lambda_div_used": 0.6, "learning_rate": 4.749540639777539e-07, "loss": 0.0378, "reward": 0.09226292464882135, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.09226292464882135, "reward_after_std": 0.914500180631876, "reward_before_mean": 0.5100114718079567, "reward_before_std": 0.9902982860803604, "reward_change_max": 0.0006761401891708374, "reward_change_mean": -0.41774850990623236, "reward_change_min": -0.9342790246009827, "reward_change_std": 0.379890457727015, "reward_std": 0.9145002067089081, "rewards/cosine_scaled_reward": -0.015827607363462448, "rewards/format_reward": 0.5416666753590107, "step": 299 }, { "advantage_max": 1.6460872441530228, "advantage_mean": -1.1796752630299778e-08, "advantage_min": -0.8271171525120735, "advantage_std": 0.8936281614005566, "completion_length": 2863.0834350585938, "epoch": 0.34285714285714286, "grad_norm": 0.44533947110176086, "kl": 0.3939208984375, "lambda_div_used": 0.6, "learning_rate": 4.7185832004988133e-07, "loss": 0.018, "reward": 0.044442882761359215, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.044442882761359215, "reward_after_std": 0.8936281725764275, "reward_before_mean": 0.4296601233072579, "reward_before_std": 0.8874883726239204, "reward_change_max": 0.001688636839389801, "reward_change_mean": -0.38521726056933403, "reward_change_min": -0.7698900923132896, "reward_change_std": 0.299583924934268, "reward_std": 0.893628191202879, "rewards/cosine_scaled_reward": -0.03516994509845972, "rewards/format_reward": 0.5000000149011612, "step": 300 }, { "advantage_max": 1.1703692227602005, "advantage_mean": 4.0357309161187516e-09, "advantage_min": -0.5398488789796829, "advantage_std": 0.6378449760377407, "completion_length": 2709.562545776367, "epoch": 0.344, "grad_norm": 0.35299813747406006, "kl": 0.46417236328125, "lambda_div_used": 0.6, "learning_rate": 4.68766384637248e-07, "loss": 0.0359, "reward": -0.0903393222251907, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.0903393222251907, "reward_after_std": 0.637844979763031, "reward_before_mean": 0.272334769833833, "reward_before_std": 0.5997465178370476, "reward_change_max": 9.758025407791138e-05, "reward_change_mean": -0.36267405189573765, "reward_change_min": -0.6873607896268368, "reward_change_std": 0.2660413235425949, "reward_std": 0.6378450095653534, "rewards/cosine_scaled_reward": -0.13466597348451614, "rewards/format_reward": 0.5416666753590107, "step": 301 }, { "advantage_max": 1.3209370002150536, "advantage_mean": 2.9181441651982e-08, "advantage_min": -0.7568718492984772, "advantage_std": 0.7378169521689415, "completion_length": 2285.541732788086, "epoch": 0.34514285714285714, "grad_norm": 0.6973668336868286, "kl": 0.36492919921875, "lambda_div_used": 0.6, "learning_rate": 4.656784084364238e-07, "loss": 0.0088, "reward": 0.05789472348988056, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.05789472348988056, "reward_after_std": 0.7378169372677803, "reward_before_mean": 0.48412832617759705, "reward_before_std": 0.7305796556174755, "reward_change_max": 0.00021295994520187378, "reward_change_mean": -0.4262335952371359, "reward_change_min": -0.7714013680815697, "reward_change_std": 0.3117580823600292, "reward_std": 0.7378169670701027, "rewards/cosine_scaled_reward": -0.028769173979526386, "rewards/format_reward": 0.5416666809469461, "step": 302 }, { "advantage_max": 1.374896951019764, "advantage_mean": 1.7384688355548406e-08, "advantage_min": -0.7600474506616592, "advantage_std": 0.7772323749959469, "completion_length": 2603.68754196167, "epoch": 0.3462857142857143, "grad_norm": 1.0664433240890503, "kl": 0.5365142822265625, "lambda_div_used": 0.6, "learning_rate": 4.6259454195101267e-07, "loss": 0.0164, "reward": -0.05705134989693761, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.05705134989693761, "reward_after_std": 0.7772323749959469, "reward_before_mean": 0.3000062759965658, "reward_before_std": 0.8123316094279289, "reward_change_max": 0.0020028576254844666, "reward_change_mean": -0.3570576384663582, "reward_change_min": -0.7174497433006763, "reward_change_std": 0.29492284916341305, "reward_std": 0.7772323973476887, "rewards/cosine_scaled_reward": -0.17291352711617947, "rewards/format_reward": 0.6458333507180214, "step": 303 }, { "advantage_max": 1.230701245367527, "advantage_mean": -1.2417634698280722e-09, "advantage_min": -0.6204689294099808, "advantage_std": 0.6633154228329659, "completion_length": 2810.8126068115234, "epoch": 0.3474285714285714, "grad_norm": 0.49234816431999207, "kl": 0.545654296875, "lambda_div_used": 0.6, "learning_rate": 4.59514935484316e-07, "loss": 0.0385, "reward": -0.07311620330438018, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.07311620330438018, "reward_after_std": 0.6633154153823853, "reward_before_mean": 0.29546307516284287, "reward_before_std": 0.6205927468836308, "reward_change_max": 0.000993296504020691, "reward_change_mean": -0.36857929173856974, "reward_change_min": -0.6805753968656063, "reward_change_std": 0.25794041994959116, "reward_std": 0.6633154340088367, "rewards/cosine_scaled_reward": -0.18560180440545082, "rewards/format_reward": 0.6666666883975267, "step": 304 }, { "advantage_max": 1.3669767379760742, "advantage_mean": 1.0554989493538613e-08, "advantage_min": -0.6285790093243122, "advantage_std": 0.743263740092516, "completion_length": 2943.229202270508, "epoch": 0.3485714285714286, "grad_norm": 0.5039094090461731, "kl": 0.510009765625, "lambda_div_used": 0.6, "learning_rate": 4.5643973913200837e-07, "loss": 0.0536, "reward": -0.028486928436905146, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.028486928436905146, "reward_after_std": 0.7432637549936771, "reward_before_mean": 0.34963250160217285, "reward_before_std": 0.7160216048359871, "reward_change_max": 0.0010071024298667908, "reward_change_mean": -0.37811941001564264, "reward_change_min": -0.6920573115348816, "reward_change_std": 0.2673141350969672, "reward_std": 0.7432637773454189, "rewards/cosine_scaled_reward": -0.15851709432899952, "rewards/format_reward": 0.666666679084301, "step": 305 }, { "advantage_max": 1.5899255201220512, "advantage_mean": -2.825011838347713e-08, "advantage_min": -0.7959227114915848, "advantage_std": 0.8745484538376331, "completion_length": 2690.1459197998047, "epoch": 0.3497142857142857, "grad_norm": 0.4012378752231598, "kl": 0.42352294921875, "lambda_div_used": 0.6, "learning_rate": 4.5336910277482155e-07, "loss": 0.0389, "reward": 0.36468219189555384, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.36468219189555384, "reward_after_std": 0.8745484314858913, "reward_before_mean": 0.9314363989979029, "reward_before_std": 0.8190644308924675, "reward_change_max": 0.0, "reward_change_mean": -0.566754225641489, "reward_change_min": -0.9831234477460384, "reward_change_std": 0.3884328678250313, "reward_std": 0.8745484426617622, "rewards/cosine_scaled_reward": 0.08030151948332787, "rewards/format_reward": 0.7708333544433117, "step": 306 }, { "advantage_max": 1.2558432668447495, "advantage_mean": 4.0357312769412346e-09, "advantage_min": -0.7692880481481552, "advantage_std": 0.7128739319741726, "completion_length": 2790.7291870117188, "epoch": 0.35085714285714287, "grad_norm": 0.741637647151947, "kl": 0.482177734375, "lambda_div_used": 0.6, "learning_rate": 4.503031760712397e-07, "loss": 0.0262, "reward": 0.011465976946055889, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.011465976946055889, "reward_after_std": 0.7128739319741726, "reward_before_mean": 0.42434445582330227, "reward_before_std": 0.719377551227808, "reward_change_max": 0.0, "reward_change_mean": -0.41287848353385925, "reward_change_min": -0.7506432235240936, "reward_change_std": 0.30813906714320183, "reward_std": 0.7128739431500435, "rewards/cosine_scaled_reward": -0.12116110511124134, "rewards/format_reward": 0.6666666883975267, "step": 307 }, { "advantage_max": 1.6454266011714935, "advantage_mean": -1.8626453157644107e-09, "advantage_min": -0.7154504768550396, "advantage_std": 0.8662220053374767, "completion_length": 3161.291748046875, "epoch": 0.352, "grad_norm": 0.6599620580673218, "kl": 0.543212890625, "lambda_div_used": 0.6, "learning_rate": 4.4724210845020494e-07, "loss": 0.0301, "reward": -0.03038888319861144, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.03038888319861144, "reward_after_std": 0.8662219978868961, "reward_before_mean": 0.3153821690939367, "reward_before_std": 0.8270813934504986, "reward_change_max": 0.0, "reward_change_mean": -0.3457710575312376, "reward_change_min": -0.6896585188806057, "reward_change_std": 0.2616619346663356, "reward_std": 0.866222009062767, "rewards/cosine_scaled_reward": -0.12355892173945904, "rewards/format_reward": 0.5625000093132257, "step": 308 }, { "advantage_max": 1.464337058365345, "advantage_mean": 1.2417634698280722e-09, "advantage_min": -0.7494636550545692, "advantage_std": 0.8026157282292843, "completion_length": 3033.3333740234375, "epoch": 0.35314285714285715, "grad_norm": 1.1581108570098877, "kl": 0.415283203125, "lambda_div_used": 0.6, "learning_rate": 4.441860491038345e-07, "loss": 0.0976, "reward": -0.01489423681050539, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.01489423681050539, "reward_after_std": 0.8026156984269619, "reward_before_mean": 0.35966885928064585, "reward_before_std": 0.8019279204308987, "reward_change_max": 0.0005437731742858887, "reward_change_mean": -0.3745631221681833, "reward_change_min": -0.7600988857448101, "reward_change_std": 0.302903912961483, "reward_std": 0.8026157319545746, "rewards/cosine_scaled_reward": -0.10141556803137064, "rewards/format_reward": 0.562500013038516, "step": 309 }, { "advantage_max": 1.4331334754824638, "advantage_mean": 8.69234451084111e-09, "advantage_min": -0.6336538568139076, "advantage_std": 0.7798205390572548, "completion_length": 2275.1458892822266, "epoch": 0.35428571428571426, "grad_norm": 0.4870387017726898, "kl": 0.38494873046875, "lambda_div_used": 0.6, "learning_rate": 4.4113514698014953e-07, "loss": 0.0438, "reward": 0.017508748918771744, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.017508748918771744, "reward_after_std": 0.7798205390572548, "reward_before_mean": 0.4050704315304756, "reward_before_std": 0.749288871884346, "reward_change_max": 0.0, "reward_change_mean": -0.3875616807490587, "reward_change_min": -0.7554353252053261, "reward_change_std": 0.2825618553906679, "reward_std": 0.7798205390572548, "rewards/cosine_scaled_reward": -0.14121478982269764, "rewards/format_reward": 0.6875000093132257, "step": 310 }, { "advantage_max": 1.3528824746608734, "advantage_mean": -1.5522043039783995e-08, "advantage_min": -0.7126908525824547, "advantage_std": 0.7338298484683037, "completion_length": 2572.583427429199, "epoch": 0.3554285714285714, "grad_norm": 0.3856034576892853, "kl": 0.32830810546875, "lambda_div_used": 0.6, "learning_rate": 4.3808955077581546e-07, "loss": 0.0441, "reward": 0.23825683817267418, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.23825683817267418, "reward_after_std": 0.7338298559188843, "reward_before_mean": 0.7636316558346152, "reward_before_std": 0.652825366705656, "reward_change_max": 0.000289328396320343, "reward_change_mean": -0.5253748074173927, "reward_change_min": -0.8759537264704704, "reward_change_std": 0.33924413844943047, "reward_std": 0.7338298633694649, "rewards/cosine_scaled_reward": 0.058899134397506714, "rewards/format_reward": 0.6458333488553762, "step": 311 }, { "advantage_max": 1.5806788802146912, "advantage_mean": -3.414849614191695e-08, "advantage_min": -0.7388174682855606, "advantage_std": 0.843331977725029, "completion_length": 2201.1041870117188, "epoch": 0.3565714285714286, "grad_norm": 0.45848318934440613, "kl": 0.295013427734375, "lambda_div_used": 0.6, "learning_rate": 4.350494089288943e-07, "loss": 0.0287, "reward": 0.41449636314064264, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.41449636314064264, "reward_after_std": 0.8433319795876741, "reward_before_mean": 1.010321255773306, "reward_before_std": 0.7445136643946171, "reward_change_max": 0.0019322633743286133, "reward_change_mean": -0.5958249177783728, "reward_change_min": -1.0556094981729984, "reward_change_std": 0.38764845905825496, "reward_std": 0.8433320187032223, "rewards/cosine_scaled_reward": 0.18224396905861795, "rewards/format_reward": 0.6458333507180214, "step": 312 }, { "advantage_max": 1.4623200297355652, "advantage_mean": 2.545615118698663e-08, "advantage_min": -0.8194458559155464, "advantage_std": 0.8134392872452736, "completion_length": 2906.083396911621, "epoch": 0.3577142857142857, "grad_norm": 0.3625169098377228, "kl": 0.4002685546875, "lambda_div_used": 0.6, "learning_rate": 4.3201486961161093e-07, "loss": 0.0427, "reward": 0.26757752522826195, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.26757752522826195, "reward_after_std": 0.8134392537176609, "reward_before_mean": 0.7992300987243652, "reward_before_std": 0.78279247879982, "reward_change_max": 0.0, "reward_change_mean": -0.5316525120288134, "reward_change_min": -0.9376847445964813, "reward_change_std": 0.3747759759426117, "reward_std": 0.8134392835199833, "rewards/cosine_scaled_reward": 0.13919836655259132, "rewards/format_reward": 0.5208333544433117, "step": 313 }, { "advantage_max": 1.3962865434587002, "advantage_mean": -4.34617203337595e-09, "advantage_min": -0.8272168599069118, "advantage_std": 0.7858781218528748, "completion_length": 2466.541732788086, "epoch": 0.3588571428571429, "grad_norm": 0.4204121530056, "kl": 0.306884765625, "lambda_div_used": 0.6, "learning_rate": 4.2898608072313045e-07, "loss": 0.0102, "reward": 0.10956137627363205, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.10956137627363205, "reward_after_std": 0.785878136754036, "reward_before_mean": 0.5606746338307858, "reward_before_std": 0.7906352952122688, "reward_change_max": 0.0007210671901702881, "reward_change_mean": -0.4511132426559925, "reward_change_min": -0.8396630473434925, "reward_change_std": 0.3383765425533056, "reward_std": 0.7858781442046165, "rewards/cosine_scaled_reward": -0.011329350993037224, "rewards/format_reward": 0.583333345130086, "step": 314 }, { "advantage_max": 1.4159748256206512, "advantage_mean": 3.725290464995368e-09, "advantage_min": -0.7663627155125141, "advantage_std": 0.7842210382223129, "completion_length": 2764.104202270508, "epoch": 0.36, "grad_norm": 0.7089937925338745, "kl": 0.353851318359375, "lambda_div_used": 0.6, "learning_rate": 4.2596318988235037e-07, "loss": 0.035, "reward": 0.24042465770617127, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.24042465770617127, "reward_after_std": 0.7842210195958614, "reward_before_mean": 0.7616824749857187, "reward_before_std": 0.7417969591915607, "reward_change_max": 0.0007342100143432617, "reward_change_mean": -0.5212578698992729, "reward_change_min": -0.957288570702076, "reward_change_std": 0.36633536219596863, "reward_std": 0.784221027046442, "rewards/cosine_scaled_reward": 0.08917460031807423, "rewards/format_reward": 0.5833333469927311, "step": 315 }, { "advantage_max": 1.3287958353757858, "advantage_mean": -3.104408563547878e-09, "advantage_min": -0.6579811051487923, "advantage_std": 0.731446735560894, "completion_length": 3304.0834045410156, "epoch": 0.36114285714285715, "grad_norm": 0.44676336646080017, "kl": 0.4146728515625, "lambda_div_used": 0.6, "learning_rate": 4.2294634442070553e-07, "loss": 0.0356, "reward": -0.12744826450943947, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.12744826450943947, "reward_after_std": 0.731446735560894, "reward_before_mean": 0.1999343242496252, "reward_before_std": 0.7348340712487698, "reward_change_max": 0.0004966631531715393, "reward_change_mean": -0.3273825887590647, "reward_change_min": -0.6639083586633205, "reward_change_std": 0.2678557615727186, "reward_std": 0.731446772813797, "rewards/cosine_scaled_reward": -0.18128284346312284, "rewards/format_reward": 0.5625000111758709, "step": 316 }, { "advantage_max": 1.4074784219264984, "advantage_mean": -4.967054045845742e-09, "advantage_min": -0.8237576186656952, "advantage_std": 0.7849172949790955, "completion_length": 2783.5209045410156, "epoch": 0.36228571428571427, "grad_norm": 0.7022814750671387, "kl": 0.3697509765625, "lambda_div_used": 0.6, "learning_rate": 4.1993569137498776e-07, "loss": 0.044, "reward": 0.09272369928658009, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.09272369928658009, "reward_after_std": 0.7849173173308372, "reward_before_mean": 0.5296291098929942, "reward_before_std": 0.7874506935477257, "reward_change_max": 0.0004237145185470581, "reward_change_mean": -0.43690540827810764, "reward_change_min": -0.8061187639832497, "reward_change_std": 0.33239541947841644, "reward_std": 0.7849173545837402, "rewards/cosine_scaled_reward": -0.03726877458393574, "rewards/format_reward": 0.6041666809469461, "step": 317 }, { "advantage_max": 1.751317985355854, "advantage_mean": -7.45058070794613e-09, "advantage_min": -0.77198251709342, "advantage_std": 0.938833799213171, "completion_length": 2228.666763305664, "epoch": 0.36342857142857143, "grad_norm": 0.5400987863540649, "kl": 0.30938720703125, "lambda_div_used": 0.6, "learning_rate": 4.1693137748017915e-07, "loss": 0.0088, "reward": 0.12596396543085575, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.12596396543085575, "reward_after_std": 0.9388338066637516, "reward_before_mean": 0.541341919451952, "reward_before_std": 0.9054854735732079, "reward_change_max": 0.0015599653124809265, "reward_change_mean": -0.4153779400512576, "reward_change_min": -0.8198592364788055, "reward_change_std": 0.31469852663576603, "reward_std": 0.9388338439166546, "rewards/cosine_scaled_reward": -0.09391238272655755, "rewards/format_reward": 0.7291666902601719, "step": 318 }, { "advantage_max": 1.2390740811824799, "advantage_mean": -4.967053657267684e-09, "advantage_min": -0.6836567893624306, "advantage_std": 0.6966056674718857, "completion_length": 3035.0625610351562, "epoch": 0.36457142857142855, "grad_norm": 0.745254635810852, "kl": 0.441162109375, "lambda_div_used": 0.6, "learning_rate": 4.1393354916230005e-07, "loss": 0.0169, "reward": -0.17109492549207062, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.17109492549207062, "reward_after_std": 0.6966056749224663, "reward_before_mean": 0.1381418565288186, "reward_before_std": 0.7229567356407642, "reward_change_max": 0.0007069632411003113, "reward_change_mean": -0.30923677794635296, "reward_change_min": -0.6699307486414909, "reward_change_std": 0.26837681233882904, "reward_std": 0.6966056935489178, "rewards/cosine_scaled_reward": -0.19134573824703693, "rewards/format_reward": 0.5208333395421505, "step": 319 }, { "advantage_max": 1.3388882502913475, "advantage_mean": 2.7939676405797087e-09, "advantage_min": -0.6234576627612114, "advantage_std": 0.7290964983403683, "completion_length": 2174.937545776367, "epoch": 0.3657142857142857, "grad_norm": 0.5195040106773376, "kl": 0.28411865234375, "lambda_div_used": 0.6, "learning_rate": 4.1094235253127374e-07, "loss": 0.0134, "reward": 0.19459236381953815, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.19459236381953815, "reward_after_std": 0.7290964983403683, "reward_before_mean": 0.691473500803113, "reward_before_std": 0.6570716165006161, "reward_change_max": 0.0, "reward_change_mean": -0.49688112549483776, "reward_change_min": -0.8854967355728149, "reward_change_std": 0.3344013523310423, "reward_std": 0.7290965430438519, "rewards/cosine_scaled_reward": -0.0605132644996047, "rewards/format_reward": 0.8125000074505806, "step": 320 }, { "advantage_max": 1.7560719028115273, "advantage_mean": 6.208817127095756e-09, "advantage_min": -0.8476521112024784, "advantage_std": 0.9492146447300911, "completion_length": 2399.916732788086, "epoch": 0.3668571428571429, "grad_norm": 1.3485333919525146, "kl": 0.360137939453125, "lambda_div_used": 0.6, "learning_rate": 4.079579333738039e-07, "loss": 0.0709, "reward": 0.32834796188399196, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.32834796188399196, "reward_after_std": 0.9492146372795105, "reward_before_mean": 0.8588424747213139, "reward_before_std": 0.8999190554022789, "reward_change_max": 2.0541250705718994e-05, "reward_change_mean": -0.530494537204504, "reward_change_min": -1.0074727945029736, "reward_change_std": 0.3776666931807995, "reward_std": 0.9492146819829941, "rewards/cosine_scaled_reward": 0.05442123394459486, "rewards/format_reward": 0.7500000074505806, "step": 321 }, { "advantage_max": 1.445715993642807, "advantage_mean": 4.967053768289986e-09, "advantage_min": -0.7693274356424809, "advantage_std": 0.8008316233754158, "completion_length": 2231.7084045410156, "epoch": 0.368, "grad_norm": 1.13021981716156, "kl": 0.28204345703125, "lambda_div_used": 0.6, "learning_rate": 4.0498043714627006e-07, "loss": -0.0215, "reward": -0.010291448445059359, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.010291448445059359, "reward_after_std": 0.8008316159248352, "reward_before_mean": 0.3594322365242988, "reward_before_std": 0.8020209185779095, "reward_change_max": 0.0005329325795173645, "reward_change_mean": -0.36972369998693466, "reward_change_min": -0.701385248452425, "reward_change_std": 0.29476769641041756, "reward_std": 0.8008316159248352, "rewards/cosine_scaled_reward": -0.1744505581445992, "rewards/format_reward": 0.7083333469927311, "step": 322 }, { "advantage_max": 1.3354713916778564, "advantage_mean": 1.3038516211150153e-08, "advantage_min": -0.6649999637156725, "advantage_std": 0.7182338535785675, "completion_length": 2482.291732788086, "epoch": 0.36914285714285716, "grad_norm": 0.6033332347869873, "kl": 0.267822265625, "lambda_div_used": 0.6, "learning_rate": 4.020100089676376e-07, "loss": 0.0009, "reward": 0.02367839589715004, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.02367839589715004, "reward_after_std": 0.7182338386774063, "reward_before_mean": 0.4334249533712864, "reward_before_std": 0.6694931406527758, "reward_change_max": 0.0004015713930130005, "reward_change_mean": -0.40974652022123337, "reward_change_min": -0.7208868563175201, "reward_change_std": 0.27913833782076836, "reward_std": 0.7182338498532772, "rewards/cosine_scaled_reward": -0.1270375456660986, "rewards/format_reward": 0.6875000186264515, "step": 323 }, { "advantage_max": 1.4470875635743141, "advantage_mean": -2.4835269396561444e-09, "advantage_min": -0.744611281901598, "advantage_std": 0.7921902909874916, "completion_length": 2745.3750610351562, "epoch": 0.3702857142857143, "grad_norm": 0.6796770095825195, "kl": 0.3516845703125, "lambda_div_used": 0.6, "learning_rate": 3.9904679361238526e-07, "loss": -0.0007, "reward": 0.018107005394995213, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.018107005394995213, "reward_after_std": 0.7921902686357498, "reward_before_mean": 0.4099455289542675, "reward_before_std": 0.7908703275024891, "reward_change_max": 0.0, "reward_change_mean": -0.3918385021388531, "reward_change_min": -0.8086014091968536, "reward_change_std": 0.30911492742598057, "reward_std": 0.7921903058886528, "rewards/cosine_scaled_reward": -0.12836058484390378, "rewards/format_reward": 0.6666666753590107, "step": 324 }, { "advantage_max": 1.2714052945375443, "advantage_mean": 1.1175871006408045e-08, "advantage_min": -0.7065200954675674, "advantage_std": 0.7394993230700493, "completion_length": 2904.125045776367, "epoch": 0.37142857142857144, "grad_norm": 0.4810188412666321, "kl": 0.40203857421875, "lambda_div_used": 0.6, "learning_rate": 3.9609093550344907e-07, "loss": 0.0351, "reward": -0.09598593506962061, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.09598593506962061, "reward_after_std": 0.7394993156194687, "reward_before_mean": 0.2538135554641485, "reward_before_std": 0.7984256558120251, "reward_change_max": 0.001150481402873993, "reward_change_mean": -0.3497994728386402, "reward_change_min": -0.7644472010433674, "reward_change_std": 0.3115408755838871, "reward_std": 0.7394993454217911, "rewards/cosine_scaled_reward": -0.13350990042090416, "rewards/format_reward": 0.5208333414047956, "step": 325 }, { "advantage_max": 1.1877973601222038, "advantage_mean": -1.3659397890553038e-08, "advantage_min": -0.6837801039218903, "advantage_std": 0.6724862232804298, "completion_length": 2495.1250534057617, "epoch": 0.37257142857142855, "grad_norm": 0.4340304732322693, "kl": 0.27520751953125, "lambda_div_used": 0.6, "learning_rate": 3.931425787051832e-07, "loss": 0.0059, "reward": 0.0016772449016571045, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.0016772449016571045, "reward_after_std": 0.6724862270057201, "reward_before_mean": 0.4161319313570857, "reward_before_std": 0.6646304540336132, "reward_change_max": 0.00013080984354019165, "reward_change_mean": -0.414454716257751, "reward_change_min": -0.7668664567172527, "reward_change_std": 0.3069826615974307, "reward_std": 0.6724862344563007, "rewards/cosine_scaled_reward": -0.11485069431364536, "rewards/format_reward": 0.6458333376795053, "step": 326 }, { "advantage_max": 1.7221521884202957, "advantage_mean": -1.1796752574788627e-08, "advantage_min": -0.9973884001374245, "advantage_std": 0.97059765458107, "completion_length": 2255.8959045410156, "epoch": 0.3737142857142857, "grad_norm": 0.4257287383079529, "kl": 0.24005126953125, "lambda_div_used": 0.6, "learning_rate": 3.902018669163384e-07, "loss": 0.0209, "reward": 0.33461217768490314, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.33461217768490314, "reward_after_std": 0.9705976694822311, "reward_before_mean": 0.872645559720695, "reward_before_std": 0.9799929857254028, "reward_change_max": 0.001069873571395874, "reward_change_mean": -0.5380333829671144, "reward_change_min": -1.0413878448307514, "reward_change_std": 0.41941458359360695, "reward_std": 0.9705976694822311, "rewards/cosine_scaled_reward": 0.11340610310435295, "rewards/format_reward": 0.6458333507180214, "step": 327 }, { "advantage_max": 0.9665052369236946, "advantage_mean": 1.8626452158443385e-08, "advantage_min": -0.5485107228159904, "advantage_std": 0.538653664290905, "completion_length": 3075.6458892822266, "epoch": 0.37485714285714283, "grad_norm": 0.35124486684799194, "kl": 0.323455810546875, "lambda_div_used": 0.6, "learning_rate": 3.872689434630585e-07, "loss": 0.0528, "reward": -0.2801295481622219, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.2801295481622219, "reward_after_std": 0.5386536531150341, "reward_before_mean": 0.0012779205571860075, "reward_before_std": 0.5384062603116035, "reward_change_max": 0.0, "reward_change_mean": -0.2814074568450451, "reward_change_min": -0.5551128089427948, "reward_change_std": 0.21838002931326628, "reward_std": 0.5386536568403244, "rewards/cosine_scaled_reward": -0.20769438333809376, "rewards/format_reward": 0.4166666753590107, "step": 328 }, { "advantage_max": 1.6541471779346466, "advantage_mean": 8.692344233285354e-09, "advantage_min": -0.9075762033462524, "advantage_std": 0.9191523641347885, "completion_length": 2187.604223251343, "epoch": 0.376, "grad_norm": 0.6736992001533508, "kl": 0.28594970703125, "lambda_div_used": 0.6, "learning_rate": 3.843439512918949e-07, "loss": 0.0252, "reward": 0.11798514844849706, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.11798514844849706, "reward_after_std": 0.9191523641347885, "reward_before_mean": 0.5414314409717917, "reward_before_std": 0.9413018524646759, "reward_change_max": 0.00015210360288619995, "reward_change_mean": -0.4234462957829237, "reward_change_min": -0.8685862831771374, "reward_change_std": 0.34094572998583317, "reward_std": 0.9191523678600788, "rewards/cosine_scaled_reward": -0.010534274391829967, "rewards/format_reward": 0.5625000149011612, "step": 329 }, { "advantage_max": 1.466382622718811, "advantage_mean": 3.414849431004896e-09, "advantage_min": -0.6231764815747738, "advantage_std": 0.7823390513658524, "completion_length": 2061.145896911621, "epoch": 0.37714285714285717, "grad_norm": 0.473938912153244, "kl": 0.232666015625, "lambda_div_used": 0.6, "learning_rate": 3.8142703296283953e-07, "loss": -0.0219, "reward": 0.0415657889097929, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.0415657889097929, "reward_after_std": 0.7823390439152718, "reward_before_mean": 0.4359978437423706, "reward_before_std": 0.7304593771696091, "reward_change_max": 0.0, "reward_change_mean": -0.3944320511072874, "reward_change_min": -0.6981092765927315, "reward_change_std": 0.26706850342452526, "reward_std": 0.7823390439152718, "rewards/cosine_scaled_reward": -0.11533442325890064, "rewards/format_reward": 0.6666666828095913, "step": 330 }, { "advantage_max": 1.2125908732414246, "advantage_mean": 1.2417634698280722e-08, "advantage_min": -0.4857662245631218, "advantage_std": 0.627432931214571, "completion_length": 2819.250045776367, "epoch": 0.3782857142857143, "grad_norm": 0.5773917436599731, "kl": 0.3829345703125, "lambda_div_used": 0.6, "learning_rate": 3.785183306423767e-07, "loss": 0.0198, "reward": -0.22609689529053867, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.22609689529053867, "reward_after_std": 0.6274329125881195, "reward_before_mean": 0.05956357158720493, "reward_before_std": 0.5638223402202129, "reward_change_max": 0.0006118118762969971, "reward_change_mean": -0.28566044569015503, "reward_change_min": -0.5349070765078068, "reward_change_std": 0.19656004663556814, "reward_std": 0.6274329200387001, "rewards/cosine_scaled_reward": -0.17855155700817704, "rewards/format_reward": 0.4166666753590107, "step": 331 }, { "advantage_max": 1.0075550377368927, "advantage_mean": -6.364037768991082e-09, "advantage_min": -0.47515266202390194, "advantage_std": 0.537880215793848, "completion_length": 2479.5209045410156, "epoch": 0.37942857142857145, "grad_norm": 0.8273375034332275, "kl": 0.26776123046875, "lambda_div_used": 0.6, "learning_rate": 3.7561798609655373e-07, "loss": -0.0073, "reward": 0.01811327727045864, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.01811327727045864, "reward_after_std": 0.5378802232444286, "reward_before_mean": 0.46176561154425144, "reward_before_std": 0.43859507143497467, "reward_change_max": 0.0, "reward_change_mean": -0.44365235418081284, "reward_change_min": -0.6914606131613255, "reward_change_std": 0.2646033428609371, "reward_std": 0.5378802306950092, "rewards/cosine_scaled_reward": -0.0712005328387022, "rewards/format_reward": 0.6041666753590107, "step": 332 }, { "advantage_max": 1.6488222405314445, "advantage_mean": -7.450580818968433e-09, "advantage_min": -0.7030692547559738, "advantage_std": 0.8828940503299236, "completion_length": 2545.729248046875, "epoch": 0.38057142857142856, "grad_norm": 0.7539603114128113, "kl": 0.21612548828125, "lambda_div_used": 0.6, "learning_rate": 3.72726140684072e-07, "loss": 0.0547, "reward": 0.24899009801447392, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.24899009801447392, "reward_after_std": 0.882894080132246, "reward_before_mean": 0.7488440172746778, "reward_before_std": 0.8035490922629833, "reward_change_max": 0.0008507147431373596, "reward_change_mean": -0.4998539462685585, "reward_change_min": -0.8932062350213528, "reward_change_std": 0.33497907407581806, "reward_std": 0.882894080132246, "rewards/cosine_scaled_reward": -0.03182799264322966, "rewards/format_reward": 0.8125000186264515, "step": 333 }, { "advantage_max": 1.4733685553073883, "advantage_mean": 4.346172144398253e-09, "advantage_min": -0.7610689476132393, "advantage_std": 0.8075447678565979, "completion_length": 3106.979248046875, "epoch": 0.38171428571428573, "grad_norm": 0.8128080368041992, "kl": 0.3507080078125, "lambda_div_used": 0.6, "learning_rate": 3.6984293534939737e-07, "loss": 0.0485, "reward": -0.10258024837821722, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.10258024837821722, "reward_after_std": 0.8075447678565979, "reward_before_mean": 0.2223309800028801, "reward_before_std": 0.824804563075304, "reward_change_max": 0.002238534390926361, "reward_change_mean": -0.32491121254861355, "reward_change_min": -0.6764849834144115, "reward_change_std": 0.27632055804133415, "reward_std": 0.8075447827577591, "rewards/cosine_scaled_reward": -0.1700845193117857, "rewards/format_reward": 0.5625000167638063, "step": 334 }, { "advantage_max": 1.2056212276220322, "advantage_mean": -7.450580596923828e-09, "advantage_min": -0.6192683838307858, "advantage_std": 0.6638744994997978, "completion_length": 2477.958396911621, "epoch": 0.38285714285714284, "grad_norm": 0.9470255970954895, "kl": 0.269683837890625, "lambda_div_used": 0.6, "learning_rate": 3.6696851061588994e-07, "loss": -0.0232, "reward": -0.05806213865707832, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.05806213865707832, "reward_after_std": 0.6638744957745075, "reward_before_mean": 0.3192488541826606, "reward_before_std": 0.6370225492864847, "reward_change_max": 0.0, "reward_change_mean": -0.37731100153177977, "reward_change_min": -0.67901411652565, "reward_change_std": 0.27379969879984856, "reward_std": 0.6638745106756687, "rewards/cosine_scaled_reward": -0.11120891571044922, "rewards/format_reward": 0.5416666809469461, "step": 335 }, { "advantage_max": 1.6421631425619125, "advantage_mean": -3.7252904094842165e-09, "advantage_min": -0.955784484744072, "advantage_std": 0.937946081161499, "completion_length": 2682.0834197998047, "epoch": 0.384, "grad_norm": 0.47615712881088257, "kl": 0.2991943359375, "lambda_div_used": 0.6, "learning_rate": 3.641030065789562e-07, "loss": 0.0559, "reward": 0.08940086141228676, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.08940086141228676, "reward_after_std": 0.9379460960626602, "reward_before_mean": 0.4974015187472105, "reward_before_std": 1.001080434769392, "reward_change_max": 0.003382869064807892, "reward_change_mean": -0.40800066851079464, "reward_change_min": -0.8810965530574322, "reward_change_std": 0.37354159355163574, "reward_std": 0.9379461221396923, "rewards/cosine_scaled_reward": 0.009117423556745052, "rewards/format_reward": 0.4791666828095913, "step": 336 }, { "advantage_max": 1.554929867386818, "advantage_mean": -8.071462553882469e-09, "advantage_min": -0.6676862463355064, "advantage_std": 0.8406063243746758, "completion_length": 2635.687545776367, "epoch": 0.3851428571428571, "grad_norm": 0.611068844795227, "kl": 0.26171875, "lambda_div_used": 0.6, "learning_rate": 3.612465628992203e-07, "loss": 0.0136, "reward": 0.01666065352037549, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.01666065352037549, "reward_after_std": 0.8406063467264175, "reward_before_mean": 0.39856263156980276, "reward_before_std": 0.8238513432443142, "reward_change_max": 0.0, "reward_change_mean": -0.3819019701331854, "reward_change_min": -0.8491205647587776, "reward_change_std": 0.3106928654015064, "reward_std": 0.8406063690781593, "rewards/cosine_scaled_reward": -0.13405203144066036, "rewards/format_reward": 0.6666666828095913, "step": 337 }, { "advantage_max": 1.361045453697443, "advantage_mean": -1.303851654421706e-08, "advantage_min": -0.605663824826479, "advantage_std": 0.7161907237023115, "completion_length": 2193.229232788086, "epoch": 0.3862857142857143, "grad_norm": 0.2578127682209015, "kl": 0.21331787109375, "lambda_div_used": 0.6, "learning_rate": 3.5839931879571725e-07, "loss": 0.0229, "reward": 0.17649948690086603, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.17649948690086603, "reward_after_std": 0.7161907143890858, "reward_before_mean": 0.6687635667622089, "reward_before_std": 0.6105111464858055, "reward_change_max": 0.0, "reward_change_mean": -0.4922640845179558, "reward_change_min": -0.8056236505508423, "reward_change_std": 0.29702320881187916, "reward_std": 0.716190755367279, "rewards/cosine_scaled_reward": -0.03020156484853942, "rewards/format_reward": 0.7291666809469461, "step": 338 }, { "advantage_max": 1.159138262271881, "advantage_mean": -1.8626452602532595e-09, "advantage_min": -0.6259487606585026, "advantage_std": 0.6420266218483448, "completion_length": 2907.5833587646484, "epoch": 0.38742857142857146, "grad_norm": 0.7145529985427856, "kl": 0.2969970703125, "lambda_div_used": 0.6, "learning_rate": 3.555614130391079e-07, "loss": 0.0416, "reward": -0.13409867510199547, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.13409867510199547, "reward_after_std": 0.6420266218483448, "reward_before_mean": 0.20468079950660467, "reward_before_std": 0.6328683495521545, "reward_change_max": 0.0004741773009300232, "reward_change_mean": -0.33877946995198727, "reward_change_min": -0.6202888488769531, "reward_change_std": 0.24783700983971357, "reward_std": 0.6420266255736351, "rewards/cosine_scaled_reward": -0.14765961794182658, "rewards/format_reward": 0.5000000074505806, "step": 339 }, { "advantage_max": 1.1203817278146744, "advantage_mean": 9.313226079221693e-09, "advantage_min": -0.5899899490177631, "advantage_std": 0.6205705478787422, "completion_length": 2648.625045776367, "epoch": 0.38857142857142857, "grad_norm": 0.7546307444572449, "kl": 0.317138671875, "lambda_div_used": 0.6, "learning_rate": 3.5273298394491515e-07, "loss": 0.0546, "reward": -0.01088642206741497, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.01088642206741497, "reward_after_std": 0.6205705553293228, "reward_before_mean": 0.39934197315596975, "reward_before_std": 0.5859780795872211, "reward_change_max": 0.0002032071352005005, "reward_change_mean": -0.41022837720811367, "reward_change_min": -0.718485102057457, "reward_change_std": 0.27802254259586334, "reward_std": 0.6205705776810646, "rewards/cosine_scaled_reward": -0.13366236118599772, "rewards/format_reward": 0.6666666809469461, "step": 340 }, { "advantage_max": 1.4242380373179913, "advantage_mean": -9.31322596819939e-09, "advantage_min": -0.628170371055603, "advantage_std": 0.7521231174468994, "completion_length": 2428.0625610351562, "epoch": 0.38971428571428574, "grad_norm": 0.1985091269016266, "kl": 0.1912994384765625, "lambda_div_used": 0.6, "learning_rate": 3.4991416936678276e-07, "loss": 0.0032, "reward": 0.45150601863861084, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.45150601863861084, "reward_after_std": 0.7521231323480606, "reward_before_mean": 1.0890588415786624, "reward_before_std": 0.6112305838614702, "reward_change_max": 0.00016424059867858887, "reward_change_mean": -0.637552828527987, "reward_change_min": -0.9662577249109745, "reward_change_std": 0.3721097456291318, "reward_std": 0.7521231602877378, "rewards/cosine_scaled_reward": 0.22161275381222367, "rewards/format_reward": 0.6458333469927311, "step": 341 }, { "advantage_max": 1.6415075659751892, "advantage_mean": -1.3038516322172455e-08, "advantage_min": -0.7063800022006035, "advantage_std": 0.8682013414800167, "completion_length": 2603.62508392334, "epoch": 0.39085714285714285, "grad_norm": 0.45659804344177246, "kl": 0.3743896484375, "lambda_div_used": 0.6, "learning_rate": 3.471051066897562e-07, "loss": 0.0122, "reward": -0.05222347751259804, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.05222347751259804, "reward_after_std": 0.8682013303041458, "reward_before_mean": 0.27251999638974667, "reward_before_std": 0.8448606841266155, "reward_change_max": 0.0, "reward_change_mean": -0.32474347949028015, "reward_change_min": -0.7152538634836674, "reward_change_std": 0.2609505634754896, "reward_std": 0.8682013601064682, "rewards/cosine_scaled_reward": -0.12415667390450835, "rewards/format_reward": 0.5208333469927311, "step": 342 }, { "advantage_max": 1.53754311054945, "advantage_mean": -3.414849530924968e-08, "advantage_min": -0.8002948388457298, "advantage_std": 0.8562918156385422, "completion_length": 2895.041748046875, "epoch": 0.392, "grad_norm": 0.7427569627761841, "kl": 0.2850341796875, "lambda_div_used": 0.6, "learning_rate": 3.4430593282358777e-07, "loss": 0.0269, "reward": 0.15211634151637554, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.15211634151637554, "reward_after_std": 0.8562918230891228, "reward_before_mean": 0.6098415553569794, "reward_before_std": 0.8478217422962189, "reward_change_max": 0.00046613067388534546, "reward_change_mean": -0.4577252510935068, "reward_change_min": -0.887396827340126, "reward_change_std": 0.35419388487935066, "reward_std": 0.8562918454408646, "rewards/cosine_scaled_reward": 0.04450410744175315, "rewards/format_reward": 0.5208333432674408, "step": 343 }, { "advantage_max": 1.3572538942098618, "advantage_mean": -2.4214387106535895e-08, "advantage_min": -0.7771032117307186, "advantage_std": 0.7490360550582409, "completion_length": 2392.708396911621, "epoch": 0.3931428571428571, "grad_norm": 0.37972065806388855, "kl": 0.2348785400390625, "lambda_div_used": 0.6, "learning_rate": 3.4151678419606233e-07, "loss": 0.0198, "reward": 0.4535197149962187, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.4535197149962187, "reward_after_std": 0.7490360513329506, "reward_before_mean": 1.1015524696558714, "reward_before_std": 0.6594033464789391, "reward_change_max": 0.0, "reward_change_mean": -0.6480327611789107, "reward_change_min": -1.0308993272483349, "reward_change_std": 0.3941861046478152, "reward_std": 0.7490360662341118, "rewards/cosine_scaled_reward": 0.16535954643040895, "rewards/format_reward": 0.7708333432674408, "step": 344 }, { "advantage_max": 1.4794992804527283, "advantage_mean": -1.2417633588057697e-09, "advantage_min": -0.8345628753304482, "advantage_std": 0.8316364139318466, "completion_length": 2695.2917098999023, "epoch": 0.3942857142857143, "grad_norm": 0.9218659996986389, "kl": 0.342010498046875, "lambda_div_used": 0.6, "learning_rate": 3.387377967463493e-07, "loss": 0.0479, "reward": 0.1142323762178421, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.1142323762178421, "reward_after_std": 0.8316364139318466, "reward_before_mean": 0.5572534778038971, "reward_before_std": 0.8419076986610889, "reward_change_max": 0.0008572712540626526, "reward_change_mean": -0.44302110746502876, "reward_change_min": -0.8450823724269867, "reward_change_std": 0.3534926138818264, "reward_std": 0.8316364288330078, "rewards/cosine_scaled_reward": -0.03387327026575804, "rewards/format_reward": 0.6250000167638063, "step": 345 }, { "advantage_max": 1.4657285958528519, "advantage_mean": 3.7252904094842165e-09, "advantage_min": -0.7588490657508373, "advantage_std": 0.8019114583730698, "completion_length": 2404.7709197998047, "epoch": 0.3954285714285714, "grad_norm": 0.458408921957016, "kl": 0.28741455078125, "lambda_div_used": 0.6, "learning_rate": 3.359691059183761e-07, "loss": 0.0563, "reward": 0.17914676276268438, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.17914676276268438, "reward_after_std": 0.8019114658236504, "reward_before_mean": 0.6614518268033862, "reward_before_std": 0.7597181908786297, "reward_change_max": 0.0015663355588912964, "reward_change_mean": -0.48230502754449844, "reward_change_min": -0.9021615609526634, "reward_change_std": 0.3469356056302786, "reward_std": 0.801911499351263, "rewards/cosine_scaled_reward": -0.013024099171161652, "rewards/format_reward": 0.687500013038516, "step": 346 }, { "advantage_max": 1.2888052985072136, "advantage_mean": -1.2417633588057697e-09, "advantage_min": -0.6205499656498432, "advantage_std": 0.6916971392929554, "completion_length": 2850.1459045410156, "epoch": 0.3965714285714286, "grad_norm": 1.236312747001648, "kl": 0.4263916015625, "lambda_div_used": 0.6, "learning_rate": 3.3321084665422803e-07, "loss": -0.0104, "reward": -0.07637113053351641, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.07637113053351641, "reward_after_std": 0.6916971504688263, "reward_before_mean": 0.28298189770430326, "reward_before_std": 0.6537257945165038, "reward_change_max": 0.0009397566318511963, "reward_change_mean": -0.3593530207872391, "reward_change_min": -0.6594051010906696, "reward_change_std": 0.268814392387867, "reward_std": 0.6916971765458584, "rewards/cosine_scaled_reward": -0.20225906372070312, "rewards/format_reward": 0.6875000055879354, "step": 347 }, { "advantage_max": 1.5206566154956818, "advantage_mean": -4.03573130469681e-09, "advantage_min": -0.7239486165344715, "advantage_std": 0.8120072856545448, "completion_length": 2620.1459045410156, "epoch": 0.3977142857142857, "grad_norm": 0.8215629458427429, "kl": 0.445953369140625, "lambda_div_used": 0.6, "learning_rate": 3.3046315338757026e-07, "loss": 0.0162, "reward": 0.03434617887251079, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.03434617887251079, "reward_after_std": 0.8120072670280933, "reward_before_mean": 0.42738607805222273, "reward_before_std": 0.770159151405096, "reward_change_max": 0.0014997944235801697, "reward_change_mean": -0.39303990080952644, "reward_change_min": -0.7212648764252663, "reward_change_std": 0.2756068855524063, "reward_std": 0.8120072782039642, "rewards/cosine_scaled_reward": -0.11964030953822657, "rewards/format_reward": 0.6666666846722364, "step": 348 }, { "advantage_max": 1.52374729514122, "advantage_mean": -3.7252904094842165e-09, "advantage_min": -0.7489364519715309, "advantage_std": 0.8232544809579849, "completion_length": 2690.5834350585938, "epoch": 0.39885714285714285, "grad_norm": 0.9394125938415527, "kl": 0.389373779296875, "lambda_div_used": 0.6, "learning_rate": 3.2772616003709616e-07, "loss": 0.0465, "reward": 0.016556567046791315, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.016556567046791315, "reward_after_std": 0.8232545182108879, "reward_before_mean": 0.39954477082937956, "reward_before_std": 0.8015697486698627, "reward_change_max": 0.00015076994895935059, "reward_change_mean": -0.3829882238060236, "reward_change_min": -0.7486894652247429, "reward_change_std": 0.28054554015398026, "reward_std": 0.8232545331120491, "rewards/cosine_scaled_reward": -0.06064428063109517, "rewards/format_reward": 0.5208333432674408, "step": 349 }, { "advantage_max": 1.5367366746068, "advantage_mean": -9.313225746154785e-09, "advantage_min": -0.6593474373221397, "advantage_std": 0.809534341096878, "completion_length": 2321.9375610351562, "epoch": 0.4, "grad_norm": 1.0577102899551392, "kl": 0.451416015625, "lambda_div_used": 0.6, "learning_rate": 3.250000000000001e-07, "loss": -0.0017, "reward": 0.031073355115950108, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.031073355115950108, "reward_after_std": 0.8095343261957169, "reward_before_mean": 0.4226370635442436, "reward_before_std": 0.7477500736713409, "reward_change_max": 0.0005329474806785583, "reward_change_mean": -0.3915637172758579, "reward_change_min": -0.7794366888701916, "reward_change_std": 0.2754232231527567, "reward_std": 0.8095343485474586, "rewards/cosine_scaled_reward": -0.09076481917873025, "rewards/format_reward": 0.6041666828095913, "step": 350 }, { "advantage_max": 1.559445295482874, "advantage_mean": -1.2417632477834672e-09, "advantage_min": -0.9915315806865692, "advantage_std": 0.9171740040183067, "completion_length": 2582.5208740234375, "epoch": 0.40114285714285713, "grad_norm": 0.5777731537818909, "kl": 0.343505859375, "lambda_div_used": 0.6, "learning_rate": 3.222848061454764e-07, "loss": 0.0083, "reward": 0.33935420773923397, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.33935420773923397, "reward_after_std": 0.9171739853918552, "reward_before_mean": 0.8997167088091373, "reward_before_std": 0.9683718234300613, "reward_change_max": 0.0, "reward_change_mean": -0.5603625010699034, "reward_change_min": -1.0499442629516125, "reward_change_std": 0.4368437882512808, "reward_std": 0.9171739853918552, "rewards/cosine_scaled_reward": 0.07485833764076233, "rewards/format_reward": 0.7500000149011612, "step": 351 }, { "advantage_max": 1.3428475260734558, "advantage_mean": -6.8296991950766994e-09, "advantage_min": -0.5568768233060837, "advantage_std": 0.7152614146471024, "completion_length": 2627.5833587646484, "epoch": 0.4022857142857143, "grad_norm": 0.8404443860054016, "kl": 0.407501220703125, "lambda_div_used": 0.6, "learning_rate": 3.195807108082429e-07, "loss": 0.0119, "reward": 0.12012724267970043, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.12012724267970043, "reward_after_std": 0.7152614146471024, "reward_before_mean": 0.5803326356690377, "reward_before_std": 0.6293035577982664, "reward_change_max": 0.0056512728333473206, "reward_change_mean": -0.46020539198070765, "reward_change_min": -0.7574254907667637, "reward_change_std": 0.3005659803748131, "reward_std": 0.7152614295482635, "rewards/cosine_scaled_reward": -0.043167030438780785, "rewards/format_reward": 0.6666666734963655, "step": 352 }, { "advantage_max": 1.4187793508172035, "advantage_mean": -1.8626451825376478e-08, "advantage_min": -0.6966684348881245, "advantage_std": 0.7699579037725925, "completion_length": 2302.0000762939453, "epoch": 0.4034285714285714, "grad_norm": 0.48950308561325073, "kl": 0.3104248046875, "lambda_div_used": 0.6, "learning_rate": 3.168878457820915e-07, "loss": 0.0198, "reward": 0.4192593709449284, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.4192593709449284, "reward_after_std": 0.7699578814208508, "reward_before_mean": 1.0370823116973042, "reward_before_std": 0.6642060168087482, "reward_change_max": 0.0, "reward_change_mean": -0.6178229376673698, "reward_change_min": -0.9664259105920792, "reward_change_std": 0.3706759735941887, "reward_std": 0.7699579186737537, "rewards/cosine_scaled_reward": 0.08104114048182964, "rewards/format_reward": 0.8750000111758709, "step": 353 }, { "advantage_max": 1.3239182755351067, "advantage_mean": -2.1109979320144134e-08, "advantage_min": -0.7877817675471306, "advantage_std": 0.7418571058660746, "completion_length": 2000.5417442321777, "epoch": 0.4045714285714286, "grad_norm": 0.5796849727630615, "kl": 0.25811767578125, "lambda_div_used": 0.6, "learning_rate": 3.142063423134644e-07, "loss": 0.0468, "reward": 0.25255117658525705, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.25255117658525705, "reward_after_std": 0.7418571207672358, "reward_before_mean": 0.7909721238538623, "reward_before_std": 0.7007397972047329, "reward_change_max": 0.0011800751090049744, "reward_change_mean": -0.5384209705516696, "reward_change_min": -0.8849985972046852, "reward_change_std": 0.35534715466201305, "reward_std": 0.7418571468442678, "rewards/cosine_scaled_reward": 0.07256940566003323, "rewards/format_reward": 0.6458333414047956, "step": 354 }, { "advantage_max": 1.7609666138887405, "advantage_mean": -1.8626451825376478e-08, "advantage_min": -0.7916901037096977, "advantage_std": 0.9337159469723701, "completion_length": 2235.041702270508, "epoch": 0.4057142857142857, "grad_norm": 0.5317572951316833, "kl": 0.322509765625, "lambda_div_used": 0.6, "learning_rate": 3.115363310950578e-07, "loss": 0.0388, "reward": 0.3990977890789509, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.3990977890789509, "reward_after_std": 0.9337159618735313, "reward_before_mean": 0.9705567192286253, "reward_before_std": 0.8250516392290592, "reward_change_max": 0.0004812106490135193, "reward_change_mean": -0.5714588910341263, "reward_change_min": -0.9926439747214317, "reward_change_std": 0.3731604963541031, "reward_std": 0.9337159991264343, "rewards/cosine_scaled_reward": 0.131111660040915, "rewards/format_reward": 0.7083333395421505, "step": 355 }, { "advantage_max": 1.5015154480934143, "advantage_mean": -1.4280279736489376e-08, "advantage_min": -0.8326525948941708, "advantage_std": 0.8183581568300724, "completion_length": 2302.104232788086, "epoch": 0.40685714285714286, "grad_norm": 0.4310028553009033, "kl": 0.28448486328125, "lambda_div_used": 0.6, "learning_rate": 3.0887794225945143e-07, "loss": 0.0207, "reward": 0.2149380873888731, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.2149380873888731, "reward_after_std": 0.8183581568300724, "reward_before_mean": 0.7093568500131369, "reward_before_std": 0.7762251533567905, "reward_change_max": 0.0, "reward_change_mean": -0.4944187719374895, "reward_change_min": -0.8521025627851486, "reward_change_std": 0.33673784136772156, "reward_std": 0.8183581717312336, "rewards/cosine_scaled_reward": 0.0630117617547512, "rewards/format_reward": 0.5833333432674408, "step": 356 }, { "advantage_max": 1.3827911913394928, "advantage_mean": -1.024454859832602e-08, "advantage_min": -0.8060562089085579, "advantage_std": 0.7842890731990337, "completion_length": 2924.6458740234375, "epoch": 0.408, "grad_norm": 0.5307123064994812, "kl": 0.4029541015625, "lambda_div_used": 0.6, "learning_rate": 3.062313053727671e-07, "loss": 0.0524, "reward": 0.13803019496845081, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.13803019496845081, "reward_after_std": 0.7842890806496143, "reward_before_mean": 0.603233439847827, "reward_before_std": 0.7885280232876539, "reward_change_max": 0.0005675703287124634, "reward_change_mean": -0.46520326659083366, "reward_change_min": -0.8700950965285301, "reward_change_std": 0.34981742314994335, "reward_std": 0.7842891179025173, "rewards/cosine_scaled_reward": -0.0942166093736887, "rewards/format_reward": 0.7916666939854622, "step": 357 }, { "advantage_max": 1.597570613026619, "advantage_mean": -1.6142924885720333e-08, "advantage_min": -0.905064333230257, "advantage_std": 0.8988454565405846, "completion_length": 2193.166732788086, "epoch": 0.40914285714285714, "grad_norm": 0.45412543416023254, "kl": 0.317138671875, "lambda_div_used": 0.6, "learning_rate": 3.0359654942835247e-07, "loss": 0.0298, "reward": 0.4682135407347232, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.4682135407347232, "reward_after_std": 0.8988454788923264, "reward_before_mean": 1.089598136022687, "reward_before_std": 0.8698871731758118, "reward_change_max": 0.0, "reward_change_mean": -0.6213845498859882, "reward_change_min": -1.0914196744561195, "reward_change_std": 0.4229634404182434, "reward_std": 0.8988455012440681, "rewards/cosine_scaled_reward": 0.1697990447282791, "rewards/format_reward": 0.7500000149011612, "step": 358 }, { "advantage_max": 1.3419801220297813, "advantage_mean": 6.208817460162663e-09, "advantage_min": -0.618110153824091, "advantage_std": 0.7201356999576092, "completion_length": 2469.8333740234375, "epoch": 0.4102857142857143, "grad_norm": 1.6776347160339355, "kl": 0.29156494140625, "lambda_div_used": 0.6, "learning_rate": 3.0097380284049523e-07, "loss": -0.0181, "reward": 0.19314294261857867, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.19314294261857867, "reward_after_std": 0.7201356925070286, "reward_before_mean": 0.6933410288766026, "reward_before_std": 0.6368299573659897, "reward_change_max": 0.0, "reward_change_mean": -0.5001981183886528, "reward_change_min": -0.8563997894525528, "reward_change_std": 0.3123783506453037, "reward_std": 0.7201357260346413, "rewards/cosine_scaled_reward": -0.05957947578281164, "rewards/format_reward": 0.8125000111758709, "step": 359 }, { "advantage_max": 1.674271047115326, "advantage_mean": -1.8626452213954536e-08, "advantage_min": -0.9208538271486759, "advantage_std": 0.9478093385696411, "completion_length": 2530.729248046875, "epoch": 0.4114285714285714, "grad_norm": 1.0951354503631592, "kl": 0.30206298828125, "lambda_div_used": 0.6, "learning_rate": 2.9836319343816397e-07, "loss": 0.0726, "reward": 0.30874237247917335, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.30874237247917335, "reward_after_std": 0.9478093311190605, "reward_before_mean": 0.8401943445205688, "reward_before_std": 0.9553318619728088, "reward_change_max": 0.00043720006942749023, "reward_change_mean": -0.5314519703388214, "reward_change_min": -1.0268866904079914, "reward_change_std": 0.41884367167949677, "reward_std": 0.9478093758225441, "rewards/cosine_scaled_reward": 0.024263825733214617, "rewards/format_reward": 0.7916666939854622, "step": 360 }, { "advantage_max": 1.4925316274166107, "advantage_mean": -1.2417634698280722e-09, "advantage_min": -0.7755002863705158, "advantage_std": 0.7997158244252205, "completion_length": 2971.166717529297, "epoch": 0.4125714285714286, "grad_norm": 0.4740391969680786, "kl": 0.4354248046875, "lambda_div_used": 0.6, "learning_rate": 2.9576484845877793e-07, "loss": 0.0421, "reward": 0.13191415555775166, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.13191415555775166, "reward_after_std": 0.7997158244252205, "reward_before_mean": 0.5852155089378357, "reward_before_std": 0.7439655251801014, "reward_change_max": 0.0, "reward_change_mean": -0.4533013366162777, "reward_change_min": -0.7497556209564209, "reward_change_std": 0.3015373144298792, "reward_std": 0.7997158318758011, "rewards/cosine_scaled_reward": -0.08239225693978369, "rewards/format_reward": 0.7500000223517418, "step": 361 }, { "advantage_max": 1.263748835772276, "advantage_mean": -1.2728076231871555e-08, "advantage_min": -0.7216246016323566, "advantage_std": 0.717674445360899, "completion_length": 1809.4792175292969, "epoch": 0.4137142857142857, "grad_norm": 0.5852110385894775, "kl": 0.2550048828125, "lambda_div_used": 0.6, "learning_rate": 2.931788945420058e-07, "loss": 0.0364, "reward": 0.09919328487012535, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.09919328487012535, "reward_after_std": 0.7176744528114796, "reward_before_mean": 0.5574979344382882, "reward_before_std": 0.7097251052036881, "reward_change_max": 0.0014338567852973938, "reward_change_mean": -0.45830463618040085, "reward_change_min": -0.8685710802674294, "reward_change_std": 0.3367288615554571, "reward_std": 0.7176744528114796, "rewards/cosine_scaled_reward": -0.0858343681320548, "rewards/format_reward": 0.7291666828095913, "step": 362 }, { "advantage_max": 1.4477052614092827, "advantage_mean": -8.692344122263052e-09, "advantage_min": -0.6502137240022421, "advantage_std": 0.7600477300584316, "completion_length": 1877.1042098999023, "epoch": 0.41485714285714287, "grad_norm": 0.4388819932937622, "kl": 0.316253662109375, "lambda_div_used": 0.6, "learning_rate": 2.9060545772359305e-07, "loss": 0.0497, "reward": 0.12381302984431386, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.12381302984431386, "reward_after_std": 0.7600477263331413, "reward_before_mean": 0.5726947877556086, "reward_before_std": 0.6706202179193497, "reward_change_max": 0.0, "reward_change_mean": -0.44888175278902054, "reward_change_min": -0.7377006858587265, "reward_change_std": 0.2865642663091421, "reward_std": 0.7600477486848831, "rewards/cosine_scaled_reward": -0.06781928800046444, "rewards/format_reward": 0.7083333469927311, "step": 363 }, { "advantage_max": 1.0907182395458221, "advantage_mean": 1.8626449826975033e-09, "advantage_min": -0.6054031513631344, "advantage_std": 0.6060111746191978, "completion_length": 2772.291748046875, "epoch": 0.416, "grad_norm": 0.3423631191253662, "kl": 0.384765625, "lambda_div_used": 0.6, "learning_rate": 2.8804466342921987e-07, "loss": 0.0308, "reward": -0.1971070682629943, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.1971070682629943, "reward_after_std": 0.6060111969709396, "reward_before_mean": 0.11542971897870302, "reward_before_std": 0.6073710769414902, "reward_change_max": 0.000298522412776947, "reward_change_mean": -0.31253678910434246, "reward_change_min": -0.6107521578669548, "reward_change_std": 0.24526064470410347, "reward_std": 0.606011226773262, "rewards/cosine_scaled_reward": -0.2860351409763098, "rewards/format_reward": 0.6875000149011612, "step": 364 }, { "advantage_max": 1.2950073555111885, "advantage_mean": -1.2417634698280722e-09, "advantage_min": -0.7644446343183517, "advantage_std": 0.7370525486767292, "completion_length": 2624.729248046875, "epoch": 0.41714285714285715, "grad_norm": 0.7466763854026794, "kl": 0.3004150390625, "lambda_div_used": 0.6, "learning_rate": 2.854966364683872e-07, "loss": 0.0028, "reward": 0.08483308926224709, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.08483308926224709, "reward_after_std": 0.7370525635778904, "reward_before_mean": 0.531798891723156, "reward_before_std": 0.7544467896223068, "reward_change_max": 0.0004674270749092102, "reward_change_mean": -0.4469658061861992, "reward_change_min": -0.8575878739356995, "reward_change_std": 0.33127186819911003, "reward_std": 0.7370525784790516, "rewards/cosine_scaled_reward": -0.004933889955282211, "rewards/format_reward": 0.5416666679084301, "step": 365 }, { "advantage_max": 1.6756793335080147, "advantage_mean": -2.6697913713125132e-08, "advantage_min": -0.8520163223147392, "advantage_std": 0.9116014242172241, "completion_length": 2026.9375457763672, "epoch": 0.41828571428571426, "grad_norm": 0.7191529870033264, "kl": 0.226348876953125, "lambda_div_used": 0.6, "learning_rate": 2.829615010283344e-07, "loss": 0.0346, "reward": 0.32207756396383047, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.32207756396383047, "reward_after_std": 0.9116014316678047, "reward_before_mean": 0.8569716177880764, "reward_before_std": 0.8602619133889675, "reward_change_max": 0.005871579051017761, "reward_change_mean": -0.5348940622061491, "reward_change_min": -0.9633550941944122, "reward_change_std": 0.385333601385355, "reward_std": 0.9116014465689659, "rewards/cosine_scaled_reward": 0.08473577909171581, "rewards/format_reward": 0.6875000223517418, "step": 366 }, { "advantage_max": 1.5158573985099792, "advantage_mean": 1.676380712023473e-08, "advantage_min": -0.8118532225489616, "advantage_std": 0.8657009229063988, "completion_length": 2827.0208740234375, "epoch": 0.41942857142857143, "grad_norm": 0.9370729327201843, "kl": 0.374908447265625, "lambda_div_used": 0.6, "learning_rate": 2.8043938066798645e-07, "loss": 0.0387, "reward": 0.0068329013884067535, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.0068329013884067535, "reward_after_std": 0.8657009117305279, "reward_before_mean": 0.3866802779957652, "reward_before_std": 0.9256978183984756, "reward_change_max": 0.0018185079097747803, "reward_change_mean": -0.3798473794013262, "reward_change_min": -0.850589819252491, "reward_change_std": 0.34286472108215094, "reward_std": 0.86570093780756, "rewards/cosine_scaled_reward": -0.0774932000786066, "rewards/format_reward": 0.5416666828095913, "step": 367 }, { "advantage_max": 1.2594080492854118, "advantage_mean": -5.277494885547185e-09, "advantage_min": -0.5854111053049564, "advantage_std": 0.6686476469039917, "completion_length": 3070.0834045410156, "epoch": 0.4205714285714286, "grad_norm": 0.603573739528656, "kl": 0.31689453125, "lambda_div_used": 0.6, "learning_rate": 2.7793039831193133e-07, "loss": 0.02, "reward": 0.11028159782290459, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.11028159782290459, "reward_after_std": 0.6686476245522499, "reward_before_mean": 0.5779907759279013, "reward_before_std": 0.5762103162705898, "reward_change_max": 0.0005208030343055725, "reward_change_mean": -0.46770920045673847, "reward_change_min": -0.7650781571865082, "reward_change_std": 0.2938290312886238, "reward_std": 0.668647650629282, "rewards/cosine_scaled_reward": -0.054754629731178284, "rewards/format_reward": 0.687500013038516, "step": 368 }, { "advantage_max": 1.7339719384908676, "advantage_mean": -6.829698862009792e-09, "advantage_min": -0.8681919798254967, "advantage_std": 0.9657414592802525, "completion_length": 2561.6459350585938, "epoch": 0.4217142857142857, "grad_norm": 0.8986064791679382, "kl": 0.28192138671875, "lambda_div_used": 0.6, "learning_rate": 2.7543467624442956e-07, "loss": 0.0236, "reward": 0.07137996330857277, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.07137996330857277, "reward_after_std": 0.9657414592802525, "reward_before_mean": 0.4605141952633858, "reward_before_std": 1.0080587305128574, "reward_change_max": 0.0023592039942741394, "reward_change_mean": -0.38913422264158726, "reward_change_min": -0.8434956669807434, "reward_change_std": 0.34523880016058683, "reward_std": 0.9657415077090263, "rewards/cosine_scaled_reward": -0.07182625401765108, "rewards/format_reward": 0.6041666772216558, "step": 369 }, { "advantage_max": 1.1579053699970245, "advantage_mean": 2.5456151797609294e-08, "advantage_min": -0.5593762882053852, "advantage_std": 0.6268951632082462, "completion_length": 2476.4375762939453, "epoch": 0.4228571428571429, "grad_norm": 0.6041207909584045, "kl": 0.2985076904296875, "lambda_div_used": 0.6, "learning_rate": 2.729523361034538e-07, "loss": -0.0035, "reward": 0.02673279179725796, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.02673279179725796, "reward_after_std": 0.6268951632082462, "reward_before_mean": 0.45535836229100823, "reward_before_std": 0.5718512460589409, "reward_change_max": 0.00044342875480651855, "reward_change_mean": -0.428625563159585, "reward_change_min": -0.7268031612038612, "reward_change_std": 0.28326542116701603, "reward_std": 0.6268951743841171, "rewards/cosine_scaled_reward": -0.08482082560658455, "rewards/format_reward": 0.6250000186264515, "step": 370 }, { "advantage_max": 1.182312160730362, "advantage_mean": -7.450580929990736e-09, "advantage_min": -0.6014973521232605, "advantage_std": 0.6524987481534481, "completion_length": 1996.7083435058594, "epoch": 0.424, "grad_norm": 0.3128429651260376, "kl": 0.255828857421875, "lambda_div_used": 0.6, "learning_rate": 2.7048349887476037e-07, "loss": 0.0346, "reward": 0.329143688082695, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.329143688082695, "reward_after_std": 0.6524987518787384, "reward_before_mean": 0.9214862994849682, "reward_before_std": 0.5499526411294937, "reward_change_max": 0.00031290203332901, "reward_change_mean": -0.5923426318913698, "reward_change_min": -0.95097865909338, "reward_change_std": 0.3701365366578102, "reward_std": 0.6524987779557705, "rewards/cosine_scaled_reward": 0.0857431460171938, "rewards/format_reward": 0.7500000055879354, "step": 371 }, { "advantage_max": 1.4317540675401688, "advantage_mean": -1.800557042352935e-08, "advantage_min": -0.6863468810915947, "advantage_std": 0.752282090485096, "completion_length": 2944.2500915527344, "epoch": 0.42514285714285716, "grad_norm": 0.6662515997886658, "kl": 0.368804931640625, "lambda_div_used": 0.6, "learning_rate": 2.6802828488599294e-07, "loss": 0.0396, "reward": 0.14754285477101803, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.14754285477101803, "reward_after_std": 0.7522820681333542, "reward_before_mean": 0.6144732628017664, "reward_before_std": 0.6542432829737663, "reward_change_max": 0.00022286921739578247, "reward_change_mean": -0.46693046763539314, "reward_change_min": -0.7703455798327923, "reward_change_std": 0.3007415384054184, "reward_std": 0.7522820681333542, "rewards/cosine_scaled_reward": 0.00515330582857132, "rewards/format_reward": 0.6041666716337204, "step": 372 }, { "advantage_max": 1.57389435172081, "advantage_mean": -2.297262396977473e-08, "advantage_min": -0.8639702498912811, "advantage_std": 0.8862636424601078, "completion_length": 1759.6667175292969, "epoch": 0.42628571428571427, "grad_norm": 0.27647003531455994, "kl": 0.2136993408203125, "lambda_div_used": 0.6, "learning_rate": 2.655868138008171e-07, "loss": -0.0049, "reward": 0.1719374004751444, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.1719374004751444, "reward_after_std": 0.886263657361269, "reward_before_mean": 0.6331824623048306, "reward_before_std": 0.9114639163017273, "reward_change_max": 0.00457899272441864, "reward_change_mean": -0.4612450823187828, "reward_change_min": -0.9425050392746925, "reward_change_std": 0.37030158564448357, "reward_std": 0.8862636797130108, "rewards/cosine_scaled_reward": -0.03757543582469225, "rewards/format_reward": 0.7083333469927311, "step": 373 }, { "advantage_max": 1.6222765147686005, "advantage_mean": -2.4835269396561444e-09, "advantage_min": -0.8567904755473137, "advantage_std": 0.8852887079119682, "completion_length": 2289.354217529297, "epoch": 0.42742857142857144, "grad_norm": 0.6635468602180481, "kl": 0.295684814453125, "lambda_div_used": 0.6, "learning_rate": 2.631592046130896e-07, "loss": 0.0296, "reward": 0.23773611336946487, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.23773611336946487, "reward_after_std": 0.8852887004613876, "reward_before_mean": 0.7330911438912153, "reward_before_std": 0.8372438997030258, "reward_change_max": 0.0004977062344551086, "reward_change_mean": -0.4953550100326538, "reward_change_min": -0.85561952739954, "reward_change_std": 0.35369635559618473, "reward_std": 0.8852887377142906, "rewards/cosine_scaled_reward": 0.054045562632381916, "rewards/format_reward": 0.6250000186264515, "step": 374 }, { "advantage_max": 1.8102723434567451, "advantage_mean": -6.208817460162663e-09, "advantage_min": -0.8086415976285934, "advantage_std": 0.9527671858668327, "completion_length": 2324.6458740234375, "epoch": 0.42857142857142855, "grad_norm": 0.6844770312309265, "kl": 0.289886474609375, "lambda_div_used": 0.6, "learning_rate": 2.6074557564105724e-07, "loss": 0.0403, "reward": 0.1343357115983963, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.1343357115983963, "reward_after_std": 0.9527672007679939, "reward_before_mean": 0.5494648832827806, "reward_before_std": 0.8957410082221031, "reward_change_max": 0.0022966861724853516, "reward_change_mean": -0.4151291511952877, "reward_change_min": -0.7877466715872288, "reward_change_std": 0.3019845802336931, "reward_std": 0.9527672305703163, "rewards/cosine_scaled_reward": -0.08985091745853424, "rewards/format_reward": 0.7291666902601719, "step": 375 }, { "advantage_max": 1.3477432578802109, "advantage_mean": 6.208817682207268e-09, "advantage_min": -0.7531176581978798, "advantage_std": 0.7548925988376141, "completion_length": 2447.2917251586914, "epoch": 0.4297142857142857, "grad_norm": 1.0445984601974487, "kl": 0.40826416015625, "lambda_div_used": 0.6, "learning_rate": 2.583460445215911e-07, "loss": 0.0045, "reward": 0.044009771198034286, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.044009771198034286, "reward_after_std": 0.7548925764858723, "reward_before_mean": 0.46251408383250237, "reward_before_std": 0.7559454832226038, "reward_change_max": 0.0011388212442398071, "reward_change_mean": -0.41850431449711323, "reward_change_min": -0.7947998829185963, "reward_change_std": 0.3245093934237957, "reward_std": 0.7548926025629044, "rewards/cosine_scaled_reward": -0.12290963158011436, "rewards/format_reward": 0.7083333507180214, "step": 376 }, { "advantage_max": 1.4688562378287315, "advantage_mean": 1.2417633588057697e-09, "advantage_min": -0.6980493329465389, "advantage_std": 0.7906424179673195, "completion_length": 2726.3541870117188, "epoch": 0.4308571428571429, "grad_norm": 0.40274783968925476, "kl": 0.328857421875, "lambda_div_used": 0.6, "learning_rate": 2.5596072820445254e-07, "loss": 0.0206, "reward": 0.2742973640561104, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.2742973640561104, "reward_after_std": 0.7906424179673195, "reward_before_mean": 0.8079462740570307, "reward_before_std": 0.6935162991285324, "reward_change_max": 0.0008223950862884521, "reward_change_mean": -0.5336488857865334, "reward_change_min": -0.8744311928749084, "reward_change_std": 0.3369473982602358, "reward_std": 0.7906424328684807, "rewards/cosine_scaled_reward": 0.03938978351652622, "rewards/format_reward": 0.729166679084301, "step": 377 }, { "advantage_max": 1.7968226224184036, "advantage_mean": -7.450580929990736e-09, "advantage_min": -0.9664948359131813, "advantage_std": 0.9928526803851128, "completion_length": 2514.2709159851074, "epoch": 0.432, "grad_norm": 0.9523245692253113, "kl": 0.306884765625, "lambda_div_used": 0.6, "learning_rate": 2.5358974294659373e-07, "loss": 0.0033, "reward": 0.536355035379529, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.536355035379529, "reward_after_std": 0.9928526468575001, "reward_before_mean": 1.1798031572252512, "reward_before_std": 0.9377436637878418, "reward_change_max": 0.0010698065161705017, "reward_change_mean": -0.6434481181204319, "reward_change_min": -1.0964898467063904, "reward_change_std": 0.43880243599414825, "reward_std": 0.9928526617586613, "rewards/cosine_scaled_reward": 0.17323489300906658, "rewards/format_reward": 0.8333333432674408, "step": 378 }, { "advantage_max": 1.465286523103714, "advantage_mean": -2.2895014217816367e-09, "advantage_min": -0.7986491471529007, "advantage_std": 0.8143725395202637, "completion_length": 2638.291702270508, "epoch": 0.43314285714285716, "grad_norm": 0.5379257202148438, "kl": 0.4554443359375, "lambda_div_used": 0.6, "learning_rate": 2.512332043064913e-07, "loss": 0.0501, "reward": 0.009639232186600566, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.009639232186600566, "reward_after_std": 0.8143725544214249, "reward_before_mean": 0.39682869613170624, "reward_before_std": 0.832601822912693, "reward_change_max": 0.00031301379203796387, "reward_change_mean": -0.3871894534677267, "reward_change_min": -0.7461157105863094, "reward_change_std": 0.301790377125144, "reward_std": 0.8143725767731667, "rewards/cosine_scaled_reward": -0.13491899985820055, "rewards/format_reward": 0.6666666939854622, "step": 379 }, { "advantage_max": 1.2629759535193443, "advantage_mean": -1.6763806953701277e-08, "advantage_min": -0.8979735262691975, "advantage_std": 0.7433908097445965, "completion_length": 2067.666732788086, "epoch": 0.4342857142857143, "grad_norm": 0.7482815980911255, "kl": 0.200897216796875, "lambda_div_used": 0.6, "learning_rate": 2.488912271385139e-07, "loss": 0.0249, "reward": 0.12406534794718027, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.12406534794718027, "reward_after_std": 0.7433908171951771, "reward_before_mean": 0.5958141903392971, "reward_before_std": 0.7890961095690727, "reward_change_max": 0.0007266402244567871, "reward_change_mean": -0.47174884378910065, "reward_change_min": -0.871945433318615, "reward_change_std": 0.3627962898463011, "reward_std": 0.7433908544480801, "rewards/cosine_scaled_reward": -0.03542623296380043, "rewards/format_reward": 0.666666679084301, "step": 380 }, { "advantage_max": 1.2968988865613937, "advantage_mean": -3.7252904094842165e-09, "advantage_min": -0.703218549489975, "advantage_std": 0.7332097329199314, "completion_length": 2710.437530517578, "epoch": 0.43542857142857144, "grad_norm": 0.4402064085006714, "kl": 0.405792236328125, "lambda_div_used": 0.6, "learning_rate": 2.465639255873246e-07, "loss": 0.0172, "reward": 0.010914841666817665, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.010914841666817665, "reward_after_std": 0.7332097329199314, "reward_before_mean": 0.41524326987564564, "reward_before_std": 0.7420315518975258, "reward_change_max": 0.0018931254744529724, "reward_change_mean": -0.4043284226208925, "reward_change_min": -0.7467291578650475, "reward_change_std": 0.3047961834818125, "reward_std": 0.733209740370512, "rewards/cosine_scaled_reward": -0.13612838461995125, "rewards/format_reward": 0.6875000111758709, "step": 381 }, { "advantage_max": 1.5032060518860817, "advantage_mean": 6.829699084054397e-09, "advantage_min": -0.7504288945347071, "advantage_std": 0.7993675842881203, "completion_length": 2214.041748046875, "epoch": 0.43657142857142855, "grad_norm": 0.4265294671058655, "kl": 0.3445281982421875, "lambda_div_used": 0.6, "learning_rate": 2.4425141308231765e-07, "loss": 0.0432, "reward": 0.11006945720873773, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.11006945720873773, "reward_after_std": 0.7993675693869591, "reward_before_mean": 0.5445982171222568, "reward_before_std": 0.7350498847663403, "reward_change_max": 0.0, "reward_change_mean": -0.43452877178788185, "reward_change_min": -0.7362462729215622, "reward_change_std": 0.29714934155344963, "reward_std": 0.7993675693869591, "rewards/cosine_scaled_reward": -0.11311756214126945, "rewards/format_reward": 0.7708333507180214, "step": 382 }, { "advantage_max": 1.6201047748327255, "advantage_mean": -2.607703353252333e-08, "advantage_min": -0.8351623527705669, "advantage_std": 0.8896178603172302, "completion_length": 3022.916748046875, "epoch": 0.4377142857142857, "grad_norm": 0.9334142208099365, "kl": 0.3941650390625, "lambda_div_used": 0.6, "learning_rate": 2.4195380233209006e-07, "loss": 0.0189, "reward": 0.4147815710166469, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.4147815710166469, "reward_after_std": 0.8896179012954235, "reward_before_mean": 1.0097543355077505, "reward_before_std": 0.8233935404568911, "reward_change_max": 0.0, "reward_change_mean": -0.5949728116393089, "reward_change_min": -1.0136651918292046, "reward_change_std": 0.3960152920335531, "reward_std": 0.8896179012954235, "rewards/cosine_scaled_reward": 0.12987715937197208, "rewards/format_reward": 0.7500000111758709, "step": 383 }, { "advantage_max": 1.6311913207173347, "advantage_mean": -4.5324366204635425e-08, "advantage_min": -0.8875380381941795, "advantage_std": 0.9250488579273224, "completion_length": 1581.3125305175781, "epoch": 0.43885714285714283, "grad_norm": 0.9882168173789978, "kl": 0.163604736328125, "lambda_div_used": 0.6, "learning_rate": 2.3967120531894857e-07, "loss": 0.0593, "reward": 0.5434218298178166, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.5434218298178166, "reward_after_std": 0.9250488504767418, "reward_before_mean": 1.2076098858378828, "reward_before_std": 0.8887167200446129, "reward_change_max": 0.0017512813210487366, "reward_change_mean": -0.664188090711832, "reward_change_min": -1.1719777584075928, "reward_change_std": 0.461810689419508, "reward_std": 0.9250488728284836, "rewards/cosine_scaled_reward": 0.26005493476986885, "rewards/format_reward": 0.6875000074505806, "step": 384 }, { "advantage_max": 1.601989060640335, "advantage_mean": -8.692344288796505e-09, "advantage_min": -0.7441845238208771, "advantage_std": 0.8422457277774811, "completion_length": 2135.791732788086, "epoch": 0.44, "grad_norm": 0.6853197813034058, "kl": 0.221923828125, "lambda_div_used": 0.6, "learning_rate": 2.374037332934512e-07, "loss": 0.0038, "reward": 0.20848367968574166, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.20848367968574166, "reward_after_std": 0.8422457128763199, "reward_before_mean": 0.6883424371480942, "reward_before_std": 0.7463456504046917, "reward_change_max": 0.0, "reward_change_mean": -0.47985872626304626, "reward_change_min": -0.8243284970521927, "reward_change_std": 0.3069882392883301, "reward_std": 0.8422457501292229, "rewards/cosine_scaled_reward": -0.030828803312033415, "rewards/format_reward": 0.7500000260770321, "step": 385 }, { "advantage_max": 1.5159094631671906, "advantage_mean": -7.45058070794613e-09, "advantage_min": -0.7980938293039799, "advantage_std": 0.8149101585149765, "completion_length": 2838.8750915527344, "epoch": 0.44114285714285717, "grad_norm": 0.46451351046562195, "kl": 0.37481689453125, "lambda_div_used": 0.6, "learning_rate": 2.3515149676898552e-07, "loss": 0.0264, "reward": 0.2753634084947407, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.2753634084947407, "reward_after_std": 0.8149101585149765, "reward_before_mean": 0.8048880062997341, "reward_before_std": 0.7382436729967594, "reward_change_max": 0.0003626197576522827, "reward_change_mean": -0.5295246057212353, "reward_change_min": -0.8955358266830444, "reward_change_std": 0.346976475790143, "reward_std": 0.8149101696908474, "rewards/cosine_scaled_reward": 0.058694007340818644, "rewards/format_reward": 0.6875000111758709, "step": 386 }, { "advantage_max": 1.5711029320955276, "advantage_mean": 1.8626450937198058e-09, "advantage_min": -0.7883075922727585, "advantage_std": 0.8619452305138111, "completion_length": 2691.604232788086, "epoch": 0.4422857142857143, "grad_norm": 0.39026057720184326, "kl": 0.354156494140625, "lambda_div_used": 0.6, "learning_rate": 2.3291460551638237e-07, "loss": 0.0308, "reward": 0.08357733162119985, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.08357733162119985, "reward_after_std": 0.8619452491402626, "reward_before_mean": 0.4990644520148635, "reward_before_std": 0.8501224853098392, "reward_change_max": 0.0007921233773231506, "reward_change_mean": -0.41548711247742176, "reward_change_min": -0.7823773622512817, "reward_change_std": 0.32049538008868694, "reward_std": 0.8619452938437462, "rewards/cosine_scaled_reward": -0.08380109909921885, "rewards/format_reward": 0.6666666734963655, "step": 387 }, { "advantage_max": 1.3556857779622078, "advantage_mean": 2.4835265510780857e-09, "advantage_min": -0.755245964974165, "advantage_std": 0.7445164695382118, "completion_length": 2470.8334045410156, "epoch": 0.44342857142857145, "grad_norm": 0.5328013896942139, "kl": 0.38946533203125, "lambda_div_used": 0.6, "learning_rate": 2.306931685585657e-07, "loss": 0.0397, "reward": 0.27331763273105025, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.27331763273105025, "reward_after_std": 0.7445164769887924, "reward_before_mean": 0.8157808966934681, "reward_before_std": 0.6723228208720684, "reward_change_max": 0.0001545920968055725, "reward_change_mean": -0.5424632374197245, "reward_change_min": -0.9075334519147873, "reward_change_std": 0.3518282901495695, "reward_std": 0.7445164918899536, "rewards/cosine_scaled_reward": 0.064140435308218, "rewards/format_reward": 0.6875000242143869, "step": 388 }, { "advantage_max": 1.287089891731739, "advantage_mean": -6.208817571184966e-09, "advantage_min": -0.6707641631364822, "advantage_std": 0.7045476697385311, "completion_length": 2096.25008392334, "epoch": 0.44457142857142856, "grad_norm": 0.724722146987915, "kl": 0.24822998046875, "lambda_div_used": 0.6, "learning_rate": 2.2848729416523859e-07, "loss": -0.0112, "reward": 0.03399020340293646, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.03399020340293646, "reward_after_std": 0.7045476622879505, "reward_before_mean": 0.45298708602786064, "reward_before_std": 0.6788679845631123, "reward_change_max": 0.0008410587906837463, "reward_change_mean": -0.41899688611738384, "reward_change_min": -0.7899326011538506, "reward_change_std": 0.2929133272264153, "reward_std": 0.7045476697385311, "rewards/cosine_scaled_reward": -0.10683980397880077, "rewards/format_reward": 0.6666666772216558, "step": 389 }, { "advantage_max": 1.5262744650244713, "advantage_mean": -1.1796752852344383e-08, "advantage_min": -0.7143342308700085, "advantage_std": 0.8563202805817127, "completion_length": 2600.5625762939453, "epoch": 0.44571428571428573, "grad_norm": 1.5622819662094116, "kl": 0.375518798828125, "lambda_div_used": 0.6, "learning_rate": 2.2629708984760706e-07, "loss": 0.035, "reward": 0.043615717673674226, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.043615717673674226, "reward_after_std": 0.8563202731311321, "reward_before_mean": 0.44365750439465046, "reward_before_std": 0.8856248073279858, "reward_change_max": 0.003797389566898346, "reward_change_mean": -0.40004180651158094, "reward_change_min": -0.9396551251411438, "reward_change_std": 0.34944793209433556, "reward_std": 0.8563202954828739, "rewards/cosine_scaled_reward": -0.10108792106620967, "rewards/format_reward": 0.6458333432674408, "step": 390 }, { "advantage_max": 1.7013674080371857, "advantage_mean": -9.313225857177088e-09, "advantage_min": -0.8481043614447117, "advantage_std": 0.9216614998877048, "completion_length": 2286.1042098999023, "epoch": 0.44685714285714284, "grad_norm": 0.7252182960510254, "kl": 0.368560791015625, "lambda_div_used": 0.6, "learning_rate": 2.2412266235313973e-07, "loss": 0.0725, "reward": 0.17385222483426332, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.17385222483426332, "reward_after_std": 0.9216614998877048, "reward_before_mean": 0.6222281260415912, "reward_before_std": 0.8909232281148434, "reward_change_max": 0.0006584227085113525, "reward_change_mean": -0.4483759216964245, "reward_change_min": -0.8788629658520222, "reward_change_std": 0.3533425759524107, "reward_std": 0.9216615185141563, "rewards/cosine_scaled_reward": 0.05069740046747029, "rewards/format_reward": 0.5208333376795053, "step": 391 }, { "advantage_max": 1.3481401428580284, "advantage_mean": -9.934108202713787e-09, "advantage_min": -0.8792018666863441, "advantage_std": 0.7797165811061859, "completion_length": 2121.3542098999023, "epoch": 0.448, "grad_norm": 0.6497393250465393, "kl": 0.25140380859375, "lambda_div_used": 0.6, "learning_rate": 2.2196411766036487e-07, "loss": 0.0394, "reward": 0.280883414670825, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.280883414670825, "reward_after_std": 0.7797165736556053, "reward_before_mean": 0.8301408728584647, "reward_before_std": 0.7800439111888409, "reward_change_max": 0.000479675829410553, "reward_change_mean": -0.549257442355156, "reward_change_min": -0.9315827190876007, "reward_change_std": 0.3869467042386532, "reward_std": 0.7797165811061859, "rewards/cosine_scaled_reward": 0.01923706941306591, "rewards/format_reward": 0.7916666828095913, "step": 392 }, { "advantage_max": 1.978425569832325, "advantage_mean": -2.2972624191819335e-08, "advantage_min": -0.8907108381390572, "advantage_std": 1.0441827401518822, "completion_length": 2310.4583892822266, "epoch": 0.4491428571428571, "grad_norm": 0.5620689392089844, "kl": 0.23760986328125, "lambda_div_used": 0.6, "learning_rate": 2.1982156097370557e-07, "loss": -0.007, "reward": 0.18085258081555367, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.18085258081555367, "reward_after_std": 1.0441827550530434, "reward_before_mean": 0.6036436296999454, "reward_before_std": 0.999367343261838, "reward_change_max": 0.0009649470448493958, "reward_change_mean": -0.42279105074703693, "reward_change_min": -0.8772672526538372, "reward_change_std": 0.33015530183911324, "reward_std": 1.0441827774047852, "rewards/cosine_scaled_reward": -0.03151152500322496, "rewards/format_reward": 0.6666666846722364, "step": 393 }, { "advantage_max": 1.2289385050535202, "advantage_mean": 7.450580818968433e-09, "advantage_min": -0.5500497072935104, "advantage_std": 0.640130028128624, "completion_length": 2670.7292404174805, "epoch": 0.4502857142857143, "grad_norm": 0.37953662872314453, "kl": 0.3760986328125, "lambda_div_used": 0.6, "learning_rate": 2.1769509671835223e-07, "loss": 0.0425, "reward": -0.11808802396990359, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.11808802396990359, "reward_after_std": 0.6401300318539143, "reward_before_mean": 0.21933782380074263, "reward_before_std": 0.5647947750985622, "reward_change_max": 2.1852552890777588e-05, "reward_change_mean": -0.3374258540570736, "reward_change_min": -0.5754900723695755, "reward_change_std": 0.21995082311332226, "reward_std": 0.6401300616562366, "rewards/cosine_scaled_reward": -0.24449776113033295, "rewards/format_reward": 0.7083333432674408, "step": 394 }, { "advantage_max": 1.7114560678601265, "advantage_mean": -2.8560559695023358e-08, "advantage_min": -0.7531477734446526, "advantage_std": 0.915810015052557, "completion_length": 2337.000030517578, "epoch": 0.4514285714285714, "grad_norm": 0.5603395700454712, "kl": 0.28656005859375, "lambda_div_used": 0.6, "learning_rate": 2.1558482853517253e-07, "loss": 0.0182, "reward": 0.30787351354956627, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.30787351354956627, "reward_after_std": 0.9158100299537182, "reward_before_mean": 0.824693463742733, "reward_before_std": 0.8460011817514896, "reward_change_max": 0.0010833367705345154, "reward_change_mean": -0.5168199352920055, "reward_change_min": -0.9332334361970425, "reward_change_std": 0.3591248635202646, "reward_std": 0.9158100374042988, "rewards/cosine_scaled_reward": 0.05818005627952516, "rewards/format_reward": 0.708333345130086, "step": 395 }, { "advantage_max": 1.3889750689268112, "advantage_mean": -1.8626448716752009e-09, "advantage_min": -0.7343614771962166, "advantage_std": 0.7592885904014111, "completion_length": 2536.3750762939453, "epoch": 0.45257142857142857, "grad_norm": 0.4565255641937256, "kl": 0.31463623046875, "lambda_div_used": 0.6, "learning_rate": 2.134908592756607e-07, "loss": 0.0303, "reward": 0.32924389315303415, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.32924389315303415, "reward_after_std": 0.7592885978519917, "reward_before_mean": 0.9013736853376031, "reward_before_std": 0.6819272413849831, "reward_change_max": 0.0, "reward_change_mean": -0.5721297487616539, "reward_change_min": -0.9597470313310623, "reward_change_std": 0.3658205308020115, "reward_std": 0.7592885978519917, "rewards/cosine_scaled_reward": 0.06527014682069421, "rewards/format_reward": 0.7708333507180214, "step": 396 }, { "advantage_max": 1.3574538677930832, "advantage_mean": -1.5522042984272844e-08, "advantage_min": -0.5243251714855433, "advantage_std": 0.7060611471533775, "completion_length": 2016.8542175292969, "epoch": 0.45371428571428574, "grad_norm": 0.8537499904632568, "kl": 0.1929931640625, "lambda_div_used": 0.6, "learning_rate": 2.1141329099692406e-07, "loss": -0.034, "reward": 0.11655983421951532, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.11655983421951532, "reward_after_std": 0.7060611508786678, "reward_before_mean": 0.5711096227169037, "reward_before_std": 0.579546969383955, "reward_change_max": 0.001223057508468628, "reward_change_mean": -0.45454983226954937, "reward_change_min": -0.7644696235656738, "reward_change_std": 0.28570349514484406, "reward_std": 0.7060611769556999, "rewards/cosine_scaled_reward": -0.037361856549978256, "rewards/format_reward": 0.6458333432674408, "step": 397 }, { "advantage_max": 1.2000513300299644, "advantage_mean": -9.93410742555767e-09, "advantage_min": -0.8511512242257595, "advantage_std": 0.6951940208673477, "completion_length": 2343.541748046875, "epoch": 0.45485714285714285, "grad_norm": 0.4527902603149414, "kl": 0.2850341796875, "lambda_div_used": 0.6, "learning_rate": 2.0935222495670968e-07, "loss": 0.0159, "reward": 0.004995799623429775, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.004995799623429775, "reward_after_std": 0.6951940171420574, "reward_before_mean": 0.41582280572038144, "reward_before_std": 0.7327194809913635, "reward_change_max": 0.002899445593357086, "reward_change_mean": -0.41082701925188303, "reward_change_min": -0.7185042686760426, "reward_change_std": 0.3099668100476265, "reward_std": 0.6951940283179283, "rewards/cosine_scaled_reward": -0.11500527150928974, "rewards/format_reward": 0.6458333544433117, "step": 398 }, { "advantage_max": 1.6650248989462852, "advantage_mean": -3.0112764060064023e-08, "advantage_min": -0.7992283068597317, "advantage_std": 0.9040088169276714, "completion_length": 2293.0209045410156, "epoch": 0.456, "grad_norm": 0.3662654459476471, "kl": 0.20367431640625, "lambda_div_used": 0.6, "learning_rate": 2.0730776160846853e-07, "loss": 0.0122, "reward": 0.42200227081775665, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.42200227081775665, "reward_after_std": 0.904008824378252, "reward_before_mean": 1.0163545496761799, "reward_before_std": 0.8316931277513504, "reward_change_max": 0.0, "reward_change_mean": -0.5943523086607456, "reward_change_min": -1.0510099232196808, "reward_change_std": 0.3899985756725073, "reward_std": 0.9040088318288326, "rewards/cosine_scaled_reward": 0.07067725621163845, "rewards/format_reward": 0.8750000149011612, "step": 399 }, { "advantage_max": 1.7473459392786026, "advantage_mean": -1.5522043039783995e-08, "advantage_min": -0.8987687975168228, "advantage_std": 0.9387243762612343, "completion_length": 1392.0416946411133, "epoch": 0.45714285714285713, "grad_norm": 0.2878487706184387, "kl": 0.1201019287109375, "lambda_div_used": 0.6, "learning_rate": 2.0528000059645995e-07, "loss": -0.0384, "reward": 0.5482377011794597, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.5482377011794597, "reward_after_std": 0.9387243762612343, "reward_before_mean": 1.1946721822023392, "reward_before_std": 0.8241510428488255, "reward_change_max": 0.000455513596534729, "reward_change_mean": -0.6464345380663872, "reward_change_min": -1.0573916137218475, "reward_change_std": 0.41280957497656345, "reward_std": 0.9387244209647179, "rewards/cosine_scaled_reward": 0.2223361013457179, "rewards/format_reward": 0.7500000204890966, "step": 400 }, { "advantage_max": 1.1178666576743126, "advantage_mean": -1.676380706472358e-08, "advantage_min": -0.4279431030154228, "advantage_std": 0.5744078829884529, "completion_length": 2834.541778564453, "epoch": 0.4582857142857143, "grad_norm": 0.5082582235336304, "kl": 0.5018310546875, "lambda_div_used": 0.6, "learning_rate": 2.032690407508949e-07, "loss": 0.0078, "reward": 0.255753539968282, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.255753539968282, "reward_after_std": 0.5744078755378723, "reward_before_mean": 0.814882205799222, "reward_before_std": 0.3851259686052799, "reward_change_max": 0.0, "reward_change_mean": -0.5591286532580853, "reward_change_min": -0.8304774910211563, "reward_change_std": 0.30313936062157154, "reward_std": 0.5744078904390335, "rewards/cosine_scaled_reward": 0.03244107961654663, "rewards/format_reward": 0.7500000018626451, "step": 401 }, { "advantage_max": 1.2266887575387955, "advantage_mean": 1.3038516710750514e-08, "advantage_min": -0.5537307001650333, "advantage_std": 0.6451385356485844, "completion_length": 2054.104248046875, "epoch": 0.4594285714285714, "grad_norm": 1.4331732988357544, "kl": 2.166656494140625, "lambda_div_used": 0.6, "learning_rate": 2.0127498008311922e-07, "loss": 0.0531, "reward": 0.015745140612125397, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.015745140612125397, "reward_after_std": 0.645138543099165, "reward_before_mean": 0.4195268111070618, "reward_before_std": 0.547524556517601, "reward_change_max": 0.0012333318591117859, "reward_change_mean": -0.40378167293965816, "reward_change_min": -0.6718766801059246, "reward_change_std": 0.261042807251215, "reward_std": 0.6451385729014874, "rewards/cosine_scaled_reward": -0.12356993323192, "rewards/format_reward": 0.6666666772216558, "step": 402 }, { "advantage_max": 1.441964067518711, "advantage_mean": -6.208817349140361e-09, "advantage_min": -0.8136776760220528, "advantage_std": 0.816083088517189, "completion_length": 2189.562568664551, "epoch": 0.4605714285714286, "grad_norm": 1.0570858716964722, "kl": 0.165130615234375, "lambda_div_used": 0.6, "learning_rate": 1.9929791578083655e-07, "loss": 0.0366, "reward": 0.23330267828714568, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.23330267828714568, "reward_after_std": 0.8160830736160278, "reward_before_mean": 0.7426377143710852, "reward_before_std": 0.8013063948601484, "reward_change_max": 0.0, "reward_change_mean": -0.5093350410461426, "reward_change_min": -0.9062354825437069, "reward_change_std": 0.3685335274785757, "reward_std": 0.8160831034183502, "rewards/cosine_scaled_reward": -0.03493115585297346, "rewards/format_reward": 0.8125000111758709, "step": 403 }, { "advantage_max": 1.1642425507307053, "advantage_mean": -1.2417634254191512e-08, "advantage_min": -0.6793310269713402, "advantage_std": 0.6530807539820671, "completion_length": 2348.687530517578, "epoch": 0.4617142857142857, "grad_norm": 0.3486461639404297, "kl": 0.2562255859375, "lambda_div_used": 0.6, "learning_rate": 1.9733794420337213e-07, "loss": -0.005, "reward": 0.22327465657144785, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.22327465657144785, "reward_after_std": 0.6530807465314865, "reward_before_mean": 0.7535381922498345, "reward_before_std": 0.6034115515649319, "reward_change_max": 0.0, "reward_change_mean": -0.530263539403677, "reward_change_min": -0.8887680508196354, "reward_change_std": 0.3427054435014725, "reward_std": 0.6530807688832283, "rewards/cosine_scaled_reward": 0.0017690882086753845, "rewards/format_reward": 0.7500000186264515, "step": 404 }, { "advantage_max": 1.8272784128785133, "advantage_mean": -7.45058115203534e-09, "advantage_min": -0.9271564707159996, "advantage_std": 1.0041028670966625, "completion_length": 2259.083381652832, "epoch": 0.46285714285714286, "grad_norm": 0.5416133403778076, "kl": 0.2439727783203125, "lambda_div_used": 0.6, "learning_rate": 1.9539516087697517e-07, "loss": 0.0235, "reward": 0.5748835622798651, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.5748835622798651, "reward_after_std": 1.0041028745472431, "reward_before_mean": 1.2313530333340168, "reward_before_std": 0.9382626451551914, "reward_change_max": 0.0, "reward_change_mean": -0.656469464302063, "reward_change_min": -1.1503315716981888, "reward_change_std": 0.444169782102108, "reward_std": 1.0041029155254364, "rewards/cosine_scaled_reward": 0.17817650269716978, "rewards/format_reward": 0.8750000223517418, "step": 405 }, { "advantage_max": 1.7873649969696999, "advantage_mean": -2.42143873285805e-08, "advantage_min": -0.8977884463965893, "advantage_std": 0.9721400141716003, "completion_length": 2179.4376068115234, "epoch": 0.464, "grad_norm": 1.5086603164672852, "kl": 0.2163543701171875, "lambda_div_used": 0.6, "learning_rate": 1.934696604901642e-07, "loss": 0.0538, "reward": 0.5057008937001228, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.5057008937001228, "reward_after_std": 0.9721400439739227, "reward_before_mean": 1.1307021956890821, "reward_before_std": 0.8954973742365837, "reward_change_max": 0.0, "reward_change_mean": -0.6250013373792171, "reward_change_min": -1.0632744282484055, "reward_change_std": 0.4240496251732111, "reward_std": 0.9721400514245033, "rewards/cosine_scaled_reward": 0.13826777413487434, "rewards/format_reward": 0.8541666865348816, "step": 406 }, { "advantage_max": 1.568674311041832, "advantage_mean": -2.23517424569053e-08, "advantage_min": -0.8621452823281288, "advantage_std": 0.8588430657982826, "completion_length": 2311.687568664551, "epoch": 0.46514285714285714, "grad_norm": 0.7728492617607117, "kl": 0.22466278076171875, "lambda_div_used": 0.6, "learning_rate": 1.915615368891117e-07, "loss": 0.0356, "reward": 0.48120962642133236, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.48120962642133236, "reward_after_std": 0.8588430657982826, "reward_before_mean": 1.1148183699697256, "reward_before_std": 0.7778550088405609, "reward_change_max": 0.0, "reward_change_mean": -0.6336087584495544, "reward_change_min": -1.072540633380413, "reward_change_std": 0.4125585276633501, "reward_std": 0.858843095600605, "rewards/cosine_scaled_reward": 0.1511591738089919, "rewards/format_reward": 0.8125000186264515, "step": 407 }, { "advantage_max": 1.6463065594434738, "advantage_mean": -4.84287747681833e-08, "advantage_min": -0.7306011319160461, "advantage_std": 0.8598592802882195, "completion_length": 2620.1250610351562, "epoch": 0.4662857142857143, "grad_norm": 0.7048142552375793, "kl": 0.27455902099609375, "lambda_div_used": 0.6, "learning_rate": 1.8967088307307e-07, "loss": 0.0408, "reward": 0.36233993619680405, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.36233993619680405, "reward_after_std": 0.8598593175411224, "reward_before_mean": 0.9261019452242181, "reward_before_std": 0.7249259613454342, "reward_change_max": 0.0001357346773147583, "reward_change_mean": -0.5637620557099581, "reward_change_min": -0.8630594946444035, "reward_change_std": 0.34440525993704796, "reward_std": 0.8598593324422836, "rewards/cosine_scaled_reward": 0.1297176331281662, "rewards/format_reward": 0.6666666772216558, "step": 408 }, { "advantage_max": 1.3177231326699257, "advantage_mean": -3.166496848061584e-08, "advantage_min": -0.5865827538073063, "advantage_std": 0.7076354771852493, "completion_length": 3044.291717529297, "epoch": 0.4674285714285714, "grad_norm": 0.6633644104003906, "kl": 0.371734619140625, "lambda_div_used": 0.6, "learning_rate": 1.8779779118983867e-07, "loss": 0.035, "reward": 0.18076253915205598, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.18076253915205598, "reward_after_std": 0.707635473459959, "reward_before_mean": 0.6803056970238686, "reward_before_std": 0.6231886614114046, "reward_change_max": 0.0009818002581596375, "reward_change_mean": -0.499543197453022, "reward_change_min": -0.8707900457084179, "reward_change_std": 0.32813665829598904, "reward_std": 0.7076354995369911, "rewards/cosine_scaled_reward": -0.045263820327818394, "rewards/format_reward": 0.7708333432674408, "step": 409 }, { "advantage_max": 1.5697736218571663, "advantage_mean": -1.6142925107764938e-08, "advantage_min": -0.8108692672103643, "advantage_std": 0.8806155137717724, "completion_length": 2453.520851135254, "epoch": 0.4685714285714286, "grad_norm": 0.7028084993362427, "kl": 0.3839569091796875, "lambda_div_used": 0.6, "learning_rate": 1.8594235253127372e-07, "loss": 0.0156, "reward": 0.1786555778235197, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.1786555778235197, "reward_after_std": 0.8806155323982239, "reward_before_mean": 0.6418251923751086, "reward_before_std": 0.8865823717787862, "reward_change_max": 0.001151353120803833, "reward_change_mean": -0.4631696194410324, "reward_change_min": -0.9242124333977699, "reward_change_std": 0.37072835117578506, "reward_std": 0.8806155361235142, "rewards/cosine_scaled_reward": -0.03325407952070236, "rewards/format_reward": 0.7083333469927311, "step": 410 }, { "advantage_max": 1.6483391597867012, "advantage_mean": -1.2417631367611648e-09, "advantage_min": -0.9043002128601074, "advantage_std": 0.9175063073635101, "completion_length": 3027.6043090820312, "epoch": 0.4697142857142857, "grad_norm": 1.045861840248108, "kl": 0.3138427734375, "lambda_div_used": 0.6, "learning_rate": 1.8410465752883758e-07, "loss": 0.0091, "reward": 0.3661972675472498, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.3661972675472498, "reward_after_std": 0.9175063371658325, "reward_before_mean": 0.9325102139264345, "reward_before_std": 0.8878533262759447, "reward_change_max": 0.0, "reward_change_mean": -0.5663129389286041, "reward_change_min": -0.9974900856614113, "reward_change_std": 0.4016368221491575, "reward_std": 0.9175063446164131, "rewards/cosine_scaled_reward": 0.049588436260819435, "rewards/format_reward": 0.8333333432674408, "step": 411 }, { "advantage_max": 1.6660237908363342, "advantage_mean": -8.692344122263052e-09, "advantage_min": -0.8078876323997974, "advantage_std": 0.8997826427221298, "completion_length": 2970.166748046875, "epoch": 0.47085714285714286, "grad_norm": 1.1745421886444092, "kl": 0.33782958984375, "lambda_div_used": 0.6, "learning_rate": 1.822847957491922e-07, "loss": 0.0598, "reward": 0.41099782660603523, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.41099782660603523, "reward_after_std": 0.8997826538980007, "reward_before_mean": 0.9954136228188872, "reward_before_std": 0.8094833288341761, "reward_change_max": 0.0, "reward_change_mean": -0.5844158306717873, "reward_change_min": -0.9936563968658447, "reward_change_std": 0.3828421086072922, "reward_std": 0.8997826650738716, "rewards/cosine_scaled_reward": 0.10187347792088985, "rewards/format_reward": 0.7916666828095913, "step": 412 }, { "advantage_max": 1.453200839459896, "advantage_mean": -3.228585088166369e-08, "advantage_min": -0.7565282955765724, "advantage_std": 0.7962896563112736, "completion_length": 2726.9375610351562, "epoch": 0.472, "grad_norm": 0.5472252368927002, "kl": 0.32379150390625, "lambda_div_used": 0.6, "learning_rate": 1.804828558898332e-07, "loss": 0.0299, "reward": 0.3759459834545851, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.3759459834545851, "reward_after_std": 0.7962896451354027, "reward_before_mean": 0.9632791336625814, "reward_before_std": 0.7153794188052416, "reward_change_max": 0.000562228262424469, "reward_change_mean": -0.5873331986367702, "reward_change_min": -0.9731578454375267, "reward_change_std": 0.38425926864147186, "reward_std": 0.7962896823883057, "rewards/cosine_scaled_reward": 0.04413955472409725, "rewards/format_reward": 0.8750000149011612, "step": 413 }, { "advantage_max": 1.6291797831654549, "advantage_mean": 4.346172255420555e-09, "advantage_min": -0.7556163519620895, "advantage_std": 0.8886192888021469, "completion_length": 3423.291748046875, "epoch": 0.47314285714285714, "grad_norm": 1.426332712173462, "kl": 0.516357421875, "lambda_div_used": 0.6, "learning_rate": 1.7869892577476722e-07, "loss": 0.0101, "reward": 0.11338039068505168, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.11338039068505168, "reward_after_std": 0.8886192999780178, "reward_before_mean": 0.540803493000567, "reward_before_std": 0.8775491826236248, "reward_change_max": 0.0, "reward_change_mean": -0.42742311395704746, "reward_change_min": -0.8786544650793076, "reward_change_std": 0.33593827672302723, "reward_std": 0.8886192999780178, "rewards/cosine_scaled_reward": -0.0525149138411507, "rewards/format_reward": 0.6458333376795053, "step": 414 }, { "advantage_max": 1.3992087170481682, "advantage_mean": -1.4280280180578586e-08, "advantage_min": -0.8285434059798717, "advantage_std": 0.794212706387043, "completion_length": 3180.5625610351562, "epoch": 0.4742857142857143, "grad_norm": 0.5881298184394836, "kl": 0.4810791015625, "lambda_div_used": 0.6, "learning_rate": 1.7693309235023127e-07, "loss": 0.0677, "reward": 0.20738966763019562, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.20738966763019562, "reward_after_std": 0.7942127101123333, "reward_before_mean": 0.7095146963838488, "reward_before_std": 0.7941083908081055, "reward_change_max": 0.0004946514964103699, "reward_change_mean": -0.5021250303834677, "reward_change_min": -0.9012851640582085, "reward_change_std": 0.3629884757101536, "reward_std": 0.7942127585411072, "rewards/cosine_scaled_reward": 0.011007343418896198, "rewards/format_reward": 0.6875000037252903, "step": 415 }, { "advantage_max": 1.5969299972057343, "advantage_mean": -4.346172310931706e-09, "advantage_min": -0.7314600218087435, "advantage_std": 0.8682622611522675, "completion_length": 2351.0417098999023, "epoch": 0.4754285714285714, "grad_norm": 0.619637668132782, "kl": 0.21923828125, "lambda_div_used": 0.6, "learning_rate": 1.7518544168045524e-07, "loss": 0.017, "reward": 0.44899107329547405, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.44899107329547405, "reward_after_std": 0.8682622611522675, "reward_before_mean": 1.062957238405943, "reward_before_std": 0.7723662834614515, "reward_change_max": 0.0, "reward_change_mean": -0.6139661446213722, "reward_change_min": -1.0317911952733994, "reward_change_std": 0.40039036609232426, "reward_std": 0.8682622760534286, "rewards/cosine_scaled_reward": 0.0627285810187459, "rewards/format_reward": 0.9375000074505806, "step": 416 }, { "advantage_max": 1.364932507276535, "advantage_mean": 0.0, "advantage_min": -0.7825092300772667, "advantage_std": 0.7713565900921822, "completion_length": 3244.5625610351562, "epoch": 0.4765714285714286, "grad_norm": 0.9164711833000183, "kl": 0.445068359375, "lambda_div_used": 0.6, "learning_rate": 1.7345605894346726e-07, "loss": 0.0304, "reward": 0.14966288317373255, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.14966288317373255, "reward_after_std": 0.7713565900921822, "reward_before_mean": 0.6202793065458536, "reward_before_std": 0.7590491138398647, "reward_change_max": 0.0, "reward_change_mean": -0.4706163890659809, "reward_change_min": -0.8597500771284103, "reward_change_std": 0.3365605156868696, "reward_std": 0.771356612443924, "rewards/cosine_scaled_reward": -0.0752770397812128, "rewards/format_reward": 0.770833358168602, "step": 417 }, { "advantage_max": 1.712221696972847, "advantage_mean": -2.2972624136308184e-08, "advantage_min": -0.925393857061863, "advantage_std": 0.9599899351596832, "completion_length": 2048.1875381469727, "epoch": 0.4777142857142857, "grad_norm": 0.6422997117042542, "kl": 0.4207611083984375, "lambda_div_used": 0.6, "learning_rate": 1.7174502842694212e-07, "loss": -0.0047, "reward": 0.5533166616223752, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.5533166616223752, "reward_after_std": 0.9599899277091026, "reward_before_mean": 1.2044505663216114, "reward_before_std": 0.9257478043437004, "reward_change_max": 0.0003671795129776001, "reward_change_mean": -0.6511338837444782, "reward_change_min": -1.1758594512939453, "reward_change_std": 0.4581292551010847, "reward_std": 0.9599899724125862, "rewards/cosine_scaled_reward": 0.2168085891753435, "rewards/format_reward": 0.7708333469927311, "step": 418 }, { "advantage_max": 1.504380777478218, "advantage_mean": -2.4214387106535895e-08, "advantage_min": -0.9585339762270451, "advantage_std": 0.8656284362077713, "completion_length": 2767.1250610351562, "epoch": 0.47885714285714287, "grad_norm": 1.0067801475524902, "kl": 0.3099365234375, "lambda_div_used": 0.6, "learning_rate": 1.7005243352409333e-07, "loss": -0.0183, "reward": 0.4650968345813453, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.4650968345813453, "reward_after_std": 0.8656284473836422, "reward_before_mean": 1.0998233817517757, "reward_before_std": 0.8545772768557072, "reward_change_max": 0.0, "reward_change_mean": -0.6347265485674143, "reward_change_min": -1.069912202656269, "reward_change_std": 0.4303837288171053, "reward_std": 0.8656284883618355, "rewards/cosine_scaled_reward": 0.17491167411208153, "rewards/format_reward": 0.7500000223517418, "step": 419 }, { "advantage_max": 1.138153724372387, "advantage_mean": -9.313225912688239e-09, "advantage_min": -0.8338864184916019, "advantage_std": 0.6697599738836288, "completion_length": 2140.500057220459, "epoch": 0.48, "grad_norm": 0.5568791031837463, "kl": 0.23101806640625, "lambda_div_used": 0.6, "learning_rate": 1.6837835672960831e-07, "loss": 0.0396, "reward": 0.27711703907698393, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.27711703907698393, "reward_after_std": 0.6697599701583385, "reward_before_mean": 0.8464478626847267, "reward_before_std": 0.659028060734272, "reward_change_max": 0.0013652518391609192, "reward_change_mean": -0.5693307928740978, "reward_change_min": -0.9026694595813751, "reward_change_std": 0.3788332063704729, "reward_std": 0.6697599962353706, "rewards/cosine_scaled_reward": -0.0246927491389215, "rewards/format_reward": 0.8958333432674408, "step": 420 }, { "advantage_max": 1.6502467170357704, "advantage_mean": -6.829699028543246e-09, "advantage_min": -0.7769649140536785, "advantage_std": 0.8680305257439613, "completion_length": 2986.8333740234375, "epoch": 0.48114285714285715, "grad_norm": 0.6128113269805908, "kl": 0.376708984375, "lambda_div_used": 0.6, "learning_rate": 1.6672287963562852e-07, "loss": 0.047, "reward": 0.0061189401894807816, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.0061189401894807816, "reward_after_std": 0.8680305257439613, "reward_before_mean": 0.36435882560908794, "reward_before_std": 0.8137674778699875, "reward_change_max": 0.0, "reward_change_mean": -0.3582398798316717, "reward_change_min": -0.6710369400680065, "reward_change_std": 0.2599798422306776, "reward_std": 0.8680305629968643, "rewards/cosine_scaled_reward": -0.16157058905810118, "rewards/format_reward": 0.6875000242143869, "step": 421 }, { "advantage_max": 1.508325882256031, "advantage_mean": -2.1109978765032622e-08, "advantage_min": -0.6916386522352695, "advantage_std": 0.812825795263052, "completion_length": 2900.9584045410156, "epoch": 0.48228571428571426, "grad_norm": 0.6628983616828918, "kl": 0.346527099609375, "lambda_div_used": 0.6, "learning_rate": 1.6508608292777203e-07, "loss": 0.0175, "reward": 0.3042160285403952, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.3042160285403952, "reward_after_std": 0.812825795263052, "reward_before_mean": 0.8499894179403782, "reward_before_std": 0.7234699986875057, "reward_change_max": 0.0008436664938926697, "reward_change_mean": -0.5457733869552612, "reward_change_min": -0.9079204723238945, "reward_change_std": 0.3513330090790987, "reward_std": 0.8128258399665356, "rewards/cosine_scaled_reward": 0.00832803500816226, "rewards/format_reward": 0.8333333469927311, "step": 422 }, { "advantage_max": 1.5517828837037086, "advantage_mean": 1.2417634698280722e-09, "advantage_min": -0.7369258608669043, "advantage_std": 0.8391496613621712, "completion_length": 2629.270896911621, "epoch": 0.48342857142857143, "grad_norm": 0.5332776308059692, "kl": 0.31683349609375, "lambda_div_used": 0.6, "learning_rate": 1.6346804638120098e-07, "loss": 0.0051, "reward": 0.09200821781996638, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.09200821781996638, "reward_after_std": 0.8391496725380421, "reward_before_mean": 0.5115727577358484, "reward_before_std": 0.8018813747912645, "reward_change_max": 0.00034625083208084106, "reward_change_mean": -0.4195645246654749, "reward_change_min": -0.7326377555727959, "reward_change_std": 0.2982933344319463, "reward_std": 0.839149683713913, "rewards/cosine_scaled_reward": -0.10879695974290371, "rewards/format_reward": 0.7291666846722364, "step": 423 }, { "advantage_max": 1.4316673576831818, "advantage_mean": -7.450580818968433e-09, "advantage_min": -0.7059814631938934, "advantage_std": 0.7919131703674793, "completion_length": 3144.0208892822266, "epoch": 0.4845714285714286, "grad_norm": 1.1393442153930664, "kl": 0.509765625, "lambda_div_used": 0.6, "learning_rate": 1.6186884885673413e-07, "loss": 0.0139, "reward": -0.05595473758876324, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.05595473758876324, "reward_after_std": 0.7919131703674793, "reward_before_mean": 0.29102808563038707, "reward_before_std": 0.7984279748052359, "reward_change_max": 0.0006482526659965515, "reward_change_mean": -0.3469828423112631, "reward_change_min": -0.7688832432031631, "reward_change_std": 0.29653407260775566, "reward_std": 0.791913203895092, "rewards/cosine_scaled_reward": -0.1461526220664382, "rewards/format_reward": 0.583333345130086, "step": 424 }, { "advantage_max": 2.001866862177849, "advantage_mean": -2.9181442262604662e-08, "advantage_min": -0.9877432808279991, "advantage_std": 1.09125317633152, "completion_length": 2045.8542175292969, "epoch": 0.4857142857142857, "grad_norm": 1.4421943426132202, "kl": 0.1781005859375, "lambda_div_used": 0.6, "learning_rate": 1.6028856829700258e-07, "loss": 0.034, "reward": 0.9677936118096113, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.9677936118096113, "reward_after_std": 1.0912531986832619, "reward_before_mean": 1.8249291330575943, "reward_before_std": 0.9513313472270966, "reward_change_max": 2.5540590286254883e-05, "reward_change_mean": -0.8571355231106281, "reward_change_min": -1.4232430160045624, "reward_change_std": 0.5513026043772697, "reward_std": 1.091253213584423, "rewards/cosine_scaled_reward": 0.4749645469710231, "rewards/format_reward": 0.8750000074505806, "step": 425 }, { "advantage_max": 1.7256113290786743, "advantage_mean": 3.7252904094842165e-09, "advantage_min": -0.8169574663043022, "advantage_std": 0.9189626127481461, "completion_length": 2349.7084045410156, "epoch": 0.4868571428571429, "grad_norm": 0.7599732875823975, "kl": 0.3668975830078125, "lambda_div_used": 0.6, "learning_rate": 1.5872728172265146e-07, "loss": 0.0176, "reward": 0.4623431172221899, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.4623431172221899, "reward_after_std": 0.9189625754952431, "reward_before_mean": 1.061521541327238, "reward_before_std": 0.8109705522656441, "reward_change_max": 0.0, "reward_change_mean": -0.5991784110665321, "reward_change_min": -1.0059629045426846, "reward_change_std": 0.38436930999159813, "reward_std": 0.9189625829458237, "rewards/cosine_scaled_reward": 0.1140940950717777, "rewards/format_reward": 0.8333333507180214, "step": 426 }, { "advantage_max": 1.1098849326372147, "advantage_mean": 5.2774946912581555e-09, "advantage_min": -0.4467169027775526, "advantage_std": 0.5858809929341078, "completion_length": 2994.5209045410156, "epoch": 0.488, "grad_norm": 0.7700356841087341, "kl": 0.361083984375, "lambda_div_used": 0.6, "learning_rate": 1.5718506522858572e-07, "loss": 0.0261, "reward": 0.219692911952734, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.219692911952734, "reward_after_std": 0.5858809780329466, "reward_before_mean": 0.7604799484834075, "reward_before_std": 0.42057891469448805, "reward_change_max": 0.0, "reward_change_mean": -0.5407870132476091, "reward_change_min": -0.8110893554985523, "reward_change_std": 0.3113178089261055, "reward_std": 0.5858809947967529, "rewards/cosine_scaled_reward": -0.015593377873301506, "rewards/format_reward": 0.7916666846722364, "step": 427 }, { "advantage_max": 1.5108700022101402, "advantage_mean": 8.692344288796505e-09, "advantage_min": -0.7426536791026592, "advantage_std": 0.8018022254109383, "completion_length": 2830.979217529297, "epoch": 0.48914285714285716, "grad_norm": 0.6052858829498291, "kl": 0.4901123046875, "lambda_div_used": 0.6, "learning_rate": 1.5566199398026147e-07, "loss": 0.0524, "reward": 0.004844090901315212, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.004844090901315212, "reward_after_std": 0.80180224776268, "reward_before_mean": 0.37800159538164735, "reward_before_std": 0.7561859395354986, "reward_change_max": 0.0, "reward_change_mean": -0.37315749377012253, "reward_change_min": -0.7145323418080807, "reward_change_std": 0.2716970220208168, "reward_std": 0.8018022775650024, "rewards/cosine_scaled_reward": -0.17558254208415747, "rewards/format_reward": 0.729166692122817, "step": 428 }, { "advantage_max": 1.5547840893268585, "advantage_mean": 3.1044086745701804e-09, "advantage_min": -0.8601616211235523, "advantage_std": 0.8643078021705151, "completion_length": 2310.000072479248, "epoch": 0.49028571428571427, "grad_norm": 0.81473708152771, "kl": 0.19464111328125, "lambda_div_used": 0.6, "learning_rate": 1.5415814221002265e-07, "loss": 0.0051, "reward": 0.29873619112186134, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.29873619112186134, "reward_after_std": 0.8643078021705151, "reward_before_mean": 0.8367813774384558, "reward_before_std": 0.8450817838311195, "reward_change_max": 0.00034783780574798584, "reward_change_mean": -0.5380451511591673, "reward_change_min": -1.0198836103081703, "reward_change_std": 0.3816765770316124, "reward_std": 0.8643078207969666, "rewards/cosine_scaled_reward": 0.0017240047454833984, "rewards/format_reward": 0.8333333469927311, "step": 429 }, { "advantage_max": 1.532344713807106, "advantage_mean": -1.8626450937198058e-09, "advantage_min": -0.5167469158768654, "advantage_std": 0.7778219282627106, "completion_length": 2256.250045776367, "epoch": 0.49142857142857144, "grad_norm": 0.7217494249343872, "kl": 0.24468994140625, "lambda_div_used": 0.6, "learning_rate": 1.5267358321348285e-07, "loss": 0.0228, "reward": 0.2548238287563436, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.2548238287563436, "reward_after_std": 0.7778219506144524, "reward_before_mean": 0.769319811835885, "reward_before_std": 0.6003479808568954, "reward_change_max": 0.0004530102014541626, "reward_change_mean": -0.5144959762692451, "reward_change_min": -0.7555083595216274, "reward_change_std": 0.28606732469052076, "reward_std": 0.7778219655156136, "rewards/cosine_scaled_reward": 0.040909904055297375, "rewards/format_reward": 0.6875000055879354, "step": 430 }, { "advantage_max": 1.2318995967507362, "advantage_mean": -1.024454859832602e-08, "advantage_min": -0.6241239868104458, "advantage_std": 0.6798490583896637, "completion_length": 2692.979263305664, "epoch": 0.49257142857142855, "grad_norm": 0.409150630235672, "kl": 0.30722808837890625, "lambda_div_used": 0.6, "learning_rate": 1.5120838934595337e-07, "loss": 0.032, "reward": 0.08035399056097958, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.08035399056097958, "reward_after_std": 0.6798490472137928, "reward_before_mean": 0.5355471428483725, "reward_before_std": 0.6428811736404896, "reward_change_max": 0.0008766204118728638, "reward_change_mean": -0.4551931694149971, "reward_change_min": -0.8111777156591415, "reward_change_std": 0.31029749289155006, "reward_std": 0.679849062114954, "rewards/cosine_scaled_reward": -0.18014310486614704, "rewards/format_reward": 0.8958333432674408, "step": 431 }, { "advantage_max": 1.5041551887989044, "advantage_mean": -9.313226023710541e-09, "advantage_min": -0.660958144813776, "advantage_std": 0.7954925112426281, "completion_length": 3086.354248046875, "epoch": 0.4937142857142857, "grad_norm": 0.9090562462806702, "kl": 0.4284820556640625, "lambda_div_used": 0.6, "learning_rate": 1.4976263201891613e-07, "loss": 0.0191, "reward": 0.15131173096597195, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.15131173096597195, "reward_after_std": 0.7954924888908863, "reward_before_mean": 0.6126590967178345, "reward_before_std": 0.7049643620848656, "reward_change_max": 1.9669532775878906e-06, "reward_change_mean": -0.4613474104553461, "reward_change_min": -0.7781161703169346, "reward_change_std": 0.2984175104647875, "reward_std": 0.7954925149679184, "rewards/cosine_scaled_reward": -0.037420436972752213, "rewards/format_reward": 0.6875000111758709, "step": 432 }, { "advantage_max": 1.669169820845127, "advantage_mean": -6.208817127095756e-09, "advantage_min": -0.726369071751833, "advantage_std": 0.875070009380579, "completion_length": 2916.4792251586914, "epoch": 0.4948571428571429, "grad_norm": 0.7531116008758545, "kl": 0.50775146484375, "lambda_div_used": 0.6, "learning_rate": 1.483363816965435e-07, "loss": 0.0138, "reward": 0.38012822410382796, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.38012822410382796, "reward_after_std": 0.8750699795782566, "reward_before_mean": 0.9496738724410534, "reward_before_std": 0.7544910423457623, "reward_change_max": 0.0009396150708198547, "reward_change_mean": -0.5695456936955452, "reward_change_min": -0.8935679048299789, "reward_change_std": 0.34632328897714615, "reward_std": 0.8750700131058693, "rewards/cosine_scaled_reward": 0.06858693342655897, "rewards/format_reward": 0.8125000149011612, "step": 433 }, { "advantage_max": 1.0035743713378906, "advantage_mean": 5.58793539218172e-09, "advantage_min": -0.407320324331522, "advantage_std": 0.5333701260387897, "completion_length": 3139.812545776367, "epoch": 0.496, "grad_norm": 2.604379653930664, "kl": 0.579833984375, "lambda_div_used": 0.6, "learning_rate": 1.469297078922642e-07, "loss": -0.0146, "reward": -0.09332448849454522, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.09332448849454522, "reward_after_std": 0.53337012976408, "reward_before_mean": 0.28518972732126713, "reward_before_std": 0.46411832980811596, "reward_change_max": 0.0, "reward_change_mean": -0.3785142097622156, "reward_change_min": -0.6618387140333652, "reward_change_std": 0.23844915814697742, "reward_std": 0.5333701446652412, "rewards/cosine_scaled_reward": -0.23240514658391476, "rewards/format_reward": 0.750000013038516, "step": 434 }, { "advantage_max": 1.3506320640444756, "advantage_mean": 3.7252904094842165e-09, "advantage_min": -0.7499644756317139, "advantage_std": 0.735443152487278, "completion_length": 2099.187545776367, "epoch": 0.49714285714285716, "grad_norm": 0.4415164291858673, "kl": 0.31992340087890625, "lambda_div_used": 0.6, "learning_rate": 1.4554267916537495e-07, "loss": 0.0248, "reward": 0.22127506462857127, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.22127506462857127, "reward_after_std": 0.7354431487619877, "reward_before_mean": 0.7249867599457502, "reward_before_std": 0.6692739687860012, "reward_change_max": 0.00029733777046203613, "reward_change_mean": -0.5037116818130016, "reward_change_min": -0.8023575022816658, "reward_change_std": 0.3164084330201149, "reward_std": 0.735443152487278, "rewards/cosine_scaled_reward": -0.05417329433839768, "rewards/format_reward": 0.8333333469927311, "step": 435 }, { "advantage_max": 1.5958310216665268, "advantage_mean": 7.771561172376096e-16, "advantage_min": -0.7120724134147167, "advantage_std": 0.8530858978629112, "completion_length": 2177.645866394043, "epoch": 0.4982857142857143, "grad_norm": 0.43583664298057556, "kl": 0.21563720703125, "lambda_div_used": 0.6, "learning_rate": 1.4417536311769885e-07, "loss": 0.005, "reward": 0.5215210448950529, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.5215210448950529, "reward_after_std": 0.85308588296175, "reward_before_mean": 1.1749990084208548, "reward_before_std": 0.7143463343381882, "reward_change_max": 0.0, "reward_change_mean": -0.6534778997302055, "reward_change_min": -1.098868913948536, "reward_change_std": 0.4049825519323349, "reward_std": 0.8530858978629112, "rewards/cosine_scaled_reward": 0.149999488145113, "rewards/format_reward": 0.875, "step": 436 }, { "advantage_max": 1.501756675541401, "advantage_mean": -9.93410742555767e-09, "advantage_min": -0.7757223770022392, "advantage_std": 0.8331665247678757, "completion_length": 3171.3959045410156, "epoch": 0.49942857142857144, "grad_norm": 0.7524950504302979, "kl": 0.406005859375, "lambda_div_used": 0.6, "learning_rate": 1.4282782639029128e-07, "loss": 0.0419, "reward": 0.14843263989314437, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.14843263989314437, "reward_after_std": 0.8331665396690369, "reward_before_mean": 0.60943434946239, "reward_before_std": 0.8200727887451649, "reward_change_max": 5.303323268890381e-05, "reward_change_mean": -0.46100171096622944, "reward_change_min": -0.8599130436778069, "reward_change_std": 0.33630202896893024, "reward_std": 0.8331665582954884, "rewards/cosine_scaled_reward": -0.09111617412418127, "rewards/format_reward": 0.7916666865348816, "step": 437 }, { "advantage_max": 1.5247759148478508, "advantage_mean": -3.1044086745701804e-09, "advantage_min": -0.624311737716198, "advantage_std": 0.7884609997272491, "completion_length": 2956.6250762939453, "epoch": 0.5005714285714286, "grad_norm": 0.3807191550731659, "kl": 0.3555755615234375, "lambda_div_used": 0.6, "learning_rate": 1.4150013466019114e-07, "loss": 0.0229, "reward": 0.012609760742634535, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.012609760742634535, "reward_after_std": 0.7884609997272491, "reward_before_mean": 0.3945481926202774, "reward_before_std": 0.6936755180358887, "reward_change_max": 0.0006226003170013428, "reward_change_mean": -0.3819384379312396, "reward_change_min": -0.6387332938611507, "reward_change_std": 0.25689230114221573, "reward_std": 0.7884610146284103, "rewards/cosine_scaled_reward": -0.12564256973564625, "rewards/format_reward": 0.6458333525806665, "step": 438 }, { "advantage_max": 1.304104894399643, "advantage_mean": 7.450580985501887e-09, "advantage_min": -0.747392974793911, "advantage_std": 0.7265794835984707, "completion_length": 2621.979202270508, "epoch": 0.5017142857142857, "grad_norm": 0.5890460014343262, "kl": 0.321441650390625, "lambda_div_used": 0.6, "learning_rate": 1.4019235263722034e-07, "loss": 0.0219, "reward": 0.08033835608512163, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.08033835608512163, "reward_after_std": 0.7265794575214386, "reward_before_mean": 0.5202324008569121, "reward_before_std": 0.7163271754980087, "reward_change_max": 0.0, "reward_change_mean": -0.43989402055740356, "reward_change_min": -0.7937644794583321, "reward_change_std": 0.3140565250068903, "reward_std": 0.7265795208513737, "rewards/cosine_scaled_reward": -0.10446714423596859, "rewards/format_reward": 0.7291666772216558, "step": 439 }, { "advantage_max": 1.4373757094144821, "advantage_mean": -1.7384688633104162e-08, "advantage_min": -0.7765031270682812, "advantage_std": 0.7870672270655632, "completion_length": 3122.6458587646484, "epoch": 0.5028571428571429, "grad_norm": 0.5948376059532166, "kl": 0.41351318359375, "lambda_div_used": 0.6, "learning_rate": 1.3890454406082956e-07, "loss": 0.0331, "reward": -0.030132897198200226, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.030132897198200226, "reward_after_std": 0.7870672382414341, "reward_before_mean": 0.3378842771053314, "reward_before_std": 0.7818763107061386, "reward_change_max": 0.0, "reward_change_mean": -0.36801718175411224, "reward_change_min": -0.6863149777054787, "reward_change_std": 0.29010715894401073, "reward_std": 0.7870672680437565, "rewards/cosine_scaled_reward": -0.1227245363406837, "rewards/format_reward": 0.5833333525806665, "step": 440 }, { "advantage_max": 1.5642970129847527, "advantage_mean": -6.053596901534064e-09, "advantage_min": -0.9813426993787289, "advantage_std": 0.8995516113936901, "completion_length": 2796.166748046875, "epoch": 0.504, "grad_norm": 0.7474466562271118, "kl": 0.31884765625, "lambda_div_used": 0.6, "learning_rate": 1.3763677169699217e-07, "loss": 0.0371, "reward": 0.38214205752592534, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.38214205752592534, "reward_after_std": 0.8995516039431095, "reward_before_mean": 0.9629887044429779, "reward_before_std": 0.913324523717165, "reward_change_max": 0.0, "reward_change_mean": -0.5808466412127018, "reward_change_min": -1.0513642355799675, "reward_change_std": 0.4244292415678501, "reward_std": 0.8995516337454319, "rewards/cosine_scaled_reward": 0.08566100802272558, "rewards/format_reward": 0.791666679084301, "step": 441 }, { "advantage_max": 1.5939322486519814, "advantage_mean": -2.7318796558262193e-08, "advantage_min": -0.7752449028193951, "advantage_std": 0.8887365721166134, "completion_length": 2893.375045776367, "epoch": 0.5051428571428571, "grad_norm": 0.7434902787208557, "kl": 0.27886962890625, "lambda_div_used": 0.6, "learning_rate": 1.3638909733514452e-07, "loss": 0.0234, "reward": 0.3462903336621821, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.3462903336621821, "reward_after_std": 0.8887365981936455, "reward_before_mean": 0.8997693937271833, "reward_before_std": 0.8497613854706287, "reward_change_max": 0.0006150901317596436, "reward_change_mean": -0.5534790195524693, "reward_change_min": -1.053729109466076, "reward_change_std": 0.3987939488142729, "reward_std": 0.8887366205453873, "rewards/cosine_scaled_reward": 0.022801332794188056, "rewards/format_reward": 0.8541666716337204, "step": 442 }, { "advantage_max": 1.4239144548773766, "advantage_mean": 7.450580596923828e-09, "advantage_min": -0.7238317169249058, "advantage_std": 0.7982291206717491, "completion_length": 3030.979217529297, "epoch": 0.5062857142857143, "grad_norm": 0.6095349788665771, "kl": 0.32720947265625, "lambda_div_used": 0.6, "learning_rate": 1.351615817851748e-07, "loss": 0.0193, "reward": 0.0006824731826782227, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.0006824731826782227, "reward_after_std": 0.7982290983200073, "reward_before_mean": 0.38761329650878906, "reward_before_std": 0.8135102391242981, "reward_change_max": 0.0008568018674850464, "reward_change_mean": -0.38693082239478827, "reward_change_min": -0.7811053469777107, "reward_change_std": 0.3120748773217201, "reward_std": 0.7982291206717491, "rewards/cosine_scaled_reward": -0.10827669268473983, "rewards/format_reward": 0.604166679084301, "step": 443 }, { "advantage_max": 1.024867869913578, "advantage_mean": -4.346171755820194e-09, "advantage_min": -0.53191352263093, "advantage_std": 0.5605506096035242, "completion_length": 2855.562545776367, "epoch": 0.5074285714285715, "grad_norm": 0.5939316749572754, "kl": 0.3170013427734375, "lambda_div_used": 0.6, "learning_rate": 1.3395428487445914e-07, "loss": 0.0085, "reward": 0.0003267568536102772, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.0003267568536102772, "reward_after_std": 0.5605506096035242, "reward_before_mean": 0.4287848509848118, "reward_before_std": 0.5058968178927898, "reward_change_max": 0.00040687620639801025, "reward_change_mean": -0.42845806665718555, "reward_change_min": -0.7199537493288517, "reward_change_std": 0.26975464215502143, "reward_std": 0.5605506226420403, "rewards/cosine_scaled_reward": -0.09810760384425521, "rewards/format_reward": 0.6250000037252903, "step": 444 }, { "advantage_max": 1.4808774963021278, "advantage_mean": -3.1044082304809706e-09, "advantage_min": -0.7681502997875214, "advantage_std": 0.8168426752090454, "completion_length": 2854.666748046875, "epoch": 0.5085714285714286, "grad_norm": 0.6682929396629333, "kl": 0.23166656494140625, "lambda_div_used": 0.6, "learning_rate": 1.3276726544494571e-07, "loss": 0.0271, "reward": 0.3290069226641208, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.3290069226641208, "reward_after_std": 0.8168426714837551, "reward_before_mean": 0.8914229711517692, "reward_before_std": 0.7797074243426323, "reward_change_max": 0.0006837695837020874, "reward_change_mean": -0.5624160580337048, "reward_change_min": -0.9781622253358364, "reward_change_std": 0.37248685862869024, "reward_std": 0.8168426789343357, "rewards/cosine_scaled_reward": 0.04987816256470978, "rewards/format_reward": 0.7916666697710752, "step": 445 }, { "advantage_max": 1.4862676709890366, "advantage_mean": -1.2417634698280722e-09, "advantage_min": -0.939953550696373, "advantage_std": 0.8497539050877094, "completion_length": 2677.9792709350586, "epoch": 0.5097142857142857, "grad_norm": 1.6928560733795166, "kl": 0.21446990966796875, "lambda_div_used": 0.6, "learning_rate": 1.316005813502869e-07, "loss": 0.069, "reward": 0.334447986446321, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.334447986446321, "reward_after_std": 0.8497539088129997, "reward_before_mean": 0.8993029966950417, "reward_before_std": 0.8538679517805576, "reward_change_max": 0.0010942071676254272, "reward_change_mean": -0.564854983240366, "reward_change_min": -0.9794363453984261, "reward_change_std": 0.40965361148118973, "reward_std": 0.8497539162635803, "rewards/cosine_scaled_reward": 0.07465148530900478, "rewards/format_reward": 0.7500000223517418, "step": 446 }, { "advantage_max": 1.5106425508856773, "advantage_mean": -7.761021714181027e-09, "advantage_min": -0.7425118647515774, "advantage_std": 0.8140601627528667, "completion_length": 2409.500030517578, "epoch": 0.5108571428571429, "grad_norm": 0.5263113379478455, "kl": 0.16021728515625, "lambda_div_used": 0.6, "learning_rate": 1.3045428945301953e-07, "loss": 0.0239, "reward": 0.3240194395184517, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.3240194395184517, "reward_after_std": 0.8140601515769958, "reward_before_mean": 0.8813347518444061, "reward_before_std": 0.7404376417398453, "reward_change_max": 0.00041799992322921753, "reward_change_mean": -0.5573153309524059, "reward_change_min": -0.9619341045618057, "reward_change_std": 0.3613274283707142, "reward_std": 0.8140601553022861, "rewards/cosine_scaled_reward": -0.007249297806993127, "rewards/format_reward": 0.8958333395421505, "step": 447 }, { "advantage_max": 1.4709734171628952, "advantage_mean": 7.450580818968433e-09, "advantage_min": -0.6773672588169575, "advantage_std": 0.7775681540369987, "completion_length": 2605.7708587646484, "epoch": 0.512, "grad_norm": 1.9863547086715698, "kl": 0.26190185546875, "lambda_div_used": 0.6, "learning_rate": 1.2932844562179352e-07, "loss": -0.0291, "reward": 0.2725476995110512, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.2725476995110512, "reward_after_std": 0.7775681465864182, "reward_before_mean": 0.8013585917651653, "reward_before_std": 0.6695942915976048, "reward_change_max": 0.0006482377648353577, "reward_change_mean": -0.528810883872211, "reward_change_min": -0.8442369252443314, "reward_change_std": 0.3180638235062361, "reward_std": 0.7775681652128696, "rewards/cosine_scaled_reward": 0.004845963791012764, "rewards/format_reward": 0.7916666716337204, "step": 448 }, { "advantage_max": 1.454092152416706, "advantage_mean": -1.738468857759301e-08, "advantage_min": -0.9398189447820187, "advantage_std": 0.833489615470171, "completion_length": 2516.812515258789, "epoch": 0.5131428571428571, "grad_norm": 0.5418146848678589, "kl": 0.190399169921875, "lambda_div_used": 0.6, "learning_rate": 1.2822310472864885e-07, "loss": 0.0099, "reward": 0.20460259914398193, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.20460259914398193, "reward_after_std": 0.833489615470171, "reward_before_mean": 0.7024632133543491, "reward_before_std": 0.8663576692342758, "reward_change_max": 0.00025576353073120117, "reward_change_mean": -0.49786060582846403, "reward_change_min": -0.8856289610266685, "reward_change_std": 0.3694376861676574, "reward_std": 0.8334896489977837, "rewards/cosine_scaled_reward": 0.01789826713502407, "rewards/format_reward": 0.6666666828095913, "step": 449 }, { "advantage_max": 1.2588047087192535, "advantage_mean": 2.1730859889323995e-09, "advantage_min": -0.662886805832386, "advantage_std": 0.6904132477939129, "completion_length": 2910.479202270508, "epoch": 0.5142857142857142, "grad_norm": 0.738971471786499, "kl": 0.3026580810546875, "lambda_div_used": 0.6, "learning_rate": 1.2713832064634125e-07, "loss": 0.0117, "reward": 0.1044841951224953, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.1044841951224953, "reward_after_std": 0.6904132552444935, "reward_before_mean": 0.5679622534662485, "reward_before_std": 0.641492698341608, "reward_change_max": 0.0002985745668411255, "reward_change_mean": -0.4634780492633581, "reward_change_min": -0.8120031580328941, "reward_change_std": 0.31523124873638153, "reward_std": 0.6904132626950741, "rewards/cosine_scaled_reward": -0.04935221001505852, "rewards/format_reward": 0.6666666734963655, "step": 450 }, { "advantage_max": 1.7307595536112785, "advantage_mean": -3.290673183942161e-08, "advantage_min": -0.9142901822924614, "advantage_std": 0.9461825527250767, "completion_length": 2641.375015258789, "epoch": 0.5154285714285715, "grad_norm": 1.169973611831665, "kl": 0.228729248046875, "lambda_div_used": 0.6, "learning_rate": 1.260741462457165e-07, "loss": 0.0487, "reward": 0.4206886999309063, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.4206886999309063, "reward_after_std": 0.9461825452744961, "reward_before_mean": 1.009432639926672, "reward_before_std": 0.8840302973985672, "reward_change_max": 0.0003242567181587219, "reward_change_mean": -0.5887439749203622, "reward_change_min": -0.9866149947047234, "reward_change_std": 0.3979658451862633, "reward_std": 0.946182556450367, "rewards/cosine_scaled_reward": 0.16096631158143282, "rewards/format_reward": 0.687500013038516, "step": 451 }, { "advantage_max": 1.4747585132718086, "advantage_mean": 3.725290076417309e-09, "advantage_min": -0.5936764031648636, "advantage_std": 0.7980528734624386, "completion_length": 3164.729248046875, "epoch": 0.5165714285714286, "grad_norm": 1.0744494199752808, "kl": 0.3144378662109375, "lambda_div_used": 0.6, "learning_rate": 1.2503063339313356e-07, "loss": 0.0404, "reward": 0.020421676337718964, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.020421676337718964, "reward_after_std": 0.798052866011858, "reward_before_mean": 0.414817335549742, "reward_before_std": 0.7678319737315178, "reward_change_max": 0.0007721483707427979, "reward_change_mean": -0.3943956736475229, "reward_change_min": -0.7912264950573444, "reward_change_std": 0.29552899673581123, "reward_std": 0.7980528771877289, "rewards/cosine_scaled_reward": -0.011341335251927376, "rewards/format_reward": 0.4375000037252903, "step": 452 }, { "advantage_max": 1.5293823331594467, "advantage_mean": -1.6142925107764938e-08, "advantage_min": -0.914829894900322, "advantage_std": 0.8372916430234909, "completion_length": 2751.4792098999023, "epoch": 0.5177142857142857, "grad_norm": 0.5472725033760071, "kl": 0.28240966796875, "lambda_div_used": 0.6, "learning_rate": 1.2400783294793668e-07, "loss": 0.0214, "reward": 0.35070702666416764, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.35070702666416764, "reward_after_std": 0.8372916504740715, "reward_before_mean": 0.9138406561687589, "reward_before_std": 0.7884054854512215, "reward_change_max": 0.0006369054317474365, "reward_change_mean": -0.5631336495280266, "reward_change_min": -0.9314819648861885, "reward_change_std": 0.3668592553585768, "reward_std": 0.8372916728258133, "rewards/cosine_scaled_reward": 0.08192032761871815, "rewards/format_reward": 0.7500000149011612, "step": 453 }, { "advantage_max": 1.4231568723917007, "advantage_mean": -1.9247334115402026e-08, "advantage_min": -0.832726463675499, "advantage_std": 0.793323565274477, "completion_length": 2578.4375610351562, "epoch": 0.5188571428571429, "grad_norm": 1.011570692062378, "kl": 0.17710113525390625, "lambda_div_used": 0.6, "learning_rate": 1.2300579475997657e-07, "loss": 0.0356, "reward": 0.23797382973134518, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.23797382973134518, "reward_after_std": 0.7933235578238964, "reward_before_mean": 0.7599361706525087, "reward_before_std": 0.7715496458113194, "reward_change_max": 0.0, "reward_change_mean": -0.5219623260200024, "reward_change_min": -0.96772700548172, "reward_change_std": 0.3597046695649624, "reward_std": 0.7933235876262188, "rewards/cosine_scaled_reward": -0.01586526818573475, "rewards/format_reward": 0.7916666828095913, "step": 454 }, { "advantage_max": 1.1031616851687431, "advantage_mean": 1.4590720687213121e-08, "advantage_min": -0.5992041826248169, "advantage_std": 0.6113407611846924, "completion_length": 3231.7709350585938, "epoch": 0.52, "grad_norm": 1.4600268602371216, "kl": 0.35595703125, "lambda_div_used": 0.6, "learning_rate": 1.220245676671809e-07, "loss": -0.0044, "reward": 0.026436822256073356, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.026436822256073356, "reward_after_std": 0.6113407723605633, "reward_before_mean": 0.46537392027676105, "reward_before_std": 0.5697146020829678, "reward_change_max": 0.0, "reward_change_mean": -0.43893709033727646, "reward_change_min": -0.7486898303031921, "reward_change_std": 0.28901602514088154, "reward_std": 0.6113407835364342, "rewards/cosine_scaled_reward": -0.12147971335798502, "rewards/format_reward": 0.7083333395421505, "step": 455 }, { "advantage_max": 1.0907672867178917, "advantage_mean": -1.1796753296433593e-08, "advantage_min": -0.6506773978471756, "advantage_std": 0.6137639144435525, "completion_length": 2944.854217529297, "epoch": 0.5211428571428571, "grad_norm": 0.48930859565734863, "kl": 0.28680419921875, "lambda_div_used": 0.6, "learning_rate": 1.2106419949317388e-07, "loss": 0.012, "reward": 0.017696384878945537, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.017696384878945537, "reward_after_std": 0.6137639237567782, "reward_before_mean": 0.44923659786581993, "reward_before_std": 0.5944715887308121, "reward_change_max": 0.000657103955745697, "reward_change_mean": -0.4315402414649725, "reward_change_min": -0.7416375353932381, "reward_change_std": 0.29969449946656823, "reward_std": 0.6137639349326491, "rewards/cosine_scaled_reward": -0.09829837083816528, "rewards/format_reward": 0.6458333488553762, "step": 456 }, { "advantage_max": 1.2821914702653885, "advantage_mean": 5.5879355587151736e-09, "advantage_min": -0.6453388333320618, "advantage_std": 0.7012712173163891, "completion_length": 2911.1875762939453, "epoch": 0.5222857142857142, "grad_norm": 0.5257925391197205, "kl": 0.2884521484375, "lambda_div_used": 0.6, "learning_rate": 1.2012473704494537e-07, "loss": 0.0062, "reward": 0.1664595603942871, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.1664595603942871, "reward_after_std": 0.7012712173163891, "reward_before_mean": 0.6591437275055796, "reward_before_std": 0.6436357796192169, "reward_change_max": 0.0, "reward_change_mean": -0.49268414825201035, "reward_change_min": -0.8301557153463364, "reward_change_std": 0.3239094000309706, "reward_std": 0.70127122849226, "rewards/cosine_scaled_reward": -0.003761494532227516, "rewards/format_reward": 0.6666666828095913, "step": 457 }, { "advantage_max": 1.6441849321126938, "advantage_mean": 9.313226079221693e-09, "advantage_min": -0.7769365385174751, "advantage_std": 0.8987438268959522, "completion_length": 2578.0208740234375, "epoch": 0.5234285714285715, "grad_norm": 1.0878689289093018, "kl": 0.279937744140625, "lambda_div_used": 0.6, "learning_rate": 1.1920622611056974e-07, "loss": 0.0544, "reward": 0.027010804347810335, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.027010804347810335, "reward_after_std": 0.8987438231706619, "reward_before_mean": 0.40003128722310066, "reward_before_std": 0.9050403535366058, "reward_change_max": 0.0013420060276985168, "reward_change_mean": -0.37302049063146114, "reward_change_min": -0.7793563231825829, "reward_change_std": 0.30434579588472843, "reward_std": 0.8987438529729843, "rewards/cosine_scaled_reward": -0.14373437548056245, "rewards/format_reward": 0.6875000186264515, "step": 458 }, { "advantage_max": 1.384127452969551, "advantage_mean": -2.4835269396561444e-08, "advantage_min": -0.697060227394104, "advantage_std": 0.7580831982195377, "completion_length": 2400.2708892822266, "epoch": 0.5245714285714286, "grad_norm": 0.5284119844436646, "kl": 0.16412353515625, "lambda_div_used": 0.6, "learning_rate": 1.1830871145697412e-07, "loss": 0.013, "reward": 0.5363429971039295, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.5363429971039295, "reward_after_std": 0.7580831833183765, "reward_before_mean": 1.2248572246171534, "reward_before_std": 0.6392851062119007, "reward_change_max": 0.0005814731121063232, "reward_change_mean": -0.688514206558466, "reward_change_min": -1.1085792183876038, "reward_change_std": 0.4158451687544584, "reward_std": 0.7580831982195377, "rewards/cosine_scaled_reward": 0.20617857947945595, "rewards/format_reward": 0.8125000055879354, "step": 459 }, { "advantage_max": 1.4096355810761452, "advantage_mean": 1.552204503818544e-09, "advantage_min": -0.7541556470096111, "advantage_std": 0.7679723687469959, "completion_length": 3067.708450317383, "epoch": 0.5257142857142857, "grad_norm": 0.5955216288566589, "kl": 0.3953094482421875, "lambda_div_used": 0.6, "learning_rate": 1.1743223682775649e-07, "loss": 0.0453, "reward": 0.04936068008100847, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.04936068008100847, "reward_after_std": 0.7679723650217056, "reward_before_mean": 0.4637445118278265, "reward_before_std": 0.7469018138945103, "reward_change_max": 0.0, "reward_change_mean": -0.41438381001353264, "reward_change_min": -0.7415559887886047, "reward_change_std": 0.3010368328541517, "reward_std": 0.7679724022746086, "rewards/cosine_scaled_reward": -0.12229442107491195, "rewards/format_reward": 0.7083333395421505, "step": 460 }, { "advantage_max": 1.6185405403375626, "advantage_mean": -1.3659398279131096e-08, "advantage_min": -0.8021063134074211, "advantage_std": 0.8745506145060062, "completion_length": 3010.8750915527344, "epoch": 0.5268571428571428, "grad_norm": 0.9823641777038574, "kl": 0.326171875, "lambda_div_used": 0.6, "learning_rate": 1.1657684494105386e-07, "loss": 0.0702, "reward": 0.17612256668508053, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.17612256668508053, "reward_after_std": 0.8745506182312965, "reward_before_mean": 0.6361170820891857, "reward_before_std": 0.8338725045323372, "reward_change_max": 0.0004089474678039551, "reward_change_mean": -0.4599945154041052, "reward_change_min": -0.8391226977109909, "reward_change_std": 0.3295242711901665, "reward_std": 0.8745506331324577, "rewards/cosine_scaled_reward": -0.02569146826863289, "rewards/format_reward": 0.6875000111758709, "step": 461 }, { "advantage_max": 1.3718049079179764, "advantage_mean": 3.725290520506519e-09, "advantage_min": -0.573211383074522, "advantage_std": 0.7311573587357998, "completion_length": 2979.958396911621, "epoch": 0.528, "grad_norm": 0.5489564538002014, "kl": 0.288818359375, "lambda_div_used": 0.6, "learning_rate": 1.1574257748745986e-07, "loss": 0.0419, "reward": -0.05020551884081215, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.05020551884081215, "reward_after_std": 0.7311573512852192, "reward_before_mean": 0.31298802606761456, "reward_before_std": 0.6865883357822895, "reward_change_max": 0.00015259534120559692, "reward_change_mean": -0.36319353617727757, "reward_change_min": -0.7135965973138809, "reward_change_std": 0.2655666396021843, "reward_std": 0.7311573661863804, "rewards/cosine_scaled_reward": -0.19767267780844122, "rewards/format_reward": 0.708333345130086, "step": 462 }, { "advantage_max": 1.2812325581908226, "advantage_mean": 3.1044081749698194e-09, "advantage_min": -0.7424813508987427, "advantage_std": 0.7148092705756426, "completion_length": 3094.0208740234375, "epoch": 0.5291428571428571, "grad_norm": 0.6410408020019531, "kl": 0.373321533203125, "lambda_div_used": 0.6, "learning_rate": 1.1492947512799328e-07, "loss": 0.0467, "reward": 0.0862201638519764, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.0862201638519764, "reward_after_std": 0.7148092705756426, "reward_before_mean": 0.5360564319416881, "reward_before_std": 0.6975500099360943, "reward_change_max": 0.0016349032521247864, "reward_change_mean": -0.44983627228066325, "reward_change_min": -0.7784820459783077, "reward_change_std": 0.31272281985729933, "reward_std": 0.7148093041032553, "rewards/cosine_scaled_reward": -0.04447178915143013, "rewards/format_reward": 0.6250000093132257, "step": 463 }, { "advantage_max": 1.2580845430493355, "advantage_mean": -9.313227411489322e-10, "advantage_min": -0.46518684923648834, "advantage_std": 0.6540076658129692, "completion_length": 2200.0000228881836, "epoch": 0.5302857142857142, "grad_norm": 0.8608336448669434, "kl": 0.327178955078125, "lambda_div_used": 0.6, "learning_rate": 1.1413757749211602e-07, "loss": 0.0051, "reward": 0.27756172651425004, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.27756172651425004, "reward_after_std": 0.654007650911808, "reward_before_mean": 0.832300890237093, "reward_before_std": 0.48632822558283806, "reward_change_max": 0.0, "reward_change_mean": -0.5547391846776009, "reward_change_min": -0.855882078409195, "reward_change_std": 0.31370352767407894, "reward_std": 0.6540076583623886, "rewards/cosine_scaled_reward": -0.0005162106826901436, "rewards/format_reward": 0.8333333507180214, "step": 464 }, { "advantage_max": 1.569391205906868, "advantage_mean": -1.2107193803068128e-08, "advantage_min": -0.8988099247217178, "advantage_std": 0.881404098123312, "completion_length": 3097.979248046875, "epoch": 0.5314285714285715, "grad_norm": 0.6101652383804321, "kl": 0.368743896484375, "lambda_div_used": 0.6, "learning_rate": 1.1336692317580158e-07, "loss": 0.023, "reward": 0.03458056226372719, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.03458056226372719, "reward_after_std": 0.8814041279256344, "reward_before_mean": 0.42546552093699574, "reward_before_std": 0.9255600795149803, "reward_change_max": 0.0013052746653556824, "reward_change_mean": -0.39088496938347816, "reward_change_min": -0.7886900715529919, "reward_change_std": 0.3330534026026726, "reward_std": 0.8814041391015053, "rewards/cosine_scaled_reward": -0.08935058303177357, "rewards/format_reward": 0.604166679084301, "step": 465 }, { "advantage_max": 1.9037173837423325, "advantage_mean": -3.7252904094842165e-09, "advantage_min": -0.8234491348266602, "advantage_std": 1.006118580698967, "completion_length": 3100.5833892822266, "epoch": 0.5325714285714286, "grad_norm": 1.2542622089385986, "kl": 0.33632659912109375, "lambda_div_used": 0.6, "learning_rate": 1.1261754973965422e-07, "loss": 0.0639, "reward": 0.28048246819525957, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.28048246819525957, "reward_after_std": 1.006118580698967, "reward_before_mean": 0.7699800729751587, "reward_before_std": 0.9303984567523003, "reward_change_max": 0.0, "reward_change_mean": -0.4894975833594799, "reward_change_min": -0.9491491317749023, "reward_change_std": 0.35173042491078377, "reward_std": 1.006118580698967, "rewards/cosine_scaled_reward": 0.10374002461321652, "rewards/format_reward": 0.562500013038516, "step": 466 }, { "advantage_max": 1.1005533039569855, "advantage_mean": 1.0865429111994729e-09, "advantage_min": -0.6562899611890316, "advantage_std": 0.6144971549510956, "completion_length": 3035.5833892822266, "epoch": 0.5337142857142857, "grad_norm": 0.6637769341468811, "kl": 0.325714111328125, "lambda_div_used": 0.6, "learning_rate": 1.1188949370707787e-07, "loss": 0.0167, "reward": 0.05374788446351886, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.05374788446351886, "reward_after_std": 0.6144971549510956, "reward_before_mean": 0.5047153122723103, "reward_before_std": 0.572929710149765, "reward_change_max": 0.0007006600499153137, "reward_change_mean": -0.4509674310684204, "reward_change_min": -0.7618739381432533, "reward_change_std": 0.29695762135088444, "reward_std": 0.6144971884787083, "rewards/cosine_scaled_reward": -0.14347568154335022, "rewards/format_reward": 0.7916666828095913, "step": 467 }, { "advantage_max": 1.8903379365801811, "advantage_mean": -2.0178656356950597e-08, "advantage_min": -0.9778851941227913, "advantage_std": 1.0337693467736244, "completion_length": 3057.291717529297, "epoch": 0.5348571428571428, "grad_norm": 1.200481653213501, "kl": 0.3438720703125, "lambda_div_used": 0.6, "learning_rate": 1.1118279056249653e-07, "loss": 0.0748, "reward": 0.42284363880753517, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.42284363880753517, "reward_after_std": 1.0337693840265274, "reward_before_mean": 0.9913366120308638, "reward_before_std": 1.0011842101812363, "reward_change_max": 0.0007850900292396545, "reward_change_mean": -0.5684929899871349, "reward_change_min": -1.0620117411017418, "reward_change_std": 0.40233112312853336, "reward_std": 1.033769391477108, "rewards/cosine_scaled_reward": 0.08941829390823841, "rewards/format_reward": 0.8125000186264515, "step": 468 }, { "advantage_max": 1.535146877169609, "advantage_mean": -1.4901161637936866e-08, "advantage_min": -0.8611436411738396, "advantage_std": 0.8365602642297745, "completion_length": 2808.9583892822266, "epoch": 0.536, "grad_norm": 0.8671948313713074, "kl": 0.34912109375, "lambda_div_used": 0.6, "learning_rate": 1.1049747474962444e-07, "loss": 0.0171, "reward": 0.35874230810441077, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.35874230810441077, "reward_after_std": 0.8365602716803551, "reward_before_mean": 0.9347008776385337, "reward_before_std": 0.7686922624707222, "reward_change_max": 0.0, "reward_change_mean": -0.5759586170315742, "reward_change_min": -0.9069937616586685, "reward_change_std": 0.3752599321305752, "reward_std": 0.8365602791309357, "rewards/cosine_scaled_reward": 0.05068379477597773, "rewards/format_reward": 0.8333333507180214, "step": 469 }, { "advantage_max": 1.4765567556023598, "advantage_mean": -1.4280279347911318e-08, "advantage_min": -0.8885560110211372, "advantage_std": 0.8526263982057571, "completion_length": 3234.416748046875, "epoch": 0.5371428571428571, "grad_norm": 0.4323049783706665, "kl": 0.38311767578125, "lambda_div_used": 0.6, "learning_rate": 1.0983357966978745e-07, "loss": 0.0266, "reward": 0.08719077915884554, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.08719077915884554, "reward_after_std": 0.8526264131069183, "reward_before_mean": 0.5135673470795155, "reward_before_std": 0.9073382653295994, "reward_change_max": 0.0005387812852859497, "reward_change_mean": -0.42637658305466175, "reward_change_min": -0.8534897528588772, "reward_change_std": 0.3533334955573082, "reward_std": 0.8526264280080795, "rewards/cosine_scaled_reward": -0.05571634043008089, "rewards/format_reward": 0.6250000204890966, "step": 470 }, { "advantage_max": 1.6796835511922836, "advantage_mean": -6.208817349140361e-09, "advantage_min": -0.8010094054043293, "advantage_std": 0.893382478505373, "completion_length": 3111.104248046875, "epoch": 0.5382857142857143, "grad_norm": 0.856365978717804, "kl": 0.4088134765625, "lambda_div_used": 0.6, "learning_rate": 1.0919113768029517e-07, "loss": 0.0227, "reward": 0.2544027629774064, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.2544027629774064, "reward_after_std": 0.8933824747800827, "reward_before_mean": 0.750761479139328, "reward_before_std": 0.8166644982993603, "reward_change_max": 0.00011660903692245483, "reward_change_mean": -0.4963587149977684, "reward_change_min": -0.8781631253659725, "reward_change_std": 0.3329938519746065, "reward_std": 0.8933825083076954, "rewards/cosine_scaled_reward": -0.020452602300792933, "rewards/format_reward": 0.7916666753590107, "step": 471 }, { "advantage_max": 1.6807861477136612, "advantage_mean": -4.346172144398253e-09, "advantage_min": -0.7541506327688694, "advantage_std": 0.896699994802475, "completion_length": 2950.666748046875, "epoch": 0.5394285714285715, "grad_norm": 0.8270246386528015, "kl": 0.33209228515625, "lambda_div_used": 0.6, "learning_rate": 1.0857018009286381e-07, "loss": 0.037, "reward": 0.09066922077909112, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.09066922077909112, "reward_after_std": 0.896699994802475, "reward_before_mean": 0.4971227226778865, "reward_before_std": 0.8639278411865234, "reward_change_max": 0.00027048587799072266, "reward_change_mean": -0.40645348466932774, "reward_change_min": -0.7924029342830181, "reward_change_std": 0.298714161850512, "reward_std": 0.8967000283300877, "rewards/cosine_scaled_reward": -0.11602198891341686, "rewards/format_reward": 0.7291666772216558, "step": 472 }, { "advantage_max": 1.2666622251272202, "advantage_mean": -3.1044085080367267e-09, "advantage_min": -0.7336488291621208, "advantage_std": 0.7046432234346867, "completion_length": 2937.6458892822266, "epoch": 0.5405714285714286, "grad_norm": 0.6345215439796448, "kl": 0.26678466796875, "lambda_div_used": 0.6, "learning_rate": 1.0797073717209013e-07, "loss": -0.0034, "reward": 0.24014693347271532, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.24014693347271532, "reward_after_std": 0.7046432085335255, "reward_before_mean": 0.7769346954301, "reward_before_std": 0.6478455290198326, "reward_change_max": 0.00018671154975891113, "reward_change_mean": -0.5367877427488565, "reward_change_min": -0.9317773431539536, "reward_change_std": 0.3534272387623787, "reward_std": 0.7046432197093964, "rewards/cosine_scaled_reward": 0.013467340730130672, "rewards/format_reward": 0.7500000167638063, "step": 473 }, { "advantage_max": 1.8945399820804596, "advantage_mean": 1.2417634476236117e-08, "advantage_min": -0.7374499104917049, "advantage_std": 0.9751530736684799, "completion_length": 2611.479217529297, "epoch": 0.5417142857142857, "grad_norm": 0.5259870886802673, "kl": 0.29508209228515625, "lambda_div_used": 0.6, "learning_rate": 1.0739283813397639e-07, "loss": 0.0486, "reward": 0.3512391453841701, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.3512391453841701, "reward_after_std": 0.9751530811190605, "reward_before_mean": 0.8826748724095523, "reward_before_std": 0.8347831852734089, "reward_change_max": 0.0, "reward_change_mean": -0.5314356926828623, "reward_change_min": -0.868693646043539, "reward_change_std": 0.3366575762629509, "reward_std": 0.9751530885696411, "rewards/cosine_scaled_reward": 0.07675410318188369, "rewards/format_reward": 0.7291666772216558, "step": 474 }, { "advantage_max": 1.640965461730957, "advantage_mean": 1.986821529520455e-08, "advantage_min": -0.6341977827250957, "advantage_std": 0.8619020059704781, "completion_length": 2496.4167098999023, "epoch": 0.5428571428571428, "grad_norm": 0.5010621547698975, "kl": 0.47027587890625, "lambda_div_used": 0.6, "learning_rate": 1.068365111445064e-07, "loss": 0.0417, "reward": 0.2569463880499825, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.2569463880499825, "reward_after_std": 0.8619019910693169, "reward_before_mean": 0.7555128782987595, "reward_before_std": 0.7467895969748497, "reward_change_max": 0.0006568357348442078, "reward_change_mean": -0.49856647476553917, "reward_change_min": -0.8757809400558472, "reward_change_std": 0.3343859352171421, "reward_std": 0.8619020283222198, "rewards/cosine_scaled_reward": 0.0965064475312829, "rewards/format_reward": 0.5625000018626451, "step": 475 }, { "advantage_max": 1.956729143857956, "advantage_mean": -9.934107758624577e-09, "advantage_min": -0.9143726825714111, "advantage_std": 1.0552341118454933, "completion_length": 3005.5625915527344, "epoch": 0.544, "grad_norm": 2.4432625770568848, "kl": 0.372802734375, "lambda_div_used": 0.6, "learning_rate": 1.063017833182728e-07, "loss": 0.0981, "reward": 0.24209812004119158, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.24209812004119158, "reward_after_std": 1.0552341118454933, "reward_before_mean": 0.7057913790922612, "reward_before_std": 1.0380639657378197, "reward_change_max": 0.0, "reward_change_mean": -0.46369326300919056, "reward_change_min": -0.9631114266812801, "reward_change_std": 0.36324442923069, "reward_std": 1.055234156548977, "rewards/cosine_scaled_reward": -0.032520990062039346, "rewards/format_reward": 0.7708333507180214, "step": 476 }, { "advantage_max": 1.6627508848905563, "advantage_mean": 5.587935947293232e-09, "advantage_min": -0.7512878328561783, "advantage_std": 0.8839316442608833, "completion_length": 2252.5000915527344, "epoch": 0.5451428571428572, "grad_norm": 1.535606861114502, "kl": 0.16825103759765625, "lambda_div_used": 0.6, "learning_rate": 1.0578868071715544e-07, "loss": 0.0511, "reward": 0.6627893559634686, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.6627893559634686, "reward_after_std": 0.8839316368103027, "reward_before_mean": 1.3912963923066854, "reward_before_std": 0.7169366031885147, "reward_change_max": 0.0, "reward_change_mean": -0.7285070493817329, "reward_change_min": -1.05146986246109, "reward_change_std": 0.4186026845127344, "reward_std": 0.8839316628873348, "rewards/cosine_scaled_reward": 0.22689818311482668, "rewards/format_reward": 0.9375000074505806, "step": 477 }, { "advantage_max": 1.5479968041181564, "advantage_mean": -9.934107314535368e-09, "advantage_min": -0.6975952759385109, "advantage_std": 0.8103221245110035, "completion_length": 2727.8125762939453, "epoch": 0.5462857142857143, "grad_norm": 0.9718415141105652, "kl": 0.2834930419921875, "lambda_div_used": 0.6, "learning_rate": 1.0529722834905125e-07, "loss": 0.0349, "reward": 0.01596967503428459, "reward_advantage_correlation": 0.9999999999999997, "reward_after_mean": 0.01596967503428459, "reward_after_std": 0.8103221245110035, "reward_before_mean": 0.39516052044928074, "reward_before_std": 0.7432158552110195, "reward_change_max": 0.0012557506561279297, "reward_change_mean": -0.37919086311012506, "reward_change_min": -0.6728931628167629, "reward_change_std": 0.2619459554553032, "reward_std": 0.8103221319615841, "rewards/cosine_scaled_reward": -0.0940864197909832, "rewards/format_reward": 0.5833333525806665, "step": 478 }, { "advantage_max": 1.5282156020402908, "advantage_mean": -7.45058065243498e-09, "advantage_min": -0.6081922184675932, "advantage_std": 0.7922434285283089, "completion_length": 2916.1250610351562, "epoch": 0.5474285714285714, "grad_norm": 0.396575003862381, "kl": 0.3342437744140625, "lambda_div_used": 0.6, "learning_rate": 1.0482745016665526e-07, "loss": 0.0481, "reward": 0.1486353098298423, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.1486353098298423, "reward_after_std": 0.7922434583306313, "reward_before_mean": 0.6048104707151651, "reward_before_std": 0.6755081471055746, "reward_change_max": 0.0, "reward_change_mean": -0.45617516711354256, "reward_change_min": -0.7893848493695259, "reward_change_std": 0.2866050563752651, "reward_std": 0.7922434769570827, "rewards/cosine_scaled_reward": -0.1350947736063972, "rewards/format_reward": 0.8750000149011612, "step": 479 }, { "advantage_max": 1.4838635623455048, "advantage_mean": -1.0554989604560916e-08, "advantage_min": -0.6728080175817013, "advantage_std": 0.8044894188642502, "completion_length": 2724.4792404174805, "epoch": 0.5485714285714286, "grad_norm": 0.9163870215415955, "kl": 0.307281494140625, "lambda_div_used": 0.6, "learning_rate": 1.0437936906629334e-07, "loss": 0.0065, "reward": 0.14659792685415596, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.14659792685415596, "reward_after_std": 0.8044894188642502, "reward_before_mean": 0.6072377488017082, "reward_before_std": 0.7544484827667475, "reward_change_max": 0.0, "reward_change_mean": -0.46063981391489506, "reward_change_min": -0.8319177739322186, "reward_change_std": 0.3214505556970835, "reward_std": 0.8044894523918629, "rewards/cosine_scaled_reward": -0.04013113956898451, "rewards/format_reward": 0.6875000055879354, "step": 480 }, { "advantage_max": 1.359544724225998, "advantage_mean": 1.2728075482471013e-08, "advantage_min": -0.5560879930853844, "advantage_std": 0.7110361345112324, "completion_length": 3155.666778564453, "epoch": 0.5497142857142857, "grad_norm": 0.54989093542099, "kl": 0.33892822265625, "lambda_div_used": 0.6, "learning_rate": 1.0395300688680625e-07, "loss": 0.0236, "reward": 0.14918394200503826, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.14918394200503826, "reward_after_std": 0.7110361270606518, "reward_before_mean": 0.6284264158457518, "reward_before_std": 0.6068124901503325, "reward_change_max": 0.0010639280080795288, "reward_change_mean": -0.4792424812912941, "reward_change_min": -0.8376931995153427, "reward_change_std": 0.2971576862037182, "reward_std": 0.7110361494123936, "rewards/cosine_scaled_reward": -0.11287012998946011, "rewards/format_reward": 0.8541666828095913, "step": 481 }, { "advantage_max": 1.7242258936166763, "advantage_mean": -2.6697914601303552e-08, "advantage_min": -0.9280278235673904, "advantage_std": 0.9408847987651825, "completion_length": 2702.333351135254, "epoch": 0.5508571428571428, "grad_norm": 0.807201623916626, "kl": 0.326751708984375, "lambda_div_used": 0.6, "learning_rate": 1.0354838440848501e-07, "loss": 0.0343, "reward": 0.3970525776967406, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.3970525776967406, "reward_after_std": 0.9408847987651825, "reward_before_mean": 0.9709220081567764, "reward_before_std": 0.8880007974803448, "reward_change_max": 0.0015290752053260803, "reward_change_mean": -0.5738694509491324, "reward_change_min": -0.9650955460965633, "reward_change_std": 0.3934989022091031, "reward_std": 0.9408848136663437, "rewards/cosine_scaled_reward": 0.13129432266578078, "rewards/format_reward": 0.7083333376795053, "step": 482 }, { "advantage_max": 1.142264649271965, "advantage_mean": -1.0554989049449404e-08, "advantage_min": -0.6997229307889938, "advantage_std": 0.639618307352066, "completion_length": 3028.104232788086, "epoch": 0.552, "grad_norm": 0.48145022988319397, "kl": 0.388519287109375, "lambda_div_used": 0.6, "learning_rate": 1.0316552135205837e-07, "loss": 0.0323, "reward": 0.07725562807172537, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.07725562807172537, "reward_after_std": 0.6396183036267757, "reward_before_mean": 0.53841517935507, "reward_before_std": 0.6105475910007954, "reward_change_max": 0.0, "reward_change_mean": -0.4611595496535301, "reward_change_min": -0.7524649910628796, "reward_change_std": 0.3050990607589483, "reward_std": 0.639618307352066, "rewards/cosine_scaled_reward": -0.10579242091625929, "rewards/format_reward": 0.7500000111758709, "step": 483 }, { "advantage_max": 1.4299253299832344, "advantage_mean": -3.725290520506519e-09, "advantage_min": -0.8638685494661331, "advantage_std": 0.8112721741199493, "completion_length": 2579.7708892822266, "epoch": 0.5531428571428572, "grad_norm": 1.489706039428711, "kl": 0.2846221923828125, "lambda_div_used": 0.6, "learning_rate": 1.0280443637773163e-07, "loss": 0.0702, "reward": 0.3688160046003759, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.3688160046003759, "reward_after_std": 0.8112721666693687, "reward_before_mean": 0.9602692160988227, "reward_before_std": 0.7896840088069439, "reward_change_max": 0.0, "reward_change_mean": -0.5914532169699669, "reward_change_min": -1.0330743491649628, "reward_change_std": 0.4153981562703848, "reward_std": 0.8112721815705299, "rewards/cosine_scaled_reward": 0.09471793798729777, "rewards/format_reward": 0.7708333469927311, "step": 484 }, { "advantage_max": 1.6329149454832077, "advantage_mean": -1.8005569923928988e-08, "advantage_min": -0.8053278736770153, "advantage_std": 0.8814475685358047, "completion_length": 2728.9375610351562, "epoch": 0.5542857142857143, "grad_norm": 0.6401396989822388, "kl": 0.4595947265625, "lambda_div_used": 0.6, "learning_rate": 1.0246514708427701e-07, "loss": 0.0341, "reward": 0.18104042625054717, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.18104042625054717, "reward_after_std": 0.8814475610852242, "reward_before_mean": 0.6409261487424374, "reward_before_std": 0.8435741513967514, "reward_change_max": 0.0005346983671188354, "reward_change_mean": -0.45988572016358376, "reward_change_min": -0.8481529578566551, "reward_change_std": 0.3186565674841404, "reward_std": 0.8814475685358047, "rewards/cosine_scaled_reward": -0.07537028007209301, "rewards/format_reward": 0.7916666939854622, "step": 485 }, { "advantage_max": 1.2459972277283669, "advantage_mean": -1.4280279680978225e-08, "advantage_min": -0.5129361264407635, "advantage_std": 0.6458823718130589, "completion_length": 2616.5833740234375, "epoch": 0.5554285714285714, "grad_norm": 0.7087405323982239, "kl": 0.3177337646484375, "lambda_div_used": 0.6, "learning_rate": 1.0214767000817596e-07, "loss": 0.0197, "reward": 0.12188614474143833, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.12188614474143833, "reward_after_std": 0.64588238671422, "reward_before_mean": 0.5966749314684421, "reward_before_std": 0.502772644162178, "reward_change_max": 0.0, "reward_change_mean": -0.47478877753019333, "reward_change_min": -0.7171571180224419, "reward_change_std": 0.27346576750278473, "reward_std": 0.6458823904395103, "rewards/cosine_scaled_reward": -0.1287458804436028, "rewards/format_reward": 0.8541666865348816, "step": 486 }, { "advantage_max": 1.5544669702649117, "advantage_mean": 3.7252904094842165e-09, "advantage_min": -0.5799307525157928, "advantage_std": 0.7964937537908554, "completion_length": 2185.62504196167, "epoch": 0.5565714285714286, "grad_norm": 0.7838549613952637, "kl": 0.2025604248046875, "lambda_div_used": 0.6, "learning_rate": 1.0185202062281336e-07, "loss": -0.007, "reward": 0.4715829244814813, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.4715829244814813, "reward_after_std": 0.7964937686920166, "reward_before_mean": 1.106121625751257, "reward_before_std": 0.5940053351223469, "reward_change_max": 0.0, "reward_change_mean": -0.6345386989414692, "reward_change_min": -0.960016280412674, "reward_change_std": 0.3513956777751446, "reward_std": 0.7964937686920166, "rewards/cosine_scaled_reward": 0.07389411079930142, "rewards/format_reward": 0.9583333432674408, "step": 487 }, { "advantage_max": 1.1480357646942139, "advantage_mean": 8.692344288796505e-09, "advantage_min": -0.5942578949034214, "advantage_std": 0.636834591627121, "completion_length": 2252.604179382324, "epoch": 0.5577142857142857, "grad_norm": 0.34447166323661804, "kl": 0.24651336669921875, "lambda_div_used": 0.6, "learning_rate": 1.0157821333772304e-07, "loss": 0.0124, "reward": 0.09369122982025146, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.09369122982025146, "reward_after_std": 0.6368345841765404, "reward_before_mean": 0.5591697334311903, "reward_before_std": 0.591731121763587, "reward_change_max": 0.0017108246684074402, "reward_change_mean": -0.46547851897776127, "reward_change_min": -0.8035316653549671, "reward_change_std": 0.3129301369190216, "reward_std": 0.636834591627121, "rewards/cosine_scaled_reward": -0.0745818018913269, "rewards/format_reward": 0.7083333469927311, "step": 488 }, { "advantage_max": 1.1497306674718857, "advantage_mean": 3.4148495420271985e-09, "advantage_min": -0.536426167935133, "advantage_std": 0.6116931214928627, "completion_length": 3402.8541870117188, "epoch": 0.5588571428571428, "grad_norm": 1.1072264909744263, "kl": 0.534423828125, "lambda_div_used": 0.6, "learning_rate": 1.013262614978859e-07, "loss": 0.0316, "reward": -0.22755743563175201, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.22755743563175201, "reward_after_std": 0.6116931177675724, "reward_before_mean": 0.061076946556568146, "reward_before_std": 0.5797216258943081, "reward_change_max": 0.00025550276041030884, "reward_change_mean": -0.2886343817226589, "reward_change_min": -0.5480538904666901, "reward_change_std": 0.20898331236094236, "reward_std": 0.6116931326687336, "rewards/cosine_scaled_reward": -0.24029485508799553, "rewards/format_reward": 0.5416666828095913, "step": 489 }, { "advantage_max": 1.19098761677742, "advantage_mean": -9.3132264122886e-09, "advantage_min": -0.5116630420088768, "advantage_std": 0.6245383583009243, "completion_length": 2225.3334045410156, "epoch": 0.56, "grad_norm": 0.4310756027698517, "kl": 0.24271392822265625, "lambda_div_used": 0.6, "learning_rate": 1.0109617738307911e-07, "loss": 0.0228, "reward": 0.11366786062717438, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.11366786062717438, "reward_after_std": 0.624538354575634, "reward_before_mean": 0.5907516656443477, "reward_before_std": 0.4991147108376026, "reward_change_max": 0.0008979067206382751, "reward_change_mean": -0.47708383947610855, "reward_change_min": -0.7438569702208042, "reward_change_std": 0.27903275191783905, "reward_std": 0.6245383620262146, "rewards/cosine_scaled_reward": -0.1421241695061326, "rewards/format_reward": 0.8750000111758709, "step": 490 }, { "advantage_max": 1.4718172997236252, "advantage_mean": -1.1796753074388988e-08, "advantage_min": -0.6949670314788818, "advantage_std": 0.7889797687530518, "completion_length": 2863.437545776367, "epoch": 0.5611428571428572, "grad_norm": 0.5122021436691284, "kl": 0.44110107421875, "lambda_div_used": 0.6, "learning_rate": 1.0088797220727779e-07, "loss": 0.0421, "reward": 0.45797410421073437, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.45797410421073437, "reward_after_std": 0.7889797762036324, "reward_before_mean": 1.0883095804601908, "reward_before_std": 0.6660438068211079, "reward_change_max": 0.0016724318265914917, "reward_change_mean": -0.6303354352712631, "reward_change_min": -1.0100024230778217, "reward_change_std": 0.382383581250906, "reward_std": 0.7889798246324062, "rewards/cosine_scaled_reward": 0.1899880999699235, "rewards/format_reward": 0.7083333395421505, "step": 491 }, { "advantage_max": 1.319904811680317, "advantage_mean": -1.80055704790405e-08, "advantage_min": -0.8290270902216434, "advantage_std": 0.7442688755691051, "completion_length": 2714.7917404174805, "epoch": 0.5622857142857143, "grad_norm": 0.6903409361839294, "kl": 0.3553466796875, "lambda_div_used": 0.6, "learning_rate": 1.0070165611810855e-07, "loss": 0.0379, "reward": 0.3523098239675164, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.3523098239675164, "reward_after_std": 0.7442688718438148, "reward_before_mean": 0.9426992423832417, "reward_before_std": 0.6989971324801445, "reward_change_max": 0.0, "reward_change_mean": -0.5903894305229187, "reward_change_min": -0.9812662154436111, "reward_change_std": 0.38451446406543255, "reward_std": 0.7442688792943954, "rewards/cosine_scaled_reward": 0.04426628723740578, "rewards/format_reward": 0.854166679084301, "step": 492 }, { "advantage_max": 1.611944667994976, "advantage_mean": -4.221995775210985e-08, "advantage_min": -0.8087064102292061, "advantage_std": 0.8931445516645908, "completion_length": 2369.81258392334, "epoch": 0.5634285714285714, "grad_norm": 0.427365779876709, "kl": 0.24420928955078125, "lambda_div_used": 0.6, "learning_rate": 1.005372381963547e-07, "loss": 0.0232, "reward": 0.5566324144601822, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.5566324144601822, "reward_after_std": 0.8931445516645908, "reward_before_mean": 1.229935996234417, "reward_before_std": 0.8107591010630131, "reward_change_max": 0.0006113573908805847, "reward_change_mean": -0.6733036190271378, "reward_change_min": -1.150217853486538, "reward_change_std": 0.44052213057875633, "reward_std": 0.8931446000933647, "rewards/cosine_scaled_reward": 0.14621799066662788, "rewards/format_reward": 0.9375000074505806, "step": 493 }, { "advantage_max": 2.0662256479263306, "advantage_mean": -1.940255400789681e-08, "advantage_min": -1.025919608771801, "advantage_std": 1.1447409093379974, "completion_length": 2289.3125610351562, "epoch": 0.5645714285714286, "grad_norm": 1.5831674337387085, "kl": 0.2780609130859375, "lambda_div_used": 0.6, "learning_rate": 1.0039472645551372e-07, "loss": 0.0502, "reward": 0.47046585264615715, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.47046585264615715, "reward_after_std": 1.1447409093379974, "reward_before_mean": 1.0404850542545319, "reward_before_std": 1.1501704677939415, "reward_change_max": 0.000872880220413208, "reward_change_mean": -0.5700191967189312, "reward_change_min": -1.2041919827461243, "reward_change_std": 0.44489174522459507, "reward_std": 1.1447409093379974, "rewards/cosine_scaled_reward": 0.08274251967668533, "rewards/format_reward": 0.8750000223517418, "step": 494 }, { "advantage_max": 1.207513116300106, "advantage_mean": -2.017865596837254e-08, "advantage_min": -0.6101922206580639, "advantage_std": 0.6588433645665646, "completion_length": 2922.6458740234375, "epoch": 0.5657142857142857, "grad_norm": 0.5391934514045715, "kl": 0.37164306640625, "lambda_div_used": 0.6, "learning_rate": 1.002741278414069e-07, "loss": 0.0271, "reward": 0.2596977346111089, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.2596977346111089, "reward_after_std": 0.6588433682918549, "reward_before_mean": 0.8155031725764275, "reward_before_std": 0.55685312487185, "reward_change_max": 0.0, "reward_change_mean": -0.5558054614812136, "reward_change_min": -0.8851585574448109, "reward_change_std": 0.3384298738092184, "reward_std": 0.6588433757424355, "rewards/cosine_scaled_reward": 0.0015015807002782822, "rewards/format_reward": 0.8125000111758709, "step": 495 }, { "advantage_max": 1.452590487897396, "advantage_mean": -4.8428774435116395e-08, "advantage_min": -0.6632073484361172, "advantage_std": 0.7584183663129807, "completion_length": 2523.5833587646484, "epoch": 0.5668571428571428, "grad_norm": 0.47810283303260803, "kl": 0.3327484130859375, "lambda_div_used": 0.6, "learning_rate": 1.0017544823184055e-07, "loss": 0.0208, "reward": 0.4266379442997277, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.4266379442997277, "reward_after_std": 0.7584183737635612, "reward_before_mean": 1.0474421493709087, "reward_before_std": 0.599539153277874, "reward_change_max": 3.6485493183135986e-05, "reward_change_mean": -0.6208042334765196, "reward_change_min": -0.9354967325925827, "reward_change_std": 0.35683672688901424, "reward_std": 0.7584183923900127, "rewards/cosine_scaled_reward": 0.13830439560115337, "rewards/format_reward": 0.7708333414047956, "step": 496 }, { "advantage_max": 1.416138507425785, "advantage_mean": -8.6923440667519e-09, "advantage_min": -0.8415560573339462, "advantage_std": 0.7852711267769337, "completion_length": 2443.604232788086, "epoch": 0.568, "grad_norm": 0.6396067142486572, "kl": 0.198028564453125, "lambda_div_used": 0.6, "learning_rate": 1.0009869243631952e-07, "loss": 0.0316, "reward": 0.5844808593392372, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.5844808593392372, "reward_after_std": 0.7852711230516434, "reward_before_mean": 1.298376940190792, "reward_before_std": 0.6900318209081888, "reward_change_max": 0.0, "reward_change_mean": -0.7138960808515549, "reward_change_min": -1.110473420470953, "reward_change_std": 0.4316666442900896, "reward_std": 0.7852711528539658, "rewards/cosine_scaled_reward": 0.19085513520985842, "rewards/format_reward": 0.9166666716337204, "step": 497 }, { "advantage_max": 1.8146474957466125, "advantage_mean": -2.1109979486677588e-08, "advantage_min": -0.8280821889638901, "advantage_std": 0.962591864168644, "completion_length": 2807.1459350585938, "epoch": 0.5691428571428572, "grad_norm": 0.9842413663864136, "kl": 0.35626220703125, "lambda_div_used": 0.6, "learning_rate": 1.000438641958131e-07, "loss": 0.0414, "reward": 0.35881377733312547, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.35881377733312547, "reward_after_std": 0.9625918716192245, "reward_before_mean": 0.8991924710571766, "reward_before_std": 0.8837512731552124, "reward_change_max": 0.0004179328680038452, "reward_change_mean": -0.5403786823153496, "reward_change_min": -0.9675283432006836, "reward_change_std": 0.35297749750316143, "reward_std": 0.9625919163227081, "rewards/cosine_scaled_reward": 0.012096216436475515, "rewards/format_reward": 0.8750000223517418, "step": 498 }, { "advantage_max": 1.6346061080694199, "advantage_mean": -1.6142925329809543e-08, "advantage_min": -0.9418043605983257, "advantage_std": 0.9355646707117558, "completion_length": 2936.437545776367, "epoch": 0.5702857142857143, "grad_norm": 0.9127795100212097, "kl": 0.388641357421875, "lambda_div_used": 0.6, "learning_rate": 1.0001096618257236e-07, "loss": 0.0482, "reward": 0.32707899808883667, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.32707899808883667, "reward_after_std": 0.935564685612917, "reward_before_mean": 0.8720442028716207, "reward_before_std": 0.963412918150425, "reward_change_max": 0.0, "reward_change_mean": -0.5449651964008808, "reward_change_min": -1.0156399756669998, "reward_change_std": 0.4170589130371809, "reward_std": 0.9355646967887878, "rewards/cosine_scaled_reward": 0.05060542202409124, "rewards/format_reward": 0.7708333469927311, "step": 499 }, { "advantage_max": 1.487747348845005, "advantage_mean": -4.967053879312289e-09, "advantage_min": -0.7583122663199902, "advantage_std": 0.8185562938451767, "completion_length": 3081.4375610351562, "epoch": 0.5714285714285714, "grad_norm": 0.8269075155258179, "kl": 0.37713623046875, "lambda_div_used": 0.6, "learning_rate": 1e-07, "loss": 0.0488, "reward": 0.1662901910021901, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.1662901910021901, "reward_after_std": 0.8185563161969185, "reward_before_mean": 0.635827727150172, "reward_before_std": 0.7835097871720791, "reward_change_max": 0.0, "reward_change_mean": -0.46953752264380455, "reward_change_min": -0.888320729136467, "reward_change_std": 0.33614025823771954, "reward_std": 0.81855633482337, "rewards/cosine_scaled_reward": -0.025836152024567127, "rewards/format_reward": 0.6875000204890966, "step": 500 }, { "epoch": 0.5714285714285714, "step": 500, "total_flos": 0.0, "train_loss": 0.0057369744749739765, "train_runtime": 18708.5051, "train_samples_per_second": 1.283, "train_steps_per_second": 0.027 } ], "logging_steps": 1, "max_steps": 500, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 6, "trial_name": null, "trial_params": null }