diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,13542 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.5714285714285714, + "eval_steps": 500, + "global_step": 500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "advantage_max": 1.7736882269382477, + "advantage_mean": 4.967053990334591e-09, + "advantage_min": -1.0247227177023888, + "advantage_std": 0.9998322650790215, + "completion_length": 2571.2083587646484, + "epoch": 0.001142857142857143, + "grad_norm": 0.19998183846473694, + "kl": 0.0, + "lambda_div_used": 0.6, + "learning_rate": 2e-08, + "loss": -0.0, + "reward": 0.06657012924551964, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.06657012924551964, + "reward_after_std": 0.805392861366272, + "reward_before_mean": 0.4897647276520729, + "reward_before_std": 0.8290339298546314, + "reward_change_max": 0.0005614385008811951, + "reward_change_mean": -0.4231945872306824, + "reward_change_min": -0.8292400389909744, + "reward_change_std": 0.33647667057812214, + "reward_std": 0.8053928762674332, + "rewards/cosine_scaled_reward": -0.015534311532974243, + "rewards/format_reward": 0.5208333488553762, + "step": 1 + }, + { + "advantage_max": 1.7142265439033508, + "advantage_mean": 2.7318796780306798e-08, + "advantage_min": -1.0121877193450928, + "advantage_std": 0.9997509345412254, + "completion_length": 2804.395881652832, + "epoch": 0.002285714285714286, + "grad_norm": 0.18245820701122284, + "kl": 0.0, + "lambda_div_used": 0.6, + "learning_rate": 4e-08, + "loss": -0.0, + "reward": -0.11615866981446743, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.11615866981446743, + "reward_after_std": 0.4655082933604717, + "reward_before_mean": 0.27539755403995514, + "reward_before_std": 0.42092561535537243, + "reward_change_max": 0.0013062208890914917, + "reward_change_mean": -0.39155622851103544, + "reward_change_min": -0.6376443430781364, + "reward_change_std": 0.26012564916163683, + "reward_std": 0.46550831012427807, + "rewards/cosine_scaled_reward": -0.04980122856795788, + "rewards/format_reward": 0.37500000558793545, + "step": 2 + }, + { + "advantage_max": 1.7450169622898102, + "advantage_mean": 1.3659398945264911e-08, + "advantage_min": -0.9959479197859764, + "advantage_std": 0.999689593911171, + "completion_length": 3374.3125, + "epoch": 0.0034285714285714284, + "grad_norm": 0.16747689247131348, + "kl": 4.373490810394287e-05, + "lambda_div_used": 0.6, + "learning_rate": 6e-08, + "loss": 0.0, + "reward": -0.5193910151720047, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.5193910151720047, + "reward_after_std": 0.48704322427511215, + "reward_before_mean": -0.35822661221027374, + "reward_before_std": 0.5342355072498322, + "reward_change_max": 0.0013915002346038818, + "reward_change_mean": -0.16116441413760185, + "reward_change_min": -0.43814222142100334, + "reward_change_std": 0.17781410180032253, + "reward_std": 0.4870432298630476, + "rewards/cosine_scaled_reward": -0.25202997773885727, + "rewards/format_reward": 0.14583333767950535, + "step": 3 + }, + { + "advantage_max": 1.9348038583993912, + "advantage_mean": 3.7252904094842165e-09, + "advantage_min": -0.7399652823805809, + "advantage_std": 0.9998378455638885, + "completion_length": 2311.27091217041, + "epoch": 0.004571428571428572, + "grad_norm": 0.2256404608488083, + "kl": 3.388524055480957e-05, + "lambda_div_used": 0.6, + "learning_rate": 8e-08, + "loss": 0.0, + "reward": -0.05920893343864009, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.05920893343864009, + "reward_after_std": 0.8477489091455936, + "reward_before_mean": 0.2724987119436264, + "reward_before_std": 0.7898660115897655, + "reward_change_max": 0.00016905367374420166, + "reward_change_mean": -0.3317076303064823, + "reward_change_min": -0.6719550713896751, + "reward_change_std": 0.2557057347148657, + "reward_std": 0.8477489277720451, + "rewards/cosine_scaled_reward": -0.1658339835703373, + "rewards/format_reward": 0.6041666734963655, + "step": 4 + }, + { + "advantage_max": 1.8208965361118317, + "advantage_mean": -8.692344399818808e-09, + "advantage_min": -0.8359091579914093, + "advantage_std": 0.9997872859239578, + "completion_length": 3321.4375610351562, + "epoch": 0.005714285714285714, + "grad_norm": 0.23304429650306702, + "kl": 4.240870475769043e-05, + "lambda_div_used": 0.6, + "learning_rate": 1e-07, + "loss": 0.0, + "reward": -0.32904624473303556, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.32904624473303556, + "reward_after_std": 0.5964408218860626, + "reward_before_mean": -0.0841533716302365, + "reward_before_std": 0.6196324154734612, + "reward_change_max": 0.001553364098072052, + "reward_change_mean": -0.24489288963377476, + "reward_change_min": -0.5599503479897976, + "reward_change_std": 0.22373187262564898, + "reward_std": 0.5964408367872238, + "rewards/cosine_scaled_reward": -0.17749335523694754, + "rewards/format_reward": 0.27083334140479565, + "step": 5 + }, + { + "advantage_max": 1.8378510475158691, + "advantage_mean": 2.980232394200755e-08, + "advantage_min": -0.7627209424972534, + "advantage_std": 0.9998485594987869, + "completion_length": 3113.937545776367, + "epoch": 0.006857142857142857, + "grad_norm": 0.21384815871715546, + "kl": 4.3161213397979736e-05, + "lambda_div_used": 0.6, + "learning_rate": 1.2e-07, + "loss": 0.0, + "reward": -0.1830892115831375, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.1830892115831375, + "reward_after_std": 0.9384645372629166, + "reward_before_mean": 0.061891574412584305, + "reward_before_std": 0.9313972406089306, + "reward_change_max": 0.0028055086731910706, + "reward_change_mean": -0.24498078599572182, + "reward_change_min": -0.5818257704377174, + "reward_change_std": 0.22530303802341223, + "reward_std": 0.9384645596146584, + "rewards/cosine_scaled_reward": -0.10447087977081537, + "rewards/format_reward": 0.2708333358168602, + "step": 6 + }, + { + "advantage_max": 1.848206102848053, + "advantage_mean": 1.6142924885720333e-08, + "advantage_min": -0.8735722899436951, + "advantage_std": 0.9998387470841408, + "completion_length": 2980.854217529297, + "epoch": 0.008, + "grad_norm": 0.1532345712184906, + "kl": 2.2076070308685303e-05, + "lambda_div_used": 0.6, + "learning_rate": 1.4e-07, + "loss": 0.0, + "reward": -0.11802996881306171, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.11802996881306171, + "reward_after_std": 0.7807512283325195, + "reward_before_mean": 0.2068807277828455, + "reward_before_std": 0.7968633286654949, + "reward_change_max": 5.9023499488830566e-05, + "reward_change_mean": -0.3249106667935848, + "reward_change_min": -0.6799479350447655, + "reward_change_std": 0.2789893364533782, + "reward_std": 0.7807512618601322, + "rewards/cosine_scaled_reward": -0.13614298962056637, + "rewards/format_reward": 0.4791666828095913, + "step": 7 + }, + { + "advantage_max": 1.768284872174263, + "advantage_mean": 6.208817571184966e-09, + "advantage_min": -1.073442094027996, + "advantage_std": 0.9998049214482307, + "completion_length": 2740.937530517578, + "epoch": 0.009142857142857144, + "grad_norm": 0.1699635237455368, + "kl": 1.948140561580658e-05, + "lambda_div_used": 0.6, + "learning_rate": 1.6e-07, + "loss": 0.0, + "reward": 0.18696115911006927, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.18696115911006927, + "reward_after_std": 0.7428050264716148, + "reward_before_mean": 0.6922222673892975, + "reward_before_std": 0.7346657477319241, + "reward_change_max": 0.0013648942112922668, + "reward_change_mean": -0.5052611185237765, + "reward_change_min": -0.8160489983856678, + "reward_change_std": 0.34187921043485403, + "reward_std": 0.7428050450980663, + "rewards/cosine_scaled_reward": 0.096111124381423, + "rewards/format_reward": 0.5000000111758709, + "step": 8 + }, + { + "advantage_max": 1.7886784225702286, + "advantage_mean": 1.6142924996742636e-08, + "advantage_min": -0.8845790177583694, + "advantage_std": 0.9997691288590431, + "completion_length": 3379.9583740234375, + "epoch": 0.010285714285714285, + "grad_norm": 0.17148180305957794, + "kl": 4.947185516357422e-05, + "lambda_div_used": 0.6, + "learning_rate": 1.8e-07, + "loss": 0.0, + "reward": -0.3192979171872139, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.3192979171872139, + "reward_after_std": 0.5886461585760117, + "reward_before_mean": -0.06322952476330101, + "reward_before_std": 0.6233885241672397, + "reward_change_max": 0.0002186223864555359, + "reward_change_mean": -0.2560683786869049, + "reward_change_min": -0.5929627306759357, + "reward_change_std": 0.2404519086703658, + "reward_std": 0.5886461641639471, + "rewards/cosine_scaled_reward": -0.14619810320436954, + "rewards/format_reward": 0.2291666716337204, + "step": 9 + }, + { + "advantage_max": 1.7605973780155182, + "advantage_mean": 2.2662183574162498e-08, + "advantage_min": -1.0062192007899284, + "advantage_std": 0.9998086839914322, + "completion_length": 2689.1041679382324, + "epoch": 0.011428571428571429, + "grad_norm": 0.2057720124721527, + "kl": 2.2359192371368408e-05, + "lambda_div_used": 0.6, + "learning_rate": 2e-07, + "loss": 0.0, + "reward": -0.14903598907403648, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.14903598907403648, + "reward_after_std": 0.6891336962580681, + "reward_before_mean": 0.17712077498435974, + "reward_before_std": 0.7159077972173691, + "reward_change_max": 0.0, + "reward_change_mean": -0.32615675404667854, + "reward_change_min": -0.6679622866213322, + "reward_change_std": 0.27471973933279514, + "reward_std": 0.689133707433939, + "rewards/cosine_scaled_reward": -0.11977295717224479, + "rewards/format_reward": 0.4166666716337204, + "step": 10 + }, + { + "advantage_max": 1.8891745954751968, + "advantage_mean": 7.69893364616081e-08, + "advantage_min": -0.9090164303779602, + "advantage_std": 0.999754011631012, + "completion_length": 3391.7916870117188, + "epoch": 0.012571428571428572, + "grad_norm": 0.17078208923339844, + "kl": 2.7604401111602783e-05, + "lambda_div_used": 0.6, + "learning_rate": 2.1999999999999998e-07, + "loss": 0.0, + "reward": -0.4651691932231188, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.4651691932231188, + "reward_after_std": 0.5533009208738804, + "reward_before_mean": -0.29569249910855433, + "reward_before_std": 0.5365621503442526, + "reward_change_max": 0.0, + "reward_change_mean": -0.1694766730070114, + "reward_change_min": -0.3405461013317108, + "reward_change_std": 0.1386844478547573, + "reward_std": 0.5533009320497513, + "rewards/cosine_scaled_reward": -0.2103462554514408, + "rewards/format_reward": 0.12500000186264515, + "step": 11 + }, + { + "advantage_max": 1.815945327281952, + "advantage_mean": 2.235174295650566e-08, + "advantage_min": -0.7791703343391418, + "advantage_std": 0.9998359605669975, + "completion_length": 2334.854202270508, + "epoch": 0.013714285714285714, + "grad_norm": 0.24145646393299103, + "kl": 3.232434391975403e-05, + "lambda_div_used": 0.6, + "learning_rate": 2.4e-07, + "loss": 0.0, + "reward": -0.039457873441278934, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.039457873441278934, + "reward_after_std": 0.8034725449979305, + "reward_before_mean": 0.32744147814810276, + "reward_before_std": 0.8328801915049553, + "reward_change_max": 0.0006123185157775879, + "reward_change_mean": -0.36689933901652694, + "reward_change_min": -0.8374556675553322, + "reward_change_std": 0.3241370841860771, + "reward_std": 0.8034725710749626, + "rewards/cosine_scaled_reward": -0.19044593471335247, + "rewards/format_reward": 0.7083333358168602, + "step": 12 + }, + { + "advantage_max": 1.7793785631656647, + "advantage_mean": 2.1265199309783434e-08, + "advantage_min": -1.123232178390026, + "advantage_std": 0.9998079463839531, + "completion_length": 2883.3959045410156, + "epoch": 0.014857142857142857, + "grad_norm": 0.30072519183158875, + "kl": 2.9988586902618408e-05, + "lambda_div_used": 0.6, + "learning_rate": 2.6e-07, + "loss": 0.0, + "reward": 0.07007572869770229, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.07007572869770229, + "reward_after_std": 0.6831374354660511, + "reward_before_mean": 0.5178708247840405, + "reward_before_std": 0.6717508621513844, + "reward_change_max": 0.00019785761833190918, + "reward_change_mean": -0.44779507629573345, + "reward_change_min": -0.7652952261269093, + "reward_change_std": 0.3106790967285633, + "reward_std": 0.6831374652683735, + "rewards/cosine_scaled_reward": 0.019352062605321407, + "rewards/format_reward": 0.47916667349636555, + "step": 13 + }, + { + "advantage_max": 1.8502720147371292, + "advantage_mean": 4.2840839431512734e-08, + "advantage_min": -0.8646775856614113, + "advantage_std": 0.9998083263635635, + "completion_length": 2823.9583892822266, + "epoch": 0.016, + "grad_norm": 0.2543714642524719, + "kl": 2.5618821382522583e-05, + "lambda_div_used": 0.6, + "learning_rate": 2.8e-07, + "loss": 0.0, + "reward": -0.055498819798231125, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.055498819798231125, + "reward_after_std": 0.8212512582540512, + "reward_before_mean": 0.2946751266717911, + "reward_before_std": 0.8422470949590206, + "reward_change_max": 0.001776367425918579, + "reward_change_mean": -0.3501739474013448, + "reward_change_min": -0.8158514685928822, + "reward_change_std": 0.3071507504209876, + "reward_std": 0.8212512955069542, + "rewards/cosine_scaled_reward": -0.04016243852674961, + "rewards/format_reward": 0.37500000931322575, + "step": 14 + }, + { + "advantage_max": 1.868079349398613, + "advantage_mean": 1.8316010486074674e-08, + "advantage_min": -0.7876945361495018, + "advantage_std": 0.999783918261528, + "completion_length": 2822.9791717529297, + "epoch": 0.017142857142857144, + "grad_norm": 0.17979443073272705, + "kl": 2.6114284992218018e-05, + "lambda_div_used": 0.6, + "learning_rate": 3e-07, + "loss": 0.0, + "reward": -0.01977388933300972, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.01977388933300972, + "reward_after_std": 0.612056341022253, + "reward_before_mean": 0.38737083226442337, + "reward_before_std": 0.5165360234677792, + "reward_change_max": 0.0003376305103302002, + "reward_change_mean": -0.4071447290480137, + "reward_change_min": -0.7115199901163578, + "reward_change_std": 0.2606779634952545, + "reward_std": 0.6120563521981239, + "rewards/cosine_scaled_reward": 0.006185416132211685, + "rewards/format_reward": 0.3750000037252903, + "step": 15 + }, + { + "advantage_max": 1.8392015546560287, + "advantage_mean": -4.967053768289986e-09, + "advantage_min": -0.8386228755116463, + "advantage_std": 0.9997298642992973, + "completion_length": 3462.1041870117188, + "epoch": 0.018285714285714287, + "grad_norm": 0.2112903892993927, + "kl": 4.178285598754883e-05, + "lambda_div_used": 0.6, + "learning_rate": 3.2e-07, + "loss": 0.0, + "reward": -0.5464260056614876, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.5464260056614876, + "reward_after_std": 0.5875924732536077, + "reward_before_mean": -0.43014006270095706, + "reward_before_std": 0.5928587820380926, + "reward_change_max": 0.0021633952856063843, + "reward_change_mean": -0.11628596065565944, + "reward_change_min": -0.28673963621258736, + "reward_change_std": 0.11779441172257066, + "reward_std": 0.5875924807041883, + "rewards/cosine_scaled_reward": -0.2567367013543844, + "rewards/format_reward": 0.0833333358168602, + "step": 16 + }, + { + "advantage_max": 1.7946833074092865, + "advantage_mean": 7.761021159069514e-09, + "advantage_min": -0.884327657520771, + "advantage_std": 0.9998291656374931, + "completion_length": 2296.708396911621, + "epoch": 0.019428571428571427, + "grad_norm": 0.28919804096221924, + "kl": 3.8780272006988525e-05, + "lambda_div_used": 0.6, + "learning_rate": 3.4000000000000003e-07, + "loss": 0.0, + "reward": 0.03589238924905658, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.03589238924905658, + "reward_after_std": 0.7403586134314537, + "reward_before_mean": 0.45668516401201487, + "reward_before_std": 0.7449082098901272, + "reward_change_max": 0.001710943877696991, + "reward_change_mean": -0.4207927817478776, + "reward_change_min": -0.8583872355520725, + "reward_change_std": 0.32544669695198536, + "reward_std": 0.7403586395084858, + "rewards/cosine_scaled_reward": -0.0737407635897398, + "rewards/format_reward": 0.6041666679084301, + "step": 17 + }, + { + "advantage_max": 1.8921531289815903, + "advantage_mean": 2.048909714114089e-08, + "advantage_min": -0.7638939619064331, + "advantage_std": 0.9998648390173912, + "completion_length": 3076.6250610351562, + "epoch": 0.02057142857142857, + "grad_norm": 0.15994614362716675, + "kl": 2.356991171836853e-05, + "lambda_div_used": 0.6, + "learning_rate": 3.6e-07, + "loss": 0.0, + "reward": -0.0722283124923706, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.0722283124923706, + "reward_after_std": 0.9180725961923599, + "reward_before_mean": 0.24214321840554476, + "reward_before_std": 0.9020998664200306, + "reward_change_max": 0.0006986036896705627, + "reward_change_mean": -0.3143715038895607, + "reward_change_min": -0.6540146470069885, + "reward_change_std": 0.26200826931744814, + "reward_std": 0.9180726371705532, + "rewards/cosine_scaled_reward": -0.08726172894239426, + "rewards/format_reward": 0.41666667349636555, + "step": 18 + }, + { + "advantage_max": 1.7503979355096817, + "advantage_mean": 1.4901161193847656e-08, + "advantage_min": -0.8893851488828659, + "advantage_std": 0.9998177886009216, + "completion_length": 2895.6458740234375, + "epoch": 0.021714285714285714, + "grad_norm": 0.17977163195610046, + "kl": 2.0965933799743652e-05, + "lambda_div_used": 0.6, + "learning_rate": 3.7999999999999996e-07, + "loss": 0.0, + "reward": 0.20064589567482471, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.20064589567482471, + "reward_after_std": 0.769149724394083, + "reward_before_mean": 0.7063506981357932, + "reward_before_std": 0.7390439650043845, + "reward_change_max": 0.0026439353823661804, + "reward_change_mean": -0.5057047791779041, + "reward_change_min": -0.9240194708108902, + "reward_change_std": 0.3898439407348633, + "reward_std": 0.7691497392952442, + "rewards/cosine_scaled_reward": 0.13442532037151977, + "rewards/format_reward": 0.4375000037252903, + "step": 19 + }, + { + "advantage_max": 1.7905155718326569, + "advantage_mean": 3.1044089521259366e-09, + "advantage_min": -1.0227502509951591, + "advantage_std": 0.9998354762792587, + "completion_length": 2309.041717529297, + "epoch": 0.022857142857142857, + "grad_norm": 0.23434168100357056, + "kl": 1.6529113054275513e-05, + "lambda_div_used": 0.6, + "learning_rate": 4e-07, + "loss": 0.0, + "reward": 0.10582668473944068, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.10582668473944068, + "reward_after_std": 0.8228973932564259, + "reward_before_mean": 0.5439623333513737, + "reward_before_std": 0.8307750299572945, + "reward_change_max": 0.0010927170515060425, + "reward_change_mean": -0.43813565373420715, + "reward_change_min": -0.8274941109120846, + "reward_change_std": 0.34579705353826284, + "reward_std": 0.8228974267840385, + "rewards/cosine_scaled_reward": -0.050935512874275446, + "rewards/format_reward": 0.6458333469927311, + "step": 20 + }, + { + "advantage_max": 1.8194524645805359, + "advantage_mean": 5.525847429632691e-08, + "advantage_min": -0.9087987467646599, + "advantage_std": 0.9997514858841896, + "completion_length": 2732.5833740234375, + "epoch": 0.024, + "grad_norm": 0.25368764996528625, + "kl": 3.4362077713012695e-05, + "lambda_div_used": 0.6, + "learning_rate": 4.1999999999999995e-07, + "loss": 0.0, + "reward": -0.10419294983148575, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.10419294983148575, + "reward_after_std": 0.7973214015364647, + "reward_before_mean": 0.2189039383083582, + "reward_before_std": 0.7819975260645151, + "reward_change_max": 0.0013872236013412476, + "reward_change_mean": -0.3230968825519085, + "reward_change_min": -0.6084227226674557, + "reward_change_std": 0.25168567057698965, + "reward_std": 0.7973214238882065, + "rewards/cosine_scaled_reward": -0.08846470632124692, + "rewards/format_reward": 0.39583334513008595, + "step": 21 + }, + { + "advantage_max": 1.8503218442201614, + "advantage_mean": -1.9247333726823967e-08, + "advantage_min": -0.8788245841860771, + "advantage_std": 0.9998400658369064, + "completion_length": 1828.4166870117188, + "epoch": 0.025142857142857144, + "grad_norm": 0.28431108593940735, + "kl": 3.190571442246437e-05, + "lambda_div_used": 0.6, + "learning_rate": 4.3999999999999997e-07, + "loss": 0.0, + "reward": 0.19262014399282634, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.19262014399282634, + "reward_after_std": 0.735573273152113, + "reward_before_mean": 0.6972342263907194, + "reward_before_std": 0.671034948900342, + "reward_change_max": 0.0, + "reward_change_mean": -0.5046140514314175, + "reward_change_min": -0.9077365770936012, + "reward_change_std": 0.35123981535434723, + "reward_std": 0.7355732768774033, + "rewards/cosine_scaled_reward": -0.04721624404191971, + "rewards/format_reward": 0.7916666753590107, + "step": 22 + }, + { + "advantage_max": 1.824779137969017, + "advantage_mean": 1.2417634476236117e-08, + "advantage_min": -0.8828606568276882, + "advantage_std": 0.9998202919960022, + "completion_length": 2416.9792098999023, + "epoch": 0.026285714285714287, + "grad_norm": 0.21872000396251678, + "kl": 3.149360418319702e-05, + "lambda_div_used": 0.6, + "learning_rate": 4.6e-07, + "loss": 0.0, + "reward": -0.10515248589217663, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.10515248589217663, + "reward_after_std": 0.7565237432718277, + "reward_before_mean": 0.22834152821451426, + "reward_before_std": 0.7646547462791204, + "reward_change_max": 0.0011374279856681824, + "reward_change_mean": -0.3334940178319812, + "reward_change_min": -0.7022706530988216, + "reward_change_std": 0.28038450982421637, + "reward_std": 0.7565237805247307, + "rewards/cosine_scaled_reward": -0.13582924474030733, + "rewards/format_reward": 0.5000000074505806, + "step": 23 + }, + { + "advantage_max": 1.7469995766878128, + "advantage_mean": 2.9491878938969762e-09, + "advantage_min": -1.066501371562481, + "advantage_std": 0.9998574405908585, + "completion_length": 2877.500045776367, + "epoch": 0.027428571428571427, + "grad_norm": 0.24179069697856903, + "kl": 3.087148070335388e-05, + "lambda_div_used": 0.6, + "learning_rate": 4.8e-07, + "loss": 0.0, + "reward": 0.08253479516133666, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.08253479516133666, + "reward_after_std": 0.9543181359767914, + "reward_before_mean": 0.48664891347289085, + "reward_before_std": 1.0185184814035892, + "reward_change_max": 0.0005891844630241394, + "reward_change_mean": -0.4041141299530864, + "reward_change_min": -0.8701234757900238, + "reward_change_std": 0.36574244499206543, + "reward_std": 0.954318180680275, + "rewards/cosine_scaled_reward": 0.0037411183584481478, + "rewards/format_reward": 0.4791666828095913, + "step": 24 + }, + { + "advantage_max": 1.7304434180259705, + "advantage_mean": 6.332993629509787e-08, + "advantage_min": -1.050755836069584, + "advantage_std": 0.9998001903295517, + "completion_length": 2825.6458740234375, + "epoch": 0.02857142857142857, + "grad_norm": 0.2251826375722885, + "kl": 3.547966480255127e-05, + "lambda_div_used": 0.6, + "learning_rate": 5e-07, + "loss": 0.0, + "reward": -0.13675972539931536, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.13675972539931536, + "reward_after_std": 0.7334093153476715, + "reward_before_mean": 0.18887527519837022, + "reward_before_std": 0.7841606214642525, + "reward_change_max": 0.0004205778241157532, + "reward_change_mean": -0.3256350150331855, + "reward_change_min": -0.6350610516965389, + "reward_change_std": 0.2870927806943655, + "reward_std": 0.7334093227982521, + "rewards/cosine_scaled_reward": -0.08264568448066711, + "rewards/format_reward": 0.3541666716337204, + "step": 25 + }, + { + "advantage_max": 1.8000003397464752, + "advantage_mean": 5.5879355587151736e-08, + "advantage_min": -0.9525649920105934, + "advantage_std": 0.9997768253087997, + "completion_length": 2798.562515258789, + "epoch": 0.029714285714285714, + "grad_norm": 0.16408225893974304, + "kl": 3.130175173282623e-05, + "lambda_div_used": 0.6, + "learning_rate": 5.2e-07, + "loss": 0.0, + "reward": 0.019447185564786196, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.019447185564786196, + "reward_after_std": 0.5030231699347496, + "reward_before_mean": 0.4766043610870838, + "reward_before_std": 0.41719701141119003, + "reward_change_max": 0.00043053925037384033, + "reward_change_mean": -0.45715717040002346, + "reward_change_min": -0.6959591060876846, + "reward_change_std": 0.2762450519949198, + "reward_std": 0.5030231848359108, + "rewards/cosine_scaled_reward": -0.03253114968538284, + "rewards/format_reward": 0.5416666679084301, + "step": 26 + }, + { + "advantage_max": 1.779951274394989, + "advantage_mean": -6.208815128694312e-10, + "advantage_min": -0.8513502217829227, + "advantage_std": 0.999822735786438, + "completion_length": 3026.875045776367, + "epoch": 0.030857142857142857, + "grad_norm": 0.1849300116300583, + "kl": 3.397837281227112e-05, + "lambda_div_used": 0.6, + "learning_rate": 5.4e-07, + "loss": 0.0, + "reward": -0.10913681925740093, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.10913681925740093, + "reward_after_std": 0.7921761274337769, + "reward_before_mean": 0.22155018523335457, + "reward_before_std": 0.8362809754908085, + "reward_change_max": 0.00031948089599609375, + "reward_change_mean": -0.3306870339438319, + "reward_change_min": -0.7002861388027668, + "reward_change_std": 0.2832873035222292, + "reward_std": 0.7921761721372604, + "rewards/cosine_scaled_reward": -0.08714157156646252, + "rewards/format_reward": 0.39583333767950535, + "step": 27 + }, + { + "advantage_max": 1.7814081907272339, + "advantage_mean": 1.2417630257388623e-09, + "advantage_min": -0.9797869324684143, + "advantage_std": 0.9997996240854263, + "completion_length": 2828.3542098999023, + "epoch": 0.032, + "grad_norm": 0.19132548570632935, + "kl": 2.2899359464645386e-05, + "lambda_div_used": 0.6, + "learning_rate": 5.6e-07, + "loss": 0.0, + "reward": -0.09824594110250473, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.09824594110250473, + "reward_after_std": 0.680554173886776, + "reward_before_mean": 0.2582579329609871, + "reward_before_std": 0.6859657093882561, + "reward_change_max": 0.0011489912867546082, + "reward_change_mean": -0.3565038787201047, + "reward_change_min": -0.7232412360608578, + "reward_change_std": 0.2864949721843004, + "reward_std": 0.6805541850626469, + "rewards/cosine_scaled_reward": -0.0687877181917429, + "rewards/format_reward": 0.39583334513008595, + "step": 28 + }, + { + "advantage_max": 1.8605255335569382, + "advantage_mean": 7.202228036184977e-08, + "advantage_min": -0.8301983177661896, + "advantage_std": 0.9997770339250565, + "completion_length": 3273.7083587646484, + "epoch": 0.03314285714285714, + "grad_norm": 0.18308158218860626, + "kl": 1.9058585166931152e-05, + "lambda_div_used": 0.6, + "learning_rate": 5.8e-07, + "loss": 0.0, + "reward": -0.47089114785194397, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.47089114785194397, + "reward_after_std": 0.6084134839475155, + "reward_before_mean": -0.31675157556310296, + "reward_before_std": 0.6089425943791866, + "reward_change_max": 0.0014341697096824646, + "reward_change_mean": -0.1541395653039217, + "reward_change_min": -0.35405242815613747, + "reward_change_std": 0.14612348517403007, + "reward_std": 0.608413502573967, + "rewards/cosine_scaled_reward": -0.26254245825111866, + "rewards/format_reward": 0.20833333767950535, + "step": 29 + }, + { + "advantage_max": 1.8005549013614655, + "advantage_mean": -9.934107758624577e-09, + "advantage_min": -0.9672626964747906, + "advantage_std": 0.9998631924390793, + "completion_length": 2901.6458892822266, + "epoch": 0.03428571428571429, + "grad_norm": 0.16869448125362396, + "kl": 2.5976449251174927e-05, + "lambda_div_used": 0.6, + "learning_rate": 6e-07, + "loss": 0.0, + "reward": 0.08703623432666063, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.08703623432666063, + "reward_after_std": 0.962010782212019, + "reward_before_mean": 0.49391913414001465, + "reward_before_std": 1.0108840502798557, + "reward_change_max": 0.00033224374055862427, + "reward_change_mean": -0.40688287653028965, + "reward_change_min": -0.8312213607132435, + "reward_change_std": 0.3514333504717797, + "reward_std": 0.962010845541954, + "rewards/cosine_scaled_reward": -0.013457119464874268, + "rewards/format_reward": 0.5208333469927311, + "step": 30 + }, + { + "advantage_max": 1.791318416595459, + "advantage_mean": 1.2417632477834672e-09, + "advantage_min": -0.8292011320590973, + "advantage_std": 0.9998136609792709, + "completion_length": 2891.5833740234375, + "epoch": 0.03542857142857143, + "grad_norm": 0.20195633172988892, + "kl": 3.875233232975006e-05, + "lambda_div_used": 0.6, + "learning_rate": 6.2e-07, + "loss": 0.0, + "reward": -0.1924915760755539, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.1924915760755539, + "reward_after_std": 0.7389788702130318, + "reward_before_mean": 0.10049068834632635, + "reward_before_std": 0.7730084210634232, + "reward_change_max": 0.0006143301725387573, + "reward_change_mean": -0.29298229329288006, + "reward_change_min": -0.633475948125124, + "reward_change_std": 0.26671546418219805, + "reward_std": 0.7389788739383221, + "rewards/cosine_scaled_reward": -0.1268379855901003, + "rewards/format_reward": 0.3541666679084301, + "step": 31 + }, + { + "advantage_max": 1.7208815962076187, + "advantage_mean": 2.4835269396561444e-08, + "advantage_min": -1.1677290424704552, + "advantage_std": 0.9997886344790459, + "completion_length": 3198.979202270508, + "epoch": 0.036571428571428574, + "grad_norm": 0.18060676753520966, + "kl": 2.734363079071045e-05, + "lambda_div_used": 0.6, + "learning_rate": 6.4e-07, + "loss": 0.0, + "reward": -0.02893495187163353, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.02893495187163353, + "reward_after_std": 0.6704957820475101, + "reward_before_mean": 0.37428003549575806, + "reward_before_std": 0.7028882801532745, + "reward_change_max": 0.0005309507250785828, + "reward_change_mean": -0.40321500319987535, + "reward_change_min": -0.6910530626773834, + "reward_change_std": 0.29959324561059475, + "reward_std": 0.6704958230257034, + "rewards/cosine_scaled_reward": 0.020473352633416653, + "rewards/format_reward": 0.33333334885537624, + "step": 32 + }, + { + "advantage_max": 1.7768725603818893, + "advantage_mean": 4.967053879312289e-09, + "advantage_min": -0.8517647087574005, + "advantage_std": 0.9997695460915565, + "completion_length": 3366.2708740234375, + "epoch": 0.037714285714285714, + "grad_norm": 0.1567767709493637, + "kl": 3.699958324432373e-05, + "lambda_div_used": 0.6, + "learning_rate": 6.6e-07, + "loss": 0.0, + "reward": -0.2785487826913595, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.2785487826913595, + "reward_after_std": 0.658524302765727, + "reward_before_mean": -0.01756126992404461, + "reward_before_std": 0.6902536172419786, + "reward_change_max": 0.001083478331565857, + "reward_change_mean": -0.2609875090420246, + "reward_change_min": -0.6648754775524139, + "reward_change_std": 0.2532107653096318, + "reward_std": 0.6585243381559849, + "rewards/cosine_scaled_reward": -0.13378064148128033, + "rewards/format_reward": 0.2500000037252903, + "step": 33 + }, + { + "advantage_max": 1.742632418870926, + "advantage_mean": -3.725290742551124e-09, + "advantage_min": -1.0178842395544052, + "advantage_std": 0.9998548254370689, + "completion_length": 2442.2708740234375, + "epoch": 0.038857142857142854, + "grad_norm": 0.26763319969177246, + "kl": 3.138929605484009e-05, + "lambda_div_used": 0.6, + "learning_rate": 6.800000000000001e-07, + "loss": 0.0, + "reward": 0.2606486789882183, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.2606486789882183, + "reward_after_std": 0.9495027400553226, + "reward_before_mean": 0.7638577073812485, + "reward_before_std": 0.9792901389300823, + "reward_change_max": 0.0011056289076805115, + "reward_change_mean": -0.503209053305909, + "reward_change_min": -0.9927790202200413, + "reward_change_std": 0.382378701120615, + "reward_std": 0.9495027586817741, + "rewards/cosine_scaled_reward": 0.09026219043880701, + "rewards/format_reward": 0.5833333358168602, + "step": 34 + }, + { + "advantage_max": 1.7523992359638214, + "advantage_mean": 1.3659398279131096e-08, + "advantage_min": -1.0934018939733505, + "advantage_std": 0.9998025223612785, + "completion_length": 2960.7083740234375, + "epoch": 0.04, + "grad_norm": 0.2008289247751236, + "kl": 3.193691372871399e-05, + "lambda_div_used": 0.6, + "learning_rate": 7e-07, + "loss": 0.0, + "reward": -0.1078006811439991, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.1078006811439991, + "reward_after_std": 0.7199197486042976, + "reward_before_mean": 0.23942278884351254, + "reward_before_std": 0.7542877979576588, + "reward_change_max": 0.0, + "reward_change_mean": -0.34722347371280193, + "reward_change_min": -0.6703316308557987, + "reward_change_std": 0.29183086566627026, + "reward_std": 0.7199197895824909, + "rewards/cosine_scaled_reward": -0.0782052765134722, + "rewards/format_reward": 0.3958333395421505, + "step": 35 + }, + { + "advantage_max": 1.8357672542333603, + "advantage_mean": 5.58793539218172e-09, + "advantage_min": -0.892547219991684, + "advantage_std": 0.9997570440173149, + "completion_length": 3353.3958740234375, + "epoch": 0.04114285714285714, + "grad_norm": 0.1734616905450821, + "kl": 4.410743713378906e-05, + "lambda_div_used": 0.6, + "learning_rate": 7.2e-07, + "loss": 0.0, + "reward": -0.41738917178008705, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.41738917178008705, + "reward_after_std": 0.5450928043574095, + "reward_before_mean": -0.21742304414510727, + "reward_before_std": 0.5365421585738659, + "reward_change_max": 0.0034711062908172607, + "reward_change_mean": -0.1999661261215806, + "reward_change_min": -0.4123280607163906, + "reward_change_std": 0.16749785374850035, + "reward_std": 0.5450928211212158, + "rewards/cosine_scaled_reward": -0.20246152579784393, + "rewards/format_reward": 0.1875000074505806, + "step": 36 + }, + { + "advantage_max": 1.7173901945352554, + "advantage_mean": 2.0489096086429015e-08, + "advantage_min": -1.0507243163883686, + "advantage_std": 0.9997305124998093, + "completion_length": 3307.6666870117188, + "epoch": 0.04228571428571429, + "grad_norm": 0.16524305939674377, + "kl": 2.351030707359314e-05, + "lambda_div_used": 0.6, + "learning_rate": 7.4e-07, + "loss": 0.0, + "reward": -0.45867132768034935, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.45867132768034935, + "reward_after_std": 0.3930114023387432, + "reward_before_mean": -0.24169572815299034, + "reward_before_std": 0.4065242074429989, + "reward_change_max": 0.00047623366117477417, + "reward_change_mean": -0.21697559859603643, + "reward_change_min": -0.3991989456117153, + "reward_change_std": 0.16736689116805792, + "reward_std": 0.3930114172399044, + "rewards/cosine_scaled_reward": -0.22501453291624784, + "rewards/format_reward": 0.2083333358168602, + "step": 37 + }, + { + "advantage_max": 1.7465081810951233, + "advantage_mean": 2.6077033643545633e-08, + "advantage_min": -0.9350218996405602, + "advantage_std": 0.9997123703360558, + "completion_length": 3334.8333435058594, + "epoch": 0.04342857142857143, + "grad_norm": 0.19420531392097473, + "kl": 3.1050294637680054e-05, + "lambda_div_used": 0.6, + "learning_rate": 7.599999999999999e-07, + "loss": 0.0, + "reward": -0.4392921030521393, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.4392921030521393, + "reward_after_std": 0.4309826772660017, + "reward_before_mean": -0.22022299468517303, + "reward_before_std": 0.4509555771946907, + "reward_change_max": 0.001096479594707489, + "reward_change_mean": -0.21906912024132907, + "reward_change_min": -0.43532417714595795, + "reward_change_std": 0.180083560757339, + "reward_std": 0.43098269030451775, + "rewards/cosine_scaled_reward": -0.16219483315944672, + "rewards/format_reward": 0.1041666716337204, + "step": 38 + }, + { + "advantage_max": 1.809293732047081, + "advantage_mean": 2.2351744011217534e-08, + "advantage_min": -0.9606939628720284, + "advantage_std": 0.9998093545436859, + "completion_length": 2840.895866394043, + "epoch": 0.044571428571428574, + "grad_norm": 0.2235952764749527, + "kl": 2.9481947422027588e-05, + "lambda_div_used": 0.6, + "learning_rate": 7.799999999999999e-07, + "loss": 0.0, + "reward": 0.018166373018175364, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.018166373018175364, + "reward_after_std": 0.7264602929353714, + "reward_before_mean": 0.42779126949608326, + "reward_before_std": 0.6920085242018104, + "reward_change_max": 0.002504482865333557, + "reward_change_mean": -0.409624888561666, + "reward_change_min": -0.7010375969111919, + "reward_change_std": 0.29587725829333067, + "reward_std": 0.726460300385952, + "rewards/cosine_scaled_reward": -0.01527103316038847, + "rewards/format_reward": 0.45833334513008595, + "step": 39 + }, + { + "advantage_max": 1.8042074739933014, + "advantage_mean": -7.450580263856921e-09, + "advantage_min": -1.0113056749105453, + "advantage_std": 0.9997925981879234, + "completion_length": 2649.5625228881836, + "epoch": 0.045714285714285714, + "grad_norm": 0.21060492098331451, + "kl": 4.390254616737366e-05, + "lambda_div_used": 0.6, + "learning_rate": 8e-07, + "loss": 0.0, + "reward": -0.13332401355728507, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.13332401355728507, + "reward_after_std": 0.5301584340631962, + "reward_before_mean": 0.23096877778880298, + "reward_before_std": 0.4900096170604229, + "reward_change_max": 0.001264527440071106, + "reward_change_mean": -0.3642928283661604, + "reward_change_min": -0.6233471073210239, + "reward_change_std": 0.25746062211692333, + "reward_std": 0.5301584452390671, + "rewards/cosine_scaled_reward": -0.12409894913434982, + "rewards/format_reward": 0.4791666716337204, + "step": 40 + }, + { + "advantage_max": 1.8728558719158173, + "advantage_mean": 3.104408707876871e-08, + "advantage_min": -0.8173846006393433, + "advantage_std": 0.9997896775603294, + "completion_length": 3017.5416870117188, + "epoch": 0.046857142857142854, + "grad_norm": 0.16901332139968872, + "kl": 3.6947429180145264e-05, + "lambda_div_used": 0.6, + "learning_rate": 8.199999999999999e-07, + "loss": 0.0, + "reward": -0.24943608665489592, + "reward_advantage_correlation": 0.9999999999999997, + "reward_after_mean": -0.24943608665489592, + "reward_after_std": 0.6227637939155102, + "reward_before_mean": 0.025789468549191952, + "reward_before_std": 0.586168160662055, + "reward_change_max": 0.0007337778806686401, + "reward_change_mean": -0.2752255443483591, + "reward_change_min": -0.5554584003984928, + "reward_change_std": 0.19899009726941586, + "reward_std": 0.6227638311684132, + "rewards/cosine_scaled_reward": -0.1850219412590377, + "rewards/format_reward": 0.3958333358168602, + "step": 41 + }, + { + "advantage_max": 1.8519198596477509, + "advantage_mean": -6.208818792430293e-09, + "advantage_min": -0.8044792786240578, + "advantage_std": 0.9996784925460815, + "completion_length": 2845.4583435058594, + "epoch": 0.048, + "grad_norm": 0.3022547662258148, + "kl": 4.138052463531494e-05, + "lambda_div_used": 0.6, + "learning_rate": 8.399999999999999e-07, + "loss": 0.0, + "reward": -0.5167930386960506, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.5167930386960506, + "reward_after_std": 0.36160215362906456, + "reward_before_mean": -0.33381566777825356, + "reward_before_std": 0.3440868891775608, + "reward_change_max": 0.0013512372970581055, + "reward_change_mean": -0.18297737976536155, + "reward_change_min": -0.3526080325245857, + "reward_change_std": 0.14170236326754093, + "reward_std": 0.36160216107964516, + "rewards/cosine_scaled_reward": -0.3335745017975569, + "rewards/format_reward": 0.3333333358168602, + "step": 42 + }, + { + "advantage_max": 1.7674841284751892, + "advantage_mean": 4.221995697495373e-08, + "advantage_min": -0.9594441875815392, + "advantage_std": 0.9998220577836037, + "completion_length": 2943.750045776367, + "epoch": 0.04914285714285714, + "grad_norm": 0.19866588711738586, + "kl": 4.484504461288452e-05, + "lambda_div_used": 0.6, + "learning_rate": 8.599999999999999e-07, + "loss": 0.0, + "reward": -0.17242285422980785, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.17242285422980785, + "reward_after_std": 0.7026234902441502, + "reward_before_mean": 0.13483278080821037, + "reward_before_std": 0.7139367610216141, + "reward_change_max": 0.0017592236399650574, + "reward_change_mean": -0.3072556145489216, + "reward_change_min": -0.6003884114325047, + "reward_change_std": 0.24329772219061852, + "reward_std": 0.7026234939694405, + "rewards/cosine_scaled_reward": -0.07841694308444858, + "rewards/format_reward": 0.2916666679084301, + "step": 43 + }, + { + "advantage_max": 1.7240224331617355, + "advantage_mean": 3.7873785108111235e-08, + "advantage_min": -1.0484469085931778, + "advantage_std": 0.9998211860656738, + "completion_length": 2802.4583740234375, + "epoch": 0.05028571428571429, + "grad_norm": 0.236885666847229, + "kl": 0.00011239200830459595, + "lambda_div_used": 0.6, + "learning_rate": 8.799999999999999e-07, + "loss": 0.0, + "reward": 0.020315666333772242, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.020315666333772242, + "reward_after_std": 0.7511612996459007, + "reward_before_mean": 0.43610192835330963, + "reward_before_std": 0.8198062926530838, + "reward_change_max": 0.0011220425367355347, + "reward_change_mean": -0.41578628378920257, + "reward_change_min": -0.8347551226615906, + "reward_change_std": 0.3477799710817635, + "reward_std": 0.7511613145470619, + "rewards/cosine_scaled_reward": -0.02153235487639904, + "rewards/format_reward": 0.4791666828095913, + "step": 44 + }, + { + "advantage_max": 1.8154965788125992, + "advantage_mean": 1.3348956495740083e-08, + "advantage_min": -0.9399640262126923, + "advantage_std": 0.9997976571321487, + "completion_length": 3300.5000610351562, + "epoch": 0.05142857142857143, + "grad_norm": 0.14654569327831268, + "kl": 4.844926297664642e-05, + "lambda_div_used": 0.6, + "learning_rate": 9e-07, + "loss": 0.0, + "reward": -0.027277782559394836, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.027277782559394836, + "reward_after_std": 0.7254571802914143, + "reward_before_mean": 0.3635289091616869, + "reward_before_std": 0.7658616602420807, + "reward_change_max": 0.0018416717648506165, + "reward_change_mean": -0.39080664864741266, + "reward_change_min": -0.7719079181551933, + "reward_change_std": 0.31446282705292106, + "reward_std": 0.7254571970552206, + "rewards/cosine_scaled_reward": -0.005735563114285469, + "rewards/format_reward": 0.37500000558793545, + "step": 45 + }, + { + "advantage_max": 1.7028233408927917, + "advantage_mean": 7.512669064624333e-08, + "advantage_min": -1.0628773123025894, + "advantage_std": 0.9997040554881096, + "completion_length": 3280.1458435058594, + "epoch": 0.052571428571428575, + "grad_norm": 0.1873319298028946, + "kl": 8.474476635456085e-05, + "lambda_div_used": 0.6, + "learning_rate": 9.2e-07, + "loss": 0.0, + "reward": -0.529501348733902, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.529501348733902, + "reward_after_std": 0.3474753201007843, + "reward_before_mean": -0.3440132327377796, + "reward_before_std": 0.36735135316848755, + "reward_change_max": 0.0011699274182319641, + "reward_change_mean": -0.18548810062929988, + "reward_change_min": -0.37992362678050995, + "reward_change_std": 0.15657844487577677, + "reward_std": 0.3474753201007843, + "rewards/cosine_scaled_reward": -0.24492328986525536, + "rewards/format_reward": 0.14583333395421505, + "step": 46 + }, + { + "advantage_max": 1.727543666958809, + "advantage_mean": -1.4901161637936866e-08, + "advantage_min": -0.9817303344607353, + "advantage_std": 0.9998201057314873, + "completion_length": 2743.1042098999023, + "epoch": 0.053714285714285714, + "grad_norm": 0.20242835581302643, + "kl": 4.671327769756317e-05, + "lambda_div_used": 0.6, + "learning_rate": 9.399999999999999e-07, + "loss": 0.0, + "reward": 0.17383363377302885, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.17383363377302885, + "reward_after_std": 0.8287919256836176, + "reward_before_mean": 0.6626354195177555, + "reward_before_std": 0.8962942529469728, + "reward_change_max": 0.0017872899770736694, + "reward_change_mean": -0.48880180856212974, + "reward_change_min": -0.9589711390435696, + "reward_change_std": 0.40080668311566114, + "reward_std": 0.8287919294089079, + "rewards/cosine_scaled_reward": 0.060484373942017555, + "rewards/format_reward": 0.5416666716337204, + "step": 47 + }, + { + "advantage_max": 1.8530944287776947, + "advantage_mean": 1.2417634698280722e-08, + "advantage_min": -0.8744976595044136, + "advantage_std": 0.9998347014188766, + "completion_length": 2797.2291870117188, + "epoch": 0.054857142857142854, + "grad_norm": 0.220950186252594, + "kl": 0.0001786835491657257, + "lambda_div_used": 0.6, + "learning_rate": 9.6e-07, + "loss": 0.0, + "reward": -0.050193486735224724, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.050193486735224724, + "reward_after_std": 0.9647956900298595, + "reward_before_mean": 0.25988880917429924, + "reward_before_std": 0.9481890201568604, + "reward_change_max": 0.0005988627672195435, + "reward_change_mean": -0.3100823136046529, + "reward_change_min": -0.6051475256681442, + "reward_change_std": 0.23567491210997105, + "reward_std": 0.9647957049310207, + "rewards/cosine_scaled_reward": -0.05755559680983424, + "rewards/format_reward": 0.3750000037252903, + "step": 48 + }, + { + "advantage_max": 1.8145934343338013, + "advantage_mean": 2.9026220982331097e-08, + "advantage_min": -0.9026520848274231, + "advantage_std": 0.9998468682169914, + "completion_length": 2319.1875381469727, + "epoch": 0.056, + "grad_norm": 0.20816369354724884, + "kl": 0.00010545924305915833, + "lambda_div_used": 0.6, + "learning_rate": 9.8e-07, + "loss": 0.0, + "reward": 0.14474806562066078, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.14474806562066078, + "reward_after_std": 0.8051043152809143, + "reward_before_mean": 0.6113051008433104, + "reward_before_std": 0.7825077921152115, + "reward_change_max": 0.0, + "reward_change_mean": -0.46655698027461767, + "reward_change_min": -0.8942185193300247, + "reward_change_std": 0.33377677015960217, + "reward_std": 0.8051043301820755, + "rewards/cosine_scaled_reward": -0.006847476586699486, + "rewards/format_reward": 0.6250000093132257, + "step": 49 + }, + { + "advantage_max": 1.889391914010048, + "advantage_mean": -1.676380800841315e-08, + "advantage_min": -0.8229124620556831, + "advantage_std": 0.9997017830610275, + "completion_length": 3018.937511444092, + "epoch": 0.05714285714285714, + "grad_norm": 0.15827570855617523, + "kl": 0.00010543933603912592, + "lambda_div_used": 0.6, + "learning_rate": 1e-06, + "loss": 0.0, + "reward": -0.03166187182068825, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.03166187182068825, + "reward_after_std": 0.5302102379500866, + "reward_before_mean": 0.39034452475607395, + "reward_before_std": 0.4515397949144244, + "reward_change_max": 0.0, + "reward_change_mean": -0.4220064301043749, + "reward_change_min": -0.6827008239924908, + "reward_change_std": 0.27064335346221924, + "reward_std": 0.5302102454006672, + "rewards/cosine_scaled_reward": 0.02850560611113906, + "rewards/format_reward": 0.33333333395421505, + "step": 50 + }, + { + "advantage_max": 1.7789610773324966, + "advantage_mean": 3.166496753692627e-08, + "advantage_min": -0.9067145064473152, + "advantage_std": 0.9997837692499161, + "completion_length": 2335.083351135254, + "epoch": 0.05828571428571429, + "grad_norm": 0.24217447638511658, + "kl": 0.00023105740547180176, + "lambda_div_used": 0.6, + "learning_rate": 9.999890338174275e-07, + "loss": 0.0, + "reward": -0.08883536350913346, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.08883536350913346, + "reward_after_std": 0.6687220390886068, + "reward_before_mean": 0.2738766521215439, + "reward_before_std": 0.6639172211289406, + "reward_change_max": 0.0006502941250801086, + "reward_change_mean": -0.36271203216165304, + "reward_change_min": -0.7275662198662758, + "reward_change_std": 0.2778544398024678, + "reward_std": 0.6687220819294453, + "rewards/cosine_scaled_reward": -0.09222834184765816, + "rewards/format_reward": 0.4583333358168602, + "step": 51 + }, + { + "advantage_max": 1.73988875746727, + "advantage_mean": 1.707424765462484e-08, + "advantage_min": -1.0492620393633842, + "advantage_std": 0.9997946247458458, + "completion_length": 2937.270835876465, + "epoch": 0.05942857142857143, + "grad_norm": 0.19648152589797974, + "kl": 0.00017854571342468262, + "lambda_div_used": 0.6, + "learning_rate": 9.999561358041868e-07, + "loss": 0.0, + "reward": 0.03505727555602789, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.03505727555602789, + "reward_after_std": 0.8051403667777777, + "reward_before_mean": 0.4398728273808956, + "reward_before_std": 0.8179160077124834, + "reward_change_max": 0.0011621415615081787, + "reward_change_mean": -0.4048155304044485, + "reward_change_min": -0.7212616428732872, + "reward_change_std": 0.30738101061433554, + "reward_std": 0.8051404003053904, + "rewards/cosine_scaled_reward": 0.011603066697716713, + "rewards/format_reward": 0.41666667722165585, + "step": 52 + }, + { + "advantage_max": 1.6888093948364258, + "advantage_mean": 8.692344288796505e-09, + "advantage_min": -1.1568487137556076, + "advantage_std": 0.9998311847448349, + "completion_length": 2733.479263305664, + "epoch": 0.060571428571428575, + "grad_norm": 0.19831141829490662, + "kl": 0.00014853477478027344, + "lambda_div_used": 0.6, + "learning_rate": 9.999013075636804e-07, + "loss": 0.0, + "reward": 0.11825260240584612, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.11825260240584612, + "reward_after_std": 0.7896803952753544, + "reward_before_mean": 0.5833198018372059, + "reward_before_std": 0.8498901575803757, + "reward_change_max": 0.0, + "reward_change_mean": -0.46506719663739204, + "reward_change_min": -0.9060896001756191, + "reward_change_std": 0.37568887136876583, + "reward_std": 0.7896804176270962, + "rewards/cosine_scaled_reward": -0.010423431638628244, + "rewards/format_reward": 0.604166679084301, + "step": 53 + }, + { + "advantage_max": 1.8639821112155914, + "advantage_mean": 1.2417634698280722e-08, + "advantage_min": -0.8045392781496048, + "advantage_std": 0.9998516067862511, + "completion_length": 2762.8959350585938, + "epoch": 0.061714285714285715, + "grad_norm": 0.16911473870277405, + "kl": 8.419156074523926e-05, + "lambda_div_used": 0.6, + "learning_rate": 9.998245517681593e-07, + "loss": 0.0, + "reward": 0.3884635865688324, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.3884635865688324, + "reward_after_std": 0.857769999653101, + "reward_before_mean": 0.974736912176013, + "reward_before_std": 0.769689017906785, + "reward_change_max": 0.0013243556022644043, + "reward_change_mean": -0.5862732660025358, + "reward_change_min": -1.0673403665423393, + "reward_change_std": 0.4096502447500825, + "reward_std": 0.8577700182795525, + "rewards/cosine_scaled_reward": 0.19570175930857658, + "rewards/format_reward": 0.5833333414047956, + "step": 54 + }, + { + "advantage_max": 1.8454468548297882, + "advantage_mean": 1.024454790443663e-08, + "advantage_min": -0.8249358907341957, + "advantage_std": 0.9998388141393661, + "completion_length": 2930.6667251586914, + "epoch": 0.06285714285714286, + "grad_norm": 0.17054994404315948, + "kl": 0.00012383796274662018, + "lambda_div_used": 0.6, + "learning_rate": 9.997258721585931e-07, + "loss": 0.0, + "reward": 0.030168408062309027, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.030168408062309027, + "reward_after_std": 0.8526755161583424, + "reward_before_mean": 0.41612496972084045, + "reward_before_std": 0.8229849487543106, + "reward_change_max": 0.0006914958357810974, + "reward_change_mean": -0.38595652766525745, + "reward_change_min": -0.6950371004641056, + "reward_change_std": 0.288301445543766, + "reward_std": 0.8526755459606647, + "rewards/cosine_scaled_reward": 0.020562471821904182, + "rewards/format_reward": 0.3750000074505806, + "step": 55 + }, + { + "advantage_max": 1.8175309002399445, + "advantage_mean": 3.725290476097598e-08, + "advantage_min": -0.8730312213301659, + "advantage_std": 0.9997734501957893, + "completion_length": 2981.937545776367, + "epoch": 0.064, + "grad_norm": 0.16157382726669312, + "kl": 6.651878356933594e-05, + "lambda_div_used": 0.6, + "learning_rate": 9.996052735444862e-07, + "loss": 0.0, + "reward": -0.13588544633239508, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.13588544633239508, + "reward_after_std": 0.5623712055385113, + "reward_before_mean": 0.22082094359211624, + "reward_before_std": 0.5218833908438683, + "reward_change_max": 0.00018957257270812988, + "reward_change_mean": -0.3567063990049064, + "reward_change_min": -0.647631298750639, + "reward_change_std": 0.25431928131729364, + "reward_std": 0.5623712316155434, + "rewards/cosine_scaled_reward": -0.09792286064475775, + "rewards/format_reward": 0.4166666716337204, + "step": 56 + }, + { + "advantage_max": 1.9298505038022995, + "advantage_mean": 7.947286273513043e-08, + "advantage_min": -0.7080076560378075, + "advantage_std": 0.9997479617595673, + "completion_length": 3066.875030517578, + "epoch": 0.06514285714285714, + "grad_norm": 0.1452140212059021, + "kl": 6.233155727386475e-05, + "lambda_div_used": 0.6, + "learning_rate": 9.994627618036452e-07, + "loss": 0.0, + "reward": -0.2912606264776514, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.2912606264776514, + "reward_after_std": 0.747662709094584, + "reward_before_mean": -0.06645372789353132, + "reward_before_std": 0.7131164316087961, + "reward_change_max": 0.0018123239278793335, + "reward_change_mean": -0.22480689152143896, + "reward_change_min": -0.4902408979833126, + "reward_change_std": 0.18422507378272712, + "reward_std": 0.7476627435535192, + "rewards/cosine_scaled_reward": -0.1998935411684215, + "rewards/format_reward": 0.33333334140479565, + "step": 57 + }, + { + "advantage_max": 1.8031867444515228, + "advantage_mean": 8.692344177774203e-09, + "advantage_min": -1.000834882259369, + "advantage_std": 0.9998651817440987, + "completion_length": 2100.375030517578, + "epoch": 0.06628571428571428, + "grad_norm": 0.2042514830827713, + "kl": 0.0006186068058013916, + "lambda_div_used": 0.6, + "learning_rate": 9.992983438818915e-07, + "loss": 0.0, + "reward": 0.3083833637647331, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.3083833637647331, + "reward_after_std": 0.9442720264196396, + "reward_before_mean": 0.8355601169168949, + "reward_before_std": 0.92253103479743, + "reward_change_max": 0.0006568878889083862, + "reward_change_mean": -0.5271767731755972, + "reward_change_min": -0.9465693905949593, + "reward_change_std": 0.38795287534594536, + "reward_std": 0.9442720338702202, + "rewards/cosine_scaled_reward": 0.053196728229522705, + "rewards/format_reward": 0.7291666716337204, + "step": 58 + }, + { + "advantage_max": 1.7461483031511307, + "advantage_mean": 3.0267983897047657e-08, + "advantage_min": -1.000838428735733, + "advantage_std": 0.9997114017605782, + "completion_length": 2875.6666717529297, + "epoch": 0.06742857142857143, + "grad_norm": 0.16439005732536316, + "kl": 3.9380043745040894e-05, + "lambda_div_used": 0.6, + "learning_rate": 9.991120277927223e-07, + "loss": 0.0, + "reward": -0.19200942106544971, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.19200942106544971, + "reward_after_std": 0.550924139097333, + "reward_before_mean": 0.14425736293196678, + "reward_before_std": 0.5736105926334858, + "reward_change_max": 0.0010673105716705322, + "reward_change_mean": -0.33626677468419075, + "reward_change_min": -0.6364939995110035, + "reward_change_std": 0.2666931441053748, + "reward_std": 0.5509241484105587, + "rewards/cosine_scaled_reward": -0.08412131667137146, + "rewards/format_reward": 0.3125, + "step": 59 + }, + { + "advantage_max": 1.8866003900766373, + "advantage_mean": 1.0554989549049765e-08, + "advantage_min": -0.8034778423607349, + "advantage_std": 0.9998001158237457, + "completion_length": 3011.6666870117188, + "epoch": 0.06857142857142857, + "grad_norm": 0.1728099286556244, + "kl": 8.880347013473511e-05, + "lambda_div_used": 0.6, + "learning_rate": 9.989038226169207e-07, + "loss": 0.0, + "reward": -0.2994096493348479, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.2994096493348479, + "reward_after_std": 0.6945554278790951, + "reward_before_mean": -0.06937994388863444, + "reward_before_std": 0.6422449797391891, + "reward_change_max": 0.0005838647484779358, + "reward_change_mean": -0.23002970311790705, + "reward_change_min": -0.4234812743961811, + "reward_change_std": 0.16400552168488503, + "reward_std": 0.6945554576814175, + "rewards/cosine_scaled_reward": -0.1909399749711156, + "rewards/format_reward": 0.31250000186264515, + "step": 60 + }, + { + "advantage_max": 1.8871331065893173, + "advantage_mean": 1.738468902168222e-08, + "advantage_min": -0.7573140487074852, + "advantage_std": 0.9997683763504028, + "completion_length": 3093.854217529297, + "epoch": 0.06971428571428571, + "grad_norm": 0.15688744187355042, + "kl": 0.00017762184143066406, + "lambda_div_used": 0.6, + "learning_rate": 9.98673738502114e-07, + "loss": 0.0, + "reward": -0.11309731751680374, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.11309731751680374, + "reward_after_std": 0.5219951793551445, + "reward_before_mean": 0.26111954916268587, + "reward_before_std": 0.4263201951980591, + "reward_change_max": 0.00035150349140167236, + "reward_change_mean": -0.3742168480530381, + "reward_change_min": -0.591254610568285, + "reward_change_std": 0.23080396559089422, + "reward_std": 0.5219952017068863, + "rewards/cosine_scaled_reward": -0.10902357054874301, + "rewards/format_reward": 0.4791666753590107, + "step": 61 + }, + { + "advantage_max": 1.852844849228859, + "advantage_mean": -1.459072043741294e-08, + "advantage_min": -0.884965643286705, + "advantage_std": 0.9998951032757759, + "completion_length": 2501.291748046875, + "epoch": 0.07085714285714285, + "grad_norm": 0.21886730194091797, + "kl": 0.0006071189418435097, + "lambda_div_used": 0.6, + "learning_rate": 9.98421786662277e-07, + "loss": 0.0, + "reward": 0.2706692605279386, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.2706692605279386, + "reward_after_std": 1.1487024202942848, + "reward_before_mean": 0.7268138364888728, + "reward_before_std": 1.1312339007854462, + "reward_change_max": 0.0002567693591117859, + "reward_change_mean": -0.45614459551870823, + "reward_change_min": -0.9497322998940945, + "reward_change_std": 0.35773101449012756, + "reward_std": 1.1487024575471878, + "rewards/cosine_scaled_reward": 0.05090692872181535, + "rewards/format_reward": 0.6250000055879354, + "step": 62 + }, + { + "advantage_max": 1.7459149807691574, + "advantage_mean": -7.450581207546492e-09, + "advantage_min": -0.9959862679243088, + "advantage_std": 0.9998425021767616, + "completion_length": 2272.5208892822266, + "epoch": 0.072, + "grad_norm": 0.2128225415945053, + "kl": 0.0005265623331069946, + "lambda_div_used": 0.6, + "learning_rate": 9.981479793771866e-07, + "loss": 0.0, + "reward": 0.3422083929181099, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.3422083929181099, + "reward_after_std": 0.9376309607177973, + "reward_before_mean": 0.9017110355198383, + "reward_before_std": 0.9917434807866812, + "reward_change_max": 0.00043205171823501587, + "reward_change_mean": -0.559502637013793, + "reward_change_min": -1.0579553097486496, + "reward_change_std": 0.43955489341169596, + "reward_std": 0.9376309644430876, + "rewards/cosine_scaled_reward": 0.07585549168288708, + "rewards/format_reward": 0.7500000149011612, + "step": 63 + }, + { + "advantage_max": 1.7108050882816315, + "advantage_mean": -3.104408607956799e-08, + "advantage_min": -1.0726191624999046, + "advantage_std": 0.999819926917553, + "completion_length": 2815.2083892822266, + "epoch": 0.07314285714285715, + "grad_norm": 0.18735727667808533, + "kl": 0.00014835596084594727, + "lambda_div_used": 0.6, + "learning_rate": 9.97852329991824e-07, + "loss": 0.0, + "reward": 0.1047421507537365, + "reward_advantage_correlation": 0.9999999999999994, + "reward_after_mean": 0.1047421507537365, + "reward_after_std": 0.6933653056621552, + "reward_before_mean": 0.5733803249895573, + "reward_before_std": 0.703533660620451, + "reward_change_max": 0.0035287141799926758, + "reward_change_mean": -0.46863821521401405, + "reward_change_min": -0.8130603022873402, + "reward_change_std": 0.33382952213287354, + "reward_std": 0.6933653354644775, + "rewards/cosine_scaled_reward": 0.057523492723703384, + "rewards/format_reward": 0.4583333469927311, + "step": 64 + }, + { + "advantage_max": 1.901139497756958, + "advantage_mean": -1.4901161193847656e-08, + "advantage_min": -0.7846154049038887, + "advantage_std": 0.9998325034976006, + "completion_length": 2725.645881652832, + "epoch": 0.07428571428571429, + "grad_norm": 0.20273591578006744, + "kl": 0.00024253875017166138, + "lambda_div_used": 0.6, + "learning_rate": 9.975348529157229e-07, + "loss": 0.0, + "reward": -0.02569293975830078, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.02569293975830078, + "reward_after_std": 0.7628384791314602, + "reward_before_mean": 0.34538572560995817, + "reward_before_std": 0.7073428109288216, + "reward_change_max": 0.0, + "reward_change_mean": -0.371078678406775, + "reward_change_min": -0.7337826155126095, + "reward_change_std": 0.275781849399209, + "reward_std": 0.7628384865820408, + "rewards/cosine_scaled_reward": -0.07730714417994022, + "rewards/format_reward": 0.5000000055879354, + "step": 65 + }, + { + "advantage_max": 1.8033942729234695, + "advantage_mean": 8.071463331038586e-09, + "advantage_min": -0.8609968945384026, + "advantage_std": 0.9998007789254189, + "completion_length": 2064.6250076293945, + "epoch": 0.07542857142857143, + "grad_norm": 0.24309836328029633, + "kl": 0.0003443807363510132, + "lambda_div_used": 0.6, + "learning_rate": 9.971955636222684e-07, + "loss": 0.0, + "reward": 0.10103908181190491, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.10103908181190491, + "reward_after_std": 0.6284786872565746, + "reward_before_mean": 0.5767052434384823, + "reward_before_std": 0.5687633650377393, + "reward_change_max": 0.0008361414074897766, + "reward_change_mean": -0.47566617419943213, + "reward_change_min": -0.8255947642028332, + "reward_change_std": 0.3140420475974679, + "reward_std": 0.6284786984324455, + "rewards/cosine_scaled_reward": 0.038352612406015396, + "rewards/format_reward": 0.5, + "step": 66 + }, + { + "advantage_max": 1.7674315869808197, + "advantage_mean": 1.614292477469803e-08, + "advantage_min": -0.938122421503067, + "advantage_std": 0.9997090101242065, + "completion_length": 3382.8541870117188, + "epoch": 0.07657142857142857, + "grad_norm": 0.13286292552947998, + "kl": 0.0003947615623474121, + "lambda_div_used": 0.6, + "learning_rate": 9.968344786479415e-07, + "loss": 0.0, + "reward": -0.4611010178923607, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.4611010178923607, + "reward_after_std": 0.46187240816652775, + "reward_before_mean": -0.26146249906742014, + "reward_before_std": 0.4861889239400625, + "reward_change_max": 0.0009711086750030518, + "reward_change_mean": -0.19963854388333857, + "reward_change_min": -0.45800918713212013, + "reward_change_std": 0.1788791799917817, + "reward_std": 0.4618724286556244, + "rewards/cosine_scaled_reward": -0.2348979152739048, + "rewards/format_reward": 0.2083333358168602, + "step": 67 + }, + { + "advantage_max": 1.8469493240118027, + "advantage_mean": 2.483527050678447e-09, + "advantage_min": -0.8409870713949203, + "advantage_std": 0.9998388886451721, + "completion_length": 2142.208381652832, + "epoch": 0.07771428571428571, + "grad_norm": 0.2571074366569519, + "kl": 0.0017017126083374023, + "lambda_div_used": 0.6, + "learning_rate": 9.964516155915151e-07, + "loss": 0.0001, + "reward": 0.0018315929919481277, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.0018315929919481277, + "reward_after_std": 0.8450445830821991, + "reward_before_mean": 0.3761431612074375, + "reward_before_std": 0.8380554802715778, + "reward_change_max": 0.0017983242869377136, + "reward_change_mean": -0.37431156635284424, + "reward_change_min": -0.8194547779858112, + "reward_change_std": 0.30843253154307604, + "reward_std": 0.8450446091592312, + "rewards/cosine_scaled_reward": -0.11401176685467362, + "rewards/format_reward": 0.6041666716337204, + "step": 68 + }, + { + "advantage_max": 1.8372850269079208, + "advantage_mean": 4.0667754774847964e-08, + "advantage_min": -0.8896334916353226, + "advantage_std": 0.9997574761509895, + "completion_length": 2471.770881652832, + "epoch": 0.07885714285714286, + "grad_norm": 0.2565942108631134, + "kl": 0.001112222671508789, + "lambda_div_used": 0.6, + "learning_rate": 9.960469931131936e-07, + "loss": 0.0, + "reward": -0.27644870735821314, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.27644870735821314, + "reward_after_std": 0.5243825484067202, + "reward_before_mean": 0.007322388701140881, + "reward_before_std": 0.5041719619184732, + "reward_change_max": 0.0011881962418556213, + "reward_change_mean": -0.2837710939347744, + "reward_change_min": -0.5716921575367451, + "reward_change_std": 0.21774671506136656, + "reward_std": 0.5243825595825911, + "rewards/cosine_scaled_reward": -0.23592214786913246, + "rewards/format_reward": 0.4791666679084301, + "step": 69 + }, + { + "advantage_max": 1.8310918658971786, + "advantage_mean": 4.346172166602713e-08, + "advantage_min": -0.7979481443762779, + "advantage_std": 0.9997915923595428, + "completion_length": 3018.1458587646484, + "epoch": 0.08, + "grad_norm": 0.16229884326457977, + "kl": 0.0007503684610128403, + "lambda_div_used": 0.6, + "learning_rate": 9.956206309337066e-07, + "loss": 0.0, + "reward": -0.2695963028818369, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.2695963028818369, + "reward_after_std": 0.6270924657583237, + "reward_before_mean": -0.003460092470049858, + "reward_before_std": 0.6113432552665472, + "reward_change_max": 0.0, + "reward_change_mean": -0.2661362048238516, + "reward_change_min": -0.5505039319396019, + "reward_change_std": 0.22055453341454268, + "reward_std": 0.6270924992859364, + "rewards/cosine_scaled_reward": -0.15798005042597651, + "rewards/format_reward": 0.31250000186264515, + "step": 70 + }, + { + "advantage_max": 1.70066799223423, + "advantage_mean": 5.091230215192866e-08, + "advantage_min": -0.9549814835190773, + "advantage_std": 0.9997783005237579, + "completion_length": 2823.1666946411133, + "epoch": 0.08114285714285714, + "grad_norm": 0.24718300998210907, + "kl": 0.0011773109436035156, + "lambda_div_used": 0.6, + "learning_rate": 9.951725498333448e-07, + "loss": 0.0, + "reward": -0.15892398823052645, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.15892398823052645, + "reward_after_std": 0.6443887799978256, + "reward_before_mean": 0.17340507730841637, + "reward_before_std": 0.6603858508169651, + "reward_change_max": 0.0036379098892211914, + "reward_change_mean": -0.33232905343174934, + "reward_change_min": -0.6937352381646633, + "reward_change_std": 0.27722177281975746, + "reward_std": 0.6443887986242771, + "rewards/cosine_scaled_reward": -0.06954746786504984, + "rewards/format_reward": 0.3125, + "step": 71 + }, + { + "advantage_max": 1.7955728471279144, + "advantage_mean": 2.6930746410691597e-08, + "advantage_min": -0.9999164938926697, + "advantage_std": 0.9997884854674339, + "completion_length": 2870.666717529297, + "epoch": 0.08228571428571428, + "grad_norm": 0.20824484527111053, + "kl": 0.0013428330421447754, + "lambda_div_used": 0.6, + "learning_rate": 9.947027716509488e-07, + "loss": 0.0001, + "reward": -0.32331820391118526, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.32331820391118526, + "reward_after_std": 0.5827282182872295, + "reward_before_mean": -0.07646113401278853, + "reward_before_std": 0.5897139385342598, + "reward_change_max": 0.0016376525163650513, + "reward_change_mean": -0.2468570563942194, + "reward_change_min": -0.4943818226456642, + "reward_change_std": 0.20523380488157272, + "reward_std": 0.5827282220125198, + "rewards/cosine_scaled_reward": -0.21531390957534313, + "rewards/format_reward": 0.354166679084301, + "step": 72 + }, + { + "advantage_max": 1.773019254207611, + "advantage_mean": 5.494803217986899e-08, + "advantage_min": -0.9967968463897705, + "advantage_std": 0.999751403927803, + "completion_length": 3529.7916870117188, + "epoch": 0.08342857142857144, + "grad_norm": 0.14700362086296082, + "kl": 0.00020443648099899292, + "lambda_div_used": 0.6, + "learning_rate": 9.942113192828444e-07, + "loss": 0.0, + "reward": -0.40370890498161316, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.40370890498161316, + "reward_after_std": 0.5996265262365341, + "reward_before_mean": -0.20144187100231647, + "reward_before_std": 0.6425512656569481, + "reward_change_max": 0.0006123930215835571, + "reward_change_mean": -0.2022670367732644, + "reward_change_min": -0.49701910093426704, + "reward_change_std": 0.2075851233676076, + "reward_std": 0.5996265448629856, + "rewards/cosine_scaled_reward": -0.15280426386743784, + "rewards/format_reward": 0.1041666679084301, + "step": 73 + }, + { + "advantage_max": 1.9407240599393845, + "advantage_mean": 1.3659398390153399e-08, + "advantage_min": -0.7461787275969982, + "advantage_std": 0.9998097345232964, + "completion_length": 3243.229217529297, + "epoch": 0.08457142857142858, + "grad_norm": 0.17531763017177582, + "kl": 0.0008713230490684509, + "lambda_div_used": 0.6, + "learning_rate": 9.93698216681727e-07, + "loss": 0.0, + "reward": -0.09746089670807123, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.09746089670807123, + "reward_after_std": 0.8146995007991791, + "reward_before_mean": 0.2181630413979292, + "reward_before_std": 0.7365536075085402, + "reward_change_max": 0.0005424246191978455, + "reward_change_mean": -0.31562393717467785, + "reward_change_min": -0.53596480935812, + "reward_change_std": 0.21479328256100416, + "reward_std": 0.8146995343267918, + "rewards/cosine_scaled_reward": -0.015918486984446645, + "rewards/format_reward": 0.2500000037252903, + "step": 74 + }, + { + "advantage_max": 1.8912398666143417, + "advantage_mean": 9.93410831373609e-09, + "advantage_min": -0.7490442171692848, + "advantage_std": 0.9998228922486305, + "completion_length": 2986.625045776367, + "epoch": 0.08571428571428572, + "grad_norm": 0.15843676030635834, + "kl": 0.0008603427559137344, + "lambda_div_used": 0.6, + "learning_rate": 9.931634888554935e-07, + "loss": 0.0, + "reward": 0.004519036039710045, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.004519036039710045, + "reward_after_std": 0.7009322084486485, + "reward_before_mean": 0.40569755621254444, + "reward_before_std": 0.6009742096066475, + "reward_change_max": 0.00026772916316986084, + "reward_change_mean": -0.40117852203547955, + "reward_change_min": -0.6780325099825859, + "reward_change_std": 0.2633136510848999, + "reward_std": 0.7009322196245193, + "rewards/cosine_scaled_reward": 0.0153487678617239, + "rewards/format_reward": 0.3750000037252903, + "step": 75 + }, + { + "advantage_max": 1.8082723319530487, + "advantage_mean": 3.0423204899765466e-08, + "advantage_min": -0.9330036044120789, + "advantage_std": 0.9997623041272163, + "completion_length": 3001.8541870117188, + "epoch": 0.08685714285714285, + "grad_norm": 0.20225301384925842, + "kl": 0.00020218640565872192, + "lambda_div_used": 0.6, + "learning_rate": 9.926071618660237e-07, + "loss": 0.0, + "reward": -0.1935147661715746, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.1935147661715746, + "reward_after_std": 0.45791828259825706, + "reward_before_mean": 0.1532026305794716, + "reward_before_std": 0.4037418030202389, + "reward_change_max": 0.0013406574726104736, + "reward_change_mean": -0.34671736136078835, + "reward_change_min": -0.5875682011246681, + "reward_change_std": 0.2296199956908822, + "reward_std": 0.45791828632354736, + "rewards/cosine_scaled_reward": -0.1317320466041565, + "rewards/format_reward": 0.41666667349636555, + "step": 76 + }, + { + "advantage_max": 1.8107152730226517, + "advantage_mean": 3.849466734262563e-08, + "advantage_min": -0.9610567763447762, + "advantage_std": 0.9997978210449219, + "completion_length": 3046.5416870117188, + "epoch": 0.088, + "grad_norm": 0.16308292746543884, + "kl": 0.0002619922161102295, + "lambda_div_used": 0.6, + "learning_rate": 9.9202926282791e-07, + "loss": 0.0, + "reward": -0.2058363500982523, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.2058363500982523, + "reward_after_std": 0.5894700028002262, + "reward_before_mean": 0.10926695168018341, + "reward_before_std": 0.5918228011578321, + "reward_change_max": 0.0014843419194221497, + "reward_change_mean": -0.3151032840833068, + "reward_change_min": -0.5777905434370041, + "reward_change_std": 0.23432088736444712, + "reward_std": 0.5894700065255165, + "rewards/cosine_scaled_reward": -0.13286652602255344, + "rewards/format_reward": 0.37500001676380634, + "step": 77 + }, + { + "advantage_max": 1.810091182589531, + "advantage_mean": 3.04232042891428e-08, + "advantage_min": -1.0055750012397766, + "advantage_std": 0.9997722059488297, + "completion_length": 3185.8750610351562, + "epoch": 0.08914285714285715, + "grad_norm": 0.17335101962089539, + "kl": 0.00017173215746879578, + "lambda_div_used": 0.6, + "learning_rate": 9.91429819907136e-07, + "loss": 0.0, + "reward": -0.12524622678756714, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.12524622678756714, + "reward_after_std": 0.6566737592220306, + "reward_before_mean": 0.2210826389491558, + "reward_before_std": 0.6558897197246552, + "reward_change_max": 0.0002913177013397217, + "reward_change_mean": -0.3463288554921746, + "reward_change_min": -0.6442321985960007, + "reward_change_std": 0.26683663809672, + "reward_std": 0.6566737834364176, + "rewards/cosine_scaled_reward": -0.06654200842604041, + "rewards/format_reward": 0.3541666753590107, + "step": 78 + }, + { + "advantage_max": 1.8899707645177841, + "advantage_mean": 1.0244548626081595e-08, + "advantage_min": -0.825028270483017, + "advantage_std": 0.9998388290405273, + "completion_length": 2240.5625228881836, + "epoch": 0.09028571428571429, + "grad_norm": 0.24236978590488434, + "kl": 0.0011141449213027954, + "lambda_div_used": 0.6, + "learning_rate": 9.908088623197048e-07, + "loss": 0.0, + "reward": 0.07283397391438484, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.07283397391438484, + "reward_after_std": 0.7937607131898403, + "reward_before_mean": 0.4887027923250571, + "reward_before_std": 0.7088694777339697, + "reward_change_max": 0.00013114511966705322, + "reward_change_mean": -0.4158688234165311, + "reward_change_min": -0.7517045177519321, + "reward_change_std": 0.2835696069523692, + "reward_std": 0.7937607355415821, + "rewards/cosine_scaled_reward": -0.05773193761706352, + "rewards/format_reward": 0.6041666697710752, + "step": 79 + }, + { + "advantage_max": 1.8568548262119293, + "advantage_mean": -1.614292521878724e-08, + "advantage_min": -0.837759755551815, + "advantage_std": 0.9997959807515144, + "completion_length": 3219.875030517578, + "epoch": 0.09142857142857143, + "grad_norm": 0.1928330361843109, + "kl": 0.0007643923163414001, + "lambda_div_used": 0.6, + "learning_rate": 9.901664203302124e-07, + "loss": 0.0, + "reward": -0.28942321287468076, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.28942321287468076, + "reward_after_std": 0.7306128852069378, + "reward_before_mean": -0.057574108242988586, + "reward_before_std": 0.7140483744442463, + "reward_change_max": 0.0010956153273582458, + "reward_change_mean": -0.2318491260521114, + "reward_change_min": -0.5317598357796669, + "reward_change_std": 0.19940282963216305, + "reward_std": 0.7306128889322281, + "rewards/cosine_scaled_reward": -0.15378706716001034, + "rewards/format_reward": 0.2500000037252903, + "step": 80 + }, + { + "advantage_max": 1.7959967106580734, + "advantage_mean": -1.1796753740522803e-08, + "advantage_min": -0.9497917145490646, + "advantage_std": 0.9997836574912071, + "completion_length": 3129.7083435058594, + "epoch": 0.09257142857142857, + "grad_norm": 0.27863961458206177, + "kl": 0.002306640148162842, + "lambda_div_used": 0.6, + "learning_rate": 9.895025252503755e-07, + "loss": 0.0001, + "reward": -0.2474798383191228, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.2474798383191228, + "reward_after_std": 0.5966678149998188, + "reward_before_mean": 0.03739765891805291, + "reward_before_std": 0.5835473164916039, + "reward_change_max": 0.001976579427719116, + "reward_change_mean": -0.2848775088787079, + "reward_change_min": -0.5357042364776134, + "reward_change_std": 0.21301922667771578, + "reward_std": 0.5966678410768509, + "rewards/cosine_scaled_reward": -0.1479678377509117, + "rewards/format_reward": 0.3333333358168602, + "step": 81 + }, + { + "advantage_max": 1.8437489867210388, + "advantage_mean": 4.967054767490708e-09, + "advantage_min": -0.8466271758079529, + "advantage_std": 0.9998117461800575, + "completion_length": 2898.0208587646484, + "epoch": 0.09371428571428571, + "grad_norm": 0.18406398594379425, + "kl": 0.0025584548711776733, + "lambda_div_used": 0.6, + "learning_rate": 9.888172094375033e-07, + "loss": 0.0001, + "reward": -0.016393298865295947, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.016393298865295947, + "reward_after_std": 0.7960129491984844, + "reward_before_mean": 0.35885727778077126, + "reward_before_std": 0.7870577052235603, + "reward_change_max": 0.0004786178469657898, + "reward_change_mean": -0.37525059189647436, + "reward_change_min": -0.7115802094340324, + "reward_change_std": 0.2880655792541802, + "reward_std": 0.7960129864513874, + "rewards/cosine_scaled_reward": 0.0023453044705092907, + "rewards/format_reward": 0.35416666977107525, + "step": 82 + }, + { + "advantage_max": 1.7741630524396896, + "advantage_mean": 1.9868215961338365e-08, + "advantage_min": -1.0308396220207214, + "advantage_std": 0.9997642710804939, + "completion_length": 2784.2916870117188, + "epoch": 0.09485714285714286, + "grad_norm": 0.2609867751598358, + "kl": 0.0011954903602600098, + "lambda_div_used": 0.6, + "learning_rate": 9.881105062929221e-07, + "loss": 0.0, + "reward": -0.2998163793236017, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.2998163793236017, + "reward_after_std": 0.4641554616391659, + "reward_before_mean": -0.013990622013807297, + "reward_before_std": 0.4607525132596493, + "reward_change_max": 0.0005547106266021729, + "reward_change_mean": -0.2858257554471493, + "reward_change_min": -0.5067390538752079, + "reward_change_std": 0.2103829812258482, + "reward_std": 0.4641554690897465, + "rewards/cosine_scaled_reward": -0.1632453203201294, + "rewards/format_reward": 0.31250000186264515, + "step": 83 + }, + { + "advantage_max": 1.7930245250463486, + "advantage_mean": 3.104408841103634e-09, + "advantage_min": -0.8693301230669022, + "advantage_std": 0.9997679516673088, + "completion_length": 3040.5000534057617, + "epoch": 0.096, + "grad_norm": 0.1649995744228363, + "kl": 0.0007286667823791504, + "lambda_div_used": 0.6, + "learning_rate": 9.873824502603459e-07, + "loss": 0.0, + "reward": -0.07544983178377151, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.07544983178377151, + "reward_after_std": 0.7895570639520884, + "reward_before_mean": 0.27497592754662037, + "reward_before_std": 0.830137187615037, + "reward_change_max": 0.0006811842322349548, + "reward_change_mean": -0.35042573790997267, + "reward_change_min": -0.7961104810237885, + "reward_change_std": 0.3109002844430506, + "reward_std": 0.7895570639520884, + "rewards/cosine_scaled_reward": -0.060428719967603683, + "rewards/format_reward": 0.39583334140479565, + "step": 84 + }, + { + "advantage_max": 1.888422742486, + "advantage_mean": 6.208817349140361e-09, + "advantage_min": -0.79921755194664, + "advantage_std": 0.9998757466673851, + "completion_length": 3134.6250610351562, + "epoch": 0.09714285714285714, + "grad_norm": 0.168262779712677, + "kl": 0.00031495094299316406, + "lambda_div_used": 0.6, + "learning_rate": 9.866330768241983e-07, + "loss": 0.0, + "reward": -0.060326272854581475, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.060326272854581475, + "reward_after_std": 1.0223791375756264, + "reward_before_mean": 0.23499422008171678, + "reward_before_std": 0.9955884180963039, + "reward_change_max": 0.0014316290616989136, + "reward_change_mean": -0.2953204959630966, + "reward_change_min": -0.6588696278631687, + "reward_change_std": 0.255947170779109, + "reward_std": 1.0223791673779488, + "rewards/cosine_scaled_reward": -0.0804195562377572, + "rewards/format_reward": 0.39583333767950535, + "step": 85 + }, + { + "advantage_max": 1.8527948707342148, + "advantage_mean": 1.9868215850316062e-08, + "advantage_min": -0.7902801856398582, + "advantage_std": 0.9998051598668098, + "completion_length": 2867.5625, + "epoch": 0.09828571428571428, + "grad_norm": 0.17173554003238678, + "kl": 0.0012168288230895996, + "lambda_div_used": 0.6, + "learning_rate": 9.85862422507884e-07, + "loss": 0.0, + "reward": -0.15069702547043562, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.15069702547043562, + "reward_after_std": 0.7066328749060631, + "reward_before_mean": 0.1656637266278267, + "reward_before_std": 0.6682115383446217, + "reward_change_max": 0.0008284449577331543, + "reward_change_mean": -0.31636073999106884, + "reward_change_min": -0.6082081608474255, + "reward_change_std": 0.24262152425944805, + "reward_std": 0.706632886081934, + "rewards/cosine_scaled_reward": -0.14633481635246426, + "rewards/format_reward": 0.4583333358168602, + "step": 86 + }, + { + "advantage_max": 1.8305644243955612, + "advantage_mean": -2.173085600354341e-09, + "advantage_min": -0.8854763805866241, + "advantage_std": 0.9998243600130081, + "completion_length": 2582.4375534057617, + "epoch": 0.09942857142857142, + "grad_norm": 0.23738721013069153, + "kl": 0.002800785005092621, + "lambda_div_used": 0.6, + "learning_rate": 9.850705248720068e-07, + "loss": 0.0001, + "reward": 0.013825018191710114, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.013825018191710114, + "reward_after_std": 0.6578258685767651, + "reward_before_mean": 0.4317212179303169, + "reward_before_std": 0.6047324761748314, + "reward_change_max": 0.0030826181173324585, + "reward_change_mean": -0.41789623629301786, + "reward_change_min": -0.7545126043260098, + "reward_change_std": 0.2849441980943084, + "reward_std": 0.6578258872032166, + "rewards/cosine_scaled_reward": -0.05497271195054054, + "rewards/format_reward": 0.541666679084301, + "step": 87 + }, + { + "advantage_max": 1.8509272336959839, + "advantage_mean": -5.4637592616924024e-08, + "advantage_min": -0.884626179933548, + "advantage_std": 0.99986432492733, + "completion_length": 2470.08341217041, + "epoch": 0.10057142857142858, + "grad_norm": 0.2240888774394989, + "kl": 0.0013819336891174316, + "lambda_div_used": 0.6, + "learning_rate": 9.8425742251254e-07, + "loss": 0.0001, + "reward": 0.37665122747421265, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.37665122747421265, + "reward_after_std": 0.867752481251955, + "reward_before_mean": 0.9529164787381887, + "reward_before_std": 0.7872258014976978, + "reward_change_max": 0.0012845396995544434, + "reward_change_mean": -0.5762653043493629, + "reward_change_min": -0.9866040423512459, + "reward_change_std": 0.3834084654226899, + "reward_std": 0.8677525036036968, + "rewards/cosine_scaled_reward": 0.1535415492253378, + "rewards/format_reward": 0.6458333376795053, + "step": 88 + }, + { + "advantage_max": 1.7756440043449402, + "advantage_mean": 3.880510621168121e-09, + "advantage_min": -0.9730030819773674, + "advantage_std": 0.9998276829719543, + "completion_length": 3276.979217529297, + "epoch": 0.10171428571428572, + "grad_norm": 0.22678594291210175, + "kl": 0.0013582706451416016, + "lambda_div_used": 0.6, + "learning_rate": 9.83423155058946e-07, + "loss": 0.0001, + "reward": -0.16686965757980943, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.16686965757980943, + "reward_after_std": 0.7788497358560562, + "reward_before_mean": 0.127953777089715, + "reward_before_std": 0.8071900904178619, + "reward_change_max": 0.001987569034099579, + "reward_change_mean": -0.29482342256233096, + "reward_change_min": -0.643274899572134, + "reward_change_std": 0.2588015152141452, + "reward_std": 0.7788497470319271, + "rewards/cosine_scaled_reward": -0.08185644680634141, + "rewards/format_reward": 0.2916666753590107, + "step": 89 + }, + { + "advantage_max": 1.9292222708463669, + "advantage_mean": 2.6077032533322608e-08, + "advantage_min": -0.7419086024165154, + "advantage_std": 0.9997998923063278, + "completion_length": 2387.083366394043, + "epoch": 0.10285714285714286, + "grad_norm": 0.37300607562065125, + "kl": 0.0021688640117645264, + "lambda_div_used": 0.6, + "learning_rate": 9.825677631722435e-07, + "loss": 0.0001, + "reward": -0.13474958203732967, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.13474958203732967, + "reward_after_std": 0.7872820869088173, + "reward_before_mean": 0.16399362310767174, + "reward_before_std": 0.7284187152981758, + "reward_change_max": 0.0013339966535568237, + "reward_change_mean": -0.2987432088702917, + "reward_change_min": -0.5778307020664215, + "reward_change_std": 0.22088597994297743, + "reward_std": 0.78728212043643, + "rewards/cosine_scaled_reward": -0.178419857596964, + "rewards/format_reward": 0.5208333395421505, + "step": 90 + }, + { + "advantage_max": 1.7450472861528397, + "advantage_mean": 2.4059167547108018e-08, + "advantage_min": -1.000647023320198, + "advantage_std": 0.9997962266206741, + "completion_length": 3159.437530517578, + "epoch": 0.104, + "grad_norm": 0.1630987524986267, + "kl": 0.0008542463183403015, + "lambda_div_used": 0.6, + "learning_rate": 9.816912885430258e-07, + "loss": 0.0, + "reward": -0.18008676916360855, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.18008676916360855, + "reward_after_std": 0.6187229715287685, + "reward_before_mean": 0.14918549545109272, + "reward_before_std": 0.6627273000776768, + "reward_change_max": 0.0004763081669807434, + "reward_change_mean": -0.3292722823098302, + "reward_change_min": -0.6484544016420841, + "reward_change_std": 0.2739113047719002, + "reward_std": 0.6187229789793491, + "rewards/cosine_scaled_reward": -0.11290724948048592, + "rewards/format_reward": 0.37500001676380634, + "step": 91 + }, + { + "advantage_max": 1.8468924760818481, + "advantage_mean": -3.104408841103634e-09, + "advantage_min": -0.9322610720992088, + "advantage_std": 0.9997585564851761, + "completion_length": 2569.8333892822266, + "epoch": 0.10514285714285715, + "grad_norm": 0.25294139981269836, + "kl": 0.0018187016248703003, + "lambda_div_used": 0.6, + "learning_rate": 9.807937738894303e-07, + "loss": 0.0001, + "reward": -0.22114436700940132, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.22114436700940132, + "reward_after_std": 0.5883230771869421, + "reward_before_mean": 0.08094780705869198, + "reward_before_std": 0.558818282559514, + "reward_change_max": 1.0579824447631836e-05, + "reward_change_mean": -0.302092173602432, + "reward_change_min": -0.6010795347392559, + "reward_change_std": 0.22603166941553354, + "reward_std": 0.5883230995386839, + "rewards/cosine_scaled_reward": -0.1886927718296647, + "rewards/format_reward": 0.45833334513008595, + "step": 92 + }, + { + "advantage_max": 1.7818111181259155, + "advantage_mean": 3.787378544117814e-08, + "advantage_min": -0.8869654014706612, + "advantage_std": 0.999727226793766, + "completion_length": 3536.9791870117188, + "epoch": 0.10628571428571429, + "grad_norm": 0.1902371197938919, + "kl": 0.0012578368186950684, + "lambda_div_used": 0.6, + "learning_rate": 9.798752629550546e-07, + "loss": 0.0001, + "reward": -0.6223237328231335, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.6223237328231335, + "reward_after_std": 0.3858645409345627, + "reward_before_mean": -0.4991328977048397, + "reward_before_std": 0.409189336001873, + "reward_change_max": 0.003185272216796875, + "reward_change_mean": -0.1231908411718905, + "reward_change_min": -0.2996739409863949, + "reward_change_std": 0.12465427769348025, + "reward_std": 0.3858645521104336, + "rewards/cosine_scaled_reward": -0.2703997865319252, + "rewards/format_reward": 0.0416666679084301, + "step": 93 + }, + { + "advantage_max": 1.889383926987648, + "advantage_mean": -1.4901161082825354e-08, + "advantage_min": -0.879096981137991, + "advantage_std": 0.999787226319313, + "completion_length": 3138.395866394043, + "epoch": 0.10742857142857143, + "grad_norm": 0.22059182822704315, + "kl": 0.001993924379348755, + "lambda_div_used": 0.6, + "learning_rate": 9.78935800506826e-07, + "loss": 0.0001, + "reward": -0.1963297836482525, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.1963297836482525, + "reward_after_std": 0.5589430667459965, + "reward_before_mean": 0.12045633129309863, + "reward_before_std": 0.4760522823780775, + "reward_change_max": 0.0003600567579269409, + "reward_change_mean": -0.3167861073743552, + "reward_change_min": -0.4744899459183216, + "reward_change_std": 0.20194733957760036, + "reward_std": 0.5589430965483189, + "rewards/cosine_scaled_reward": -0.06477185152471066, + "rewards/format_reward": 0.2500000037252903, + "step": 94 + }, + { + "advantage_max": 1.7795635610818863, + "advantage_mean": 5.4948036121160726e-08, + "advantage_min": -0.9644187763333321, + "advantage_std": 0.9996979087591171, + "completion_length": 3409.187530517578, + "epoch": 0.10857142857142857, + "grad_norm": 0.19662852585315704, + "kl": 0.00038304924964904785, + "lambda_div_used": 0.6, + "learning_rate": 9.779754323328192e-07, + "loss": 0.0, + "reward": -0.594087558798492, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.594087558798492, + "reward_after_std": 0.36136773228645325, + "reward_before_mean": -0.44723618403077126, + "reward_before_std": 0.3845880515873432, + "reward_change_max": 0.0, + "reward_change_mean": -0.14685136824846268, + "reward_change_min": -0.3474042974412441, + "reward_change_std": 0.13843790628015995, + "reward_std": 0.36136773973703384, + "rewards/cosine_scaled_reward": -0.2757014315575361, + "rewards/format_reward": 0.10416666977107525, + "step": 95 + }, + { + "advantage_max": 1.834953397512436, + "advantage_mean": 9.623667196478891e-09, + "advantage_min": -0.9561122506856918, + "advantage_std": 0.999815046787262, + "completion_length": 2898.5, + "epoch": 0.10971428571428571, + "grad_norm": 0.19801871478557587, + "kl": 0.0019068419933319092, + "lambda_div_used": 0.6, + "learning_rate": 9.769942052400235e-07, + "loss": 0.0001, + "reward": -0.05706032179296017, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.05706032179296017, + "reward_after_std": 0.7747535556554794, + "reward_before_mean": 0.2972187213599682, + "reward_before_std": 0.755788192152977, + "reward_change_max": 0.0006535649299621582, + "reward_change_mean": -0.3542790412902832, + "reward_change_min": -0.6666507758200169, + "reward_change_std": 0.2610515356063843, + "reward_std": 0.7747535966336727, + "rewards/cosine_scaled_reward": -0.02847396954894066, + "rewards/format_reward": 0.3541666753590107, + "step": 96 + }, + { + "advantage_max": 1.7371751815080643, + "advantage_mean": -6.208815683805824e-10, + "advantage_min": -1.0732538476586342, + "advantage_std": 0.9997835084795952, + "completion_length": 3111.3334045410156, + "epoch": 0.11085714285714286, + "grad_norm": 0.21298745274543762, + "kl": 0.000819966197013855, + "lambda_div_used": 0.6, + "learning_rate": 9.759921670520634e-07, + "loss": 0.0, + "reward": -0.12749477475881577, + "reward_advantage_correlation": 0.9999999999999997, + "reward_after_mean": -0.12749477475881577, + "reward_after_std": 0.6070926785469055, + "reward_before_mean": 0.2340460466220975, + "reward_before_std": 0.6491605900228024, + "reward_change_max": 0.0008164122700691223, + "reward_change_mean": -0.36154082510620356, + "reward_change_min": -0.658284705132246, + "reward_change_std": 0.2894742302596569, + "reward_std": 0.6070926897227764, + "rewards/cosine_scaled_reward": -0.04964364320039749, + "rewards/format_reward": 0.33333334513008595, + "step": 97 + }, + { + "advantage_max": 1.8331523686647415, + "advantage_mean": 6.33299377383878e-08, + "advantage_min": -0.9393685981631279, + "advantage_std": 0.9997893199324608, + "completion_length": 2636.229232788086, + "epoch": 0.112, + "grad_norm": 0.19141905009746552, + "kl": 0.0006305575370788574, + "lambda_div_used": 0.6, + "learning_rate": 9.749693666068663e-07, + "loss": 0.0, + "reward": -0.04186771437525749, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.04186771437525749, + "reward_after_std": 0.6713929902762175, + "reward_before_mean": 0.34054063144139946, + "reward_before_std": 0.6036591455340385, + "reward_change_max": 0.0006186738610267639, + "reward_change_mean": -0.3824083199724555, + "reward_change_min": -0.640322033315897, + "reward_change_std": 0.25615993700921535, + "reward_std": 0.6713930163532495, + "rewards/cosine_scaled_reward": -0.09014636965002865, + "rewards/format_reward": 0.5208333488553762, + "step": 98 + }, + { + "advantage_max": 1.785209521651268, + "advantage_mean": 4.0357311326122414e-08, + "advantage_min": -0.9890889748930931, + "advantage_std": 0.9997483119368553, + "completion_length": 2831.1041717529297, + "epoch": 0.11314285714285714, + "grad_norm": 0.18398259580135345, + "kl": 0.0009243488311767578, + "lambda_div_used": 0.6, + "learning_rate": 9.739258537542835e-07, + "loss": 0.0, + "reward": -0.15700703021138906, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.15700703021138906, + "reward_after_std": 0.5350757166743279, + "reward_before_mean": 0.19414901733398438, + "reward_before_std": 0.488203302025795, + "reward_change_max": 0.0009380653500556946, + "reward_change_mean": -0.3511560335755348, + "reward_change_min": -0.6369931064546108, + "reward_change_std": 0.2508567860350013, + "reward_std": 0.535075731575489, + "rewards/cosine_scaled_reward": -0.048758842051029205, + "rewards/format_reward": 0.2916666679084301, + "step": 99 + }, + { + "advantage_max": 1.725820317864418, + "advantage_mean": 3.104408063947517e-09, + "advantage_min": -1.057399995625019, + "advantage_std": 0.9998147934675217, + "completion_length": 2560.8958740234375, + "epoch": 0.11428571428571428, + "grad_norm": 0.18937556445598602, + "kl": 0.003207683563232422, + "lambda_div_used": 0.6, + "learning_rate": 9.728616793536587e-07, + "loss": 0.0001, + "reward": 0.13002754002809525, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.13002754002809525, + "reward_after_std": 0.7728552669286728, + "reward_before_mean": 0.6003823187202215, + "reward_before_std": 0.7967247776687145, + "reward_change_max": 0.0010514110326766968, + "reward_change_mean": -0.47035478707402945, + "reward_change_min": -0.9081873521208763, + "reward_change_std": 0.3621816970407963, + "reward_std": 0.7728552781045437, + "rewards/cosine_scaled_reward": 0.0501911542378366, + "rewards/format_reward": 0.5000000037252903, + "step": 100 + }, + { + "advantage_max": 1.7652879804372787, + "advantage_mean": 4.346172111091562e-08, + "advantage_min": -0.9487873390316963, + "advantage_std": 0.9997652173042297, + "completion_length": 3085.9375610351562, + "epoch": 0.11542857142857142, + "grad_norm": 0.179105743765831, + "kl": 0.0010350942611694336, + "lambda_div_used": 0.6, + "learning_rate": 9.717768952713511e-07, + "loss": 0.0, + "reward": -0.2374136783182621, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.2374136783182621, + "reward_after_std": 0.5234212130308151, + "reward_before_mean": 0.0739324577152729, + "reward_before_std": 0.5113976262509823, + "reward_change_max": 0.0017821341753005981, + "reward_change_mean": -0.31134614115580916, + "reward_change_min": -0.5999995283782482, + "reward_change_std": 0.23880406375974417, + "reward_std": 0.5234212279319763, + "rewards/cosine_scaled_reward": -0.1088671050965786, + "rewards/format_reward": 0.29166667349636555, + "step": 101 + }, + { + "advantage_max": 1.736981987953186, + "advantage_mean": -6.208817349140361e-09, + "advantage_min": -1.1442973241209984, + "advantage_std": 0.9998510330915451, + "completion_length": 2414.479263305664, + "epoch": 0.11657142857142858, + "grad_norm": 0.27132582664489746, + "kl": 0.003232717514038086, + "lambda_div_used": 0.6, + "learning_rate": 9.706715543782064e-07, + "loss": 0.0001, + "reward": 0.18853361695073545, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.18853361695073545, + "reward_after_std": 0.879979819059372, + "reward_before_mean": 0.6658473797142506, + "reward_before_std": 0.907788585871458, + "reward_change_max": 0.00014173239469528198, + "reward_change_mean": -0.47731374204158783, + "reward_change_min": -0.8779811263084412, + "reward_change_std": 0.3685195595026016, + "reward_std": 0.8799798376858234, + "rewards/cosine_scaled_reward": -0.00040966711821965873, + "rewards/format_reward": 0.6666666828095913, + "step": 102 + }, + { + "advantage_max": 1.8154405057430267, + "advantage_mean": 2.669791498988161e-08, + "advantage_min": -0.9068902283906937, + "advantage_std": 0.9998216927051544, + "completion_length": 2932.8333740234375, + "epoch": 0.11771428571428572, + "grad_norm": 0.25712406635284424, + "kl": 0.0015873908996582031, + "lambda_div_used": 0.6, + "learning_rate": 9.695457105469804e-07, + "loss": 0.0001, + "reward": -0.15092550683766603, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.15092550683766603, + "reward_after_std": 0.7477401718497276, + "reward_before_mean": 0.1566409319639206, + "reward_before_std": 0.7541992217302322, + "reward_change_max": 0.0016285479068756104, + "reward_change_mean": -0.3075664332136512, + "reward_change_min": -0.6556785479187965, + "reward_change_std": 0.2573690786957741, + "reward_std": 0.7477401979267597, + "rewards/cosine_scaled_reward": -0.09876288007944822, + "rewards/format_reward": 0.3541666753590107, + "step": 103 + }, + { + "advantage_max": 1.8285162150859833, + "advantage_mean": 3.321717334525687e-08, + "advantage_min": -0.8637420684099197, + "advantage_std": 0.9997657388448715, + "completion_length": 2773.8541717529297, + "epoch": 0.11885714285714286, + "grad_norm": 0.658698320388794, + "kl": 0.025522232055664062, + "lambda_div_used": 0.6, + "learning_rate": 9.683994186497132e-07, + "loss": 0.001, + "reward": -0.2826954470947385, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.2826954470947385, + "reward_after_std": 0.6313493195921183, + "reward_before_mean": -0.023373490199446678, + "reward_before_std": 0.6353482659906149, + "reward_change_max": 0.0010068267583847046, + "reward_change_mean": -0.2593219568952918, + "reward_change_min": -0.5421112589538097, + "reward_change_std": 0.21279537491500378, + "reward_std": 0.6313493568450212, + "rewards/cosine_scaled_reward": -0.1991867497563362, + "rewards/format_reward": 0.37500000186264515, + "step": 104 + }, + { + "advantage_max": 1.783625602722168, + "advantage_mean": 3.3527614018424856e-08, + "advantage_min": -0.8666596114635468, + "advantage_std": 0.9997932240366936, + "completion_length": 2725.9375381469727, + "epoch": 0.12, + "grad_norm": 0.18521331250667572, + "kl": 0.0012688636779785156, + "lambda_div_used": 0.6, + "learning_rate": 9.672327345550543e-07, + "loss": 0.0001, + "reward": -0.015197938308119774, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.015197938308119774, + "reward_after_std": 0.8849128354340792, + "reward_before_mean": 0.3449369762092829, + "reward_before_std": 0.8991417735815048, + "reward_change_max": 0.0005147978663444519, + "reward_change_mean": -0.3601348986849189, + "reward_change_min": -0.7783688493072987, + "reward_change_std": 0.3188676042482257, + "reward_std": 0.8849128670990467, + "rewards/cosine_scaled_reward": -0.025448182597756386, + "rewards/format_reward": 0.39583333395421505, + "step": 105 + }, + { + "advantage_max": 1.8432470560073853, + "advantage_mean": -2.2972623803241277e-08, + "advantage_min": -0.8303454779088497, + "advantage_std": 0.9998474791646004, + "completion_length": 2239.6459045410156, + "epoch": 0.12114285714285715, + "grad_norm": 0.22106949985027313, + "kl": 0.0016461610794067383, + "lambda_div_used": 0.6, + "learning_rate": 9.66045715125541e-07, + "loss": 0.0001, + "reward": 0.34969139844179153, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.34969139844179153, + "reward_after_std": 0.8614957369863987, + "reward_before_mean": 0.9187089614570141, + "reward_before_std": 0.8012212971225381, + "reward_change_max": 0.001362532377243042, + "reward_change_mean": -0.5690175807103515, + "reward_change_min": -1.0385963395237923, + "reward_change_std": 0.417306674644351, + "reward_std": 0.8614957556128502, + "rewards/cosine_scaled_reward": 0.146854467689991, + "rewards/format_reward": 0.6250000093132257, + "step": 106 + }, + { + "advantage_max": 1.7680507004261017, + "advantage_mean": 2.301142987271021e-08, + "advantage_min": -0.9694222062826157, + "advantage_std": 0.99971604347229, + "completion_length": 2895.312530517578, + "epoch": 0.12228571428571429, + "grad_norm": 0.2039802074432373, + "kl": 0.001500844955444336, + "lambda_div_used": 0.6, + "learning_rate": 9.648384182148252e-07, + "loss": 0.0001, + "reward": -0.15957804769277573, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.15957804769277573, + "reward_after_std": 0.6128820450976491, + "reward_before_mean": 0.18090857192873955, + "reward_before_std": 0.6410299837589264, + "reward_change_max": 0.0003634542226791382, + "reward_change_mean": -0.3404866079799831, + "reward_change_min": -0.6619577445089817, + "reward_change_std": 0.2745391200296581, + "reward_std": 0.6128820804879069, + "rewards/cosine_scaled_reward": -0.14912904612720013, + "rewards/format_reward": 0.479166679084301, + "step": 107 + }, + { + "advantage_max": 1.790600210428238, + "advantage_mean": 5.960464632970286e-08, + "advantage_min": -0.9298330098390579, + "advantage_std": 0.9997816309332848, + "completion_length": 2844.6459045410156, + "epoch": 0.12342857142857143, + "grad_norm": 0.21611450612545013, + "kl": 0.0016851872205734253, + "lambda_div_used": 0.6, + "learning_rate": 9.636109026648554e-07, + "loss": 0.0001, + "reward": -0.1583491563796997, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.1583491563796997, + "reward_after_std": 0.6582544650882483, + "reward_before_mean": 0.16968258982524276, + "reward_before_std": 0.6924717854708433, + "reward_change_max": 0.00043252110481262207, + "reward_change_mean": -0.3280317420139909, + "reward_change_min": -0.6706159263849258, + "reward_change_std": 0.2807430122047663, + "reward_std": 0.6582544837146997, + "rewards/cosine_scaled_reward": -0.09224203368648887, + "rewards/format_reward": 0.35416666977107525, + "step": 108 + }, + { + "advantage_max": 1.8136216551065445, + "advantage_mean": 4.656613178388724e-09, + "advantage_min": -0.9915246963500977, + "advantage_std": 0.9997651129961014, + "completion_length": 3080.9583435058594, + "epoch": 0.12457142857142857, + "grad_norm": 0.17112916707992554, + "kl": 0.0004984140396118164, + "lambda_div_used": 0.6, + "learning_rate": 9.623632283030077e-07, + "loss": 0.0, + "reward": -0.17264318838715553, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.17264318838715553, + "reward_after_std": 0.5608428791165352, + "reward_before_mean": 0.16347171552479267, + "reward_before_std": 0.5202469453215599, + "reward_change_max": 0.0016485974192619324, + "reward_change_mean": -0.33611492812633514, + "reward_change_min": -0.5799317136406898, + "reward_change_std": 0.22663516830652952, + "reward_std": 0.5608428884297609, + "rewards/cosine_scaled_reward": -0.09534747712314129, + "rewards/format_reward": 0.35416667722165585, + "step": 109 + }, + { + "advantage_max": 1.8396689295768738, + "advantage_mean": -5.587935614226325e-09, + "advantage_min": -0.8279101625084877, + "advantage_std": 0.9998233169317245, + "completion_length": 2519.7916717529297, + "epoch": 0.12571428571428572, + "grad_norm": 0.2504572570323944, + "kl": 0.0009617358446121216, + "lambda_div_used": 0.6, + "learning_rate": 9.610954559391704e-07, + "loss": 0.0, + "reward": -0.07773779518902302, + "reward_advantage_correlation": 0.9999999999999997, + "reward_after_mean": -0.07773779518902302, + "reward_after_std": 0.7135216481983662, + "reward_before_mean": 0.27838256349787116, + "reward_before_std": 0.696380527690053, + "reward_change_max": 0.0, + "reward_change_mean": -0.35612036008387804, + "reward_change_min": -0.7108714915812016, + "reward_change_std": 0.27885529957711697, + "reward_std": 0.7135216817259789, + "rewards/cosine_scaled_reward": -0.12122538778930902, + "rewards/format_reward": 0.5208333376795053, + "step": 110 + }, + { + "advantage_max": 1.7565907686948776, + "advantage_mean": -8.692344177774203e-09, + "advantage_min": -0.9142092913389206, + "advantage_std": 0.9998379349708557, + "completion_length": 2963.8750610351562, + "epoch": 0.12685714285714286, + "grad_norm": 0.19190770387649536, + "kl": 0.0017622709274291992, + "lambda_div_used": 0.6, + "learning_rate": 9.598076473627796e-07, + "loss": 0.0001, + "reward": -0.037100352346897125, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.037100352346897125, + "reward_after_std": 0.8680760376155376, + "reward_before_mean": 0.32125273160636425, + "reward_before_std": 0.9434991367161274, + "reward_change_max": 0.0022219568490982056, + "reward_change_mean": -0.3583531128242612, + "reward_change_min": -0.8165842406451702, + "reward_change_std": 0.3486758843064308, + "reward_std": 0.8680760562419891, + "rewards/cosine_scaled_reward": -0.02687364211305976, + "rewards/format_reward": 0.3750000074505806, + "step": 111 + }, + { + "advantage_max": 1.7827106267213821, + "advantage_mean": 2.7318795670083773e-08, + "advantage_min": -1.0682056844234467, + "advantage_std": 0.9998344331979752, + "completion_length": 3133.1459045410156, + "epoch": 0.128, + "grad_norm": 0.15433692932128906, + "kl": 0.0007636398077011108, + "lambda_div_used": 0.6, + "learning_rate": 9.58499865339809e-07, + "loss": 0.0, + "reward": -0.010375543497502804, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.010375543497502804, + "reward_after_std": 0.8647609390318394, + "reward_before_mean": 0.35546717746183276, + "reward_before_std": 0.8730155974626541, + "reward_change_max": 0.0010883510112762451, + "reward_change_mean": -0.3658427279442549, + "reward_change_min": -0.6690595299005508, + "reward_change_std": 0.28574431873857975, + "reward_std": 0.8647609725594521, + "rewards/cosine_scaled_reward": -0.009766413364559412, + "rewards/format_reward": 0.37500000931322575, + "step": 112 + }, + { + "advantage_max": 1.755420058965683, + "advantage_mean": 1.0710210274211818e-08, + "advantage_min": -0.9944314882159233, + "advantage_std": 0.9998151361942291, + "completion_length": 2773.0833740234375, + "epoch": 0.12914285714285714, + "grad_norm": 0.23051810264587402, + "kl": 0.0017020702362060547, + "lambda_div_used": 0.6, + "learning_rate": 9.571721736097088e-07, + "loss": 0.0001, + "reward": 0.07544285524636507, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.07544285524636507, + "reward_after_std": 0.7961066942662001, + "reward_before_mean": 0.5120955022866838, + "reward_before_std": 0.8482234226539731, + "reward_change_max": 0.0005150362849235535, + "reward_change_mean": -0.4366526734083891, + "reward_change_min": -0.8510825112462044, + "reward_change_std": 0.35694314260035753, + "reward_std": 0.796106742694974, + "rewards/cosine_scaled_reward": 0.02688109502196312, + "rewards/format_reward": 0.4583333469927311, + "step": 113 + }, + { + "advantage_max": 1.8586469739675522, + "advantage_mean": 2.421438782818086e-08, + "advantage_min": -0.8431475088000298, + "advantage_std": 0.9998027980327606, + "completion_length": 2531.2083587646484, + "epoch": 0.13028571428571428, + "grad_norm": 0.22355306148529053, + "kl": 0.0019383430480957031, + "lambda_div_used": 0.6, + "learning_rate": 9.55824636882301e-07, + "loss": 0.0001, + "reward": -0.20113424118608236, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.20113424118608236, + "reward_after_std": 0.6546772718429565, + "reward_before_mean": 0.09727955237030983, + "reward_before_std": 0.6197393368929625, + "reward_change_max": 0.0007202774286270142, + "reward_change_mean": -0.29841379821300507, + "reward_change_min": -0.5926534906029701, + "reward_change_std": 0.22649162262678146, + "reward_std": 0.6546772830188274, + "rewards/cosine_scaled_reward": -0.24302690103650093, + "rewards/format_reward": 0.5833333414047956, + "step": 114 + }, + { + "advantage_max": 1.7997616976499557, + "advantage_mean": 4.159907551759545e-08, + "advantage_min": -1.0052898675203323, + "advantage_std": 0.999777115881443, + "completion_length": 2741.7500381469727, + "epoch": 0.13142857142857142, + "grad_norm": 0.22047275304794312, + "kl": 0.0026127099990844727, + "lambda_div_used": 0.6, + "learning_rate": 9.54457320834625e-07, + "loss": 0.0001, + "reward": -0.11932978220283985, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.11932978220283985, + "reward_after_std": 0.6451930701732635, + "reward_before_mean": 0.22971507906913757, + "reward_before_std": 0.6189654469490051, + "reward_change_max": 0.0001480579376220703, + "reward_change_mean": -0.3490448575466871, + "reward_change_min": -0.656039223074913, + "reward_change_std": 0.2516454681754112, + "reward_std": 0.6451930776238441, + "rewards/cosine_scaled_reward": -0.07264245301485062, + "rewards/format_reward": 0.375, + "step": 115 + }, + { + "advantage_max": 1.828330248594284, + "advantage_mean": 1.179675357398935e-08, + "advantage_min": -0.9229530841112137, + "advantage_std": 0.999760314822197, + "completion_length": 3289.166717529297, + "epoch": 0.13257142857142856, + "grad_norm": 0.16027842462062836, + "kl": 0.0012557506561279297, + "lambda_div_used": 0.6, + "learning_rate": 9.530702921077358e-07, + "loss": 0.0001, + "reward": -0.35777913220226765, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.35777913220226765, + "reward_after_std": 0.5930831618607044, + "reward_before_mean": -0.1336807757616043, + "reward_before_std": 0.5846659392118454, + "reward_change_max": 0.0006017833948135376, + "reward_change_mean": -0.22409836295992136, + "reward_change_min": -0.5018119886517525, + "reward_change_std": 0.18498014891520143, + "reward_std": 0.5930831879377365, + "rewards/cosine_scaled_reward": -0.1501737218350172, + "rewards/format_reward": 0.16666667349636555, + "step": 116 + }, + { + "advantage_max": 1.7930647432804108, + "advantage_mean": -2.4835270062695258e-08, + "advantage_min": -0.9088431373238564, + "advantage_std": 0.999772883951664, + "completion_length": 3074.291702270508, + "epoch": 0.1337142857142857, + "grad_norm": 0.19902725517749786, + "kl": 0.001920938491821289, + "lambda_div_used": 0.6, + "learning_rate": 9.516636183034564e-07, + "loss": 0.0001, + "reward": -0.41275100101483986, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.41275100101483986, + "reward_after_std": 0.6596652008593082, + "reward_before_mean": -0.2349538952112198, + "reward_before_std": 0.6833183541893959, + "reward_change_max": 0.004510059952735901, + "reward_change_mean": -0.177797120064497, + "reward_change_min": -0.4685539975762367, + "reward_change_std": 0.19438769109547138, + "reward_std": 0.6596652120351791, + "rewards/cosine_scaled_reward": -0.25289361737668514, + "rewards/format_reward": 0.27083334513008595, + "step": 117 + }, + { + "advantage_max": 1.7604702562093735, + "advantage_mean": -1.3038516932795119e-08, + "advantage_min": -0.8611695393919945, + "advantage_std": 0.9998533576726913, + "completion_length": 3151.0833740234375, + "epoch": 0.13485714285714287, + "grad_norm": 0.15056875348091125, + "kl": 0.0013856887817382812, + "lambda_div_used": 0.6, + "learning_rate": 9.502373679810839e-07, + "loss": 0.0001, + "reward": 0.09464032435789704, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.09464032435789704, + "reward_after_std": 0.9480334632098675, + "reward_before_mean": 0.5092401094734669, + "reward_before_std": 1.007173541933298, + "reward_change_max": 0.0013965964317321777, + "reward_change_mean": -0.4145997939631343, + "reward_change_min": -0.9419260174036026, + "reward_change_std": 0.389494770206511, + "reward_std": 0.9480334855616093, + "rewards/cosine_scaled_reward": 0.05670338403433561, + "rewards/format_reward": 0.39583333767950535, + "step": 118 + }, + { + "advantage_max": 1.849794253706932, + "advantage_mean": 4.8428775767384025e-08, + "advantage_min": -0.9064371511340141, + "advantage_std": 0.9998284727334976, + "completion_length": 2609.3333587646484, + "epoch": 0.136, + "grad_norm": 0.22057241201400757, + "kl": 0.00370180606842041, + "lambda_div_used": 0.6, + "learning_rate": 9.487916106540465e-07, + "loss": 0.0001, + "reward": 0.03942138887941837, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.03942138887941837, + "reward_after_std": 0.7630004845559597, + "reward_before_mean": 0.4473726401338354, + "reward_before_std": 0.7058235108852386, + "reward_change_max": 0.0013832449913024902, + "reward_change_mean": -0.40795122273266315, + "reward_change_min": -0.7008799575269222, + "reward_change_std": 0.2818867303431034, + "reward_std": 0.7630005031824112, + "rewards/cosine_scaled_reward": -0.02631368301808834, + "rewards/format_reward": 0.5000000093132257, + "step": 119 + }, + { + "advantage_max": 1.8697483986616135, + "advantage_mean": 1.2107193081423162e-08, + "advantage_min": -0.9596693366765976, + "advantage_std": 0.999811477959156, + "completion_length": 2261.687515258789, + "epoch": 0.13714285714285715, + "grad_norm": 0.2415115386247635, + "kl": 0.0018281936645507812, + "lambda_div_used": 0.6, + "learning_rate": 9.473264167865171e-07, + "loss": 0.0001, + "reward": -0.050280665047466755, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.050280665047466755, + "reward_after_std": 0.7319730669260025, + "reward_before_mean": 0.31586154247634113, + "reward_before_std": 0.709919760003686, + "reward_change_max": 0.0, + "reward_change_mean": -0.3661421984434128, + "reward_change_min": -0.6760508343577385, + "reward_change_std": 0.2769299987703562, + "reward_std": 0.7319730892777443, + "rewards/cosine_scaled_reward": -0.12331923271995038, + "rewards/format_reward": 0.5625000111758709, + "step": 120 + }, + { + "advantage_max": 1.8391786068677902, + "advantage_mean": -2.7939677238464355e-09, + "advantage_min": -0.8033071458339691, + "advantage_std": 0.9997827708721161, + "completion_length": 1701.333366394043, + "epoch": 0.1382857142857143, + "grad_norm": 0.22365814447402954, + "kl": 0.0038230419158935547, + "lambda_div_used": 0.6, + "learning_rate": 9.458418577899774e-07, + "loss": 0.0002, + "reward": 0.16879624186549336, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.16879624186549336, + "reward_after_std": 0.64923981949687, + "reward_before_mean": 0.6796352444216609, + "reward_before_std": 0.5928017403930426, + "reward_change_max": 0.0005354881286621094, + "reward_change_mean": -0.5108390115201473, + "reward_change_min": -0.8865525871515274, + "reward_change_std": 0.3496230151504278, + "reward_std": 0.6492398418486118, + "rewards/cosine_scaled_reward": -0.03518239036202431, + "rewards/format_reward": 0.7500000074505806, + "step": 121 + }, + { + "advantage_max": 1.7351529896259308, + "advantage_mean": 3.818422625312401e-08, + "advantage_min": -0.9226409643888474, + "advantage_std": 0.9997973814606667, + "completion_length": 3060.3333740234375, + "epoch": 0.13942857142857143, + "grad_norm": 0.18619462847709656, + "kl": 0.0013076066970825195, + "lambda_div_used": 0.6, + "learning_rate": 9.443380060197385e-07, + "loss": 0.0001, + "reward": -0.013334386050701141, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.013334386050701141, + "reward_after_std": 0.8127669915556908, + "reward_before_mean": 0.3716919384896755, + "reward_before_std": 0.8905900437384844, + "reward_change_max": 0.0006670430302619934, + "reward_change_mean": -0.3850262966006994, + "reward_change_min": -0.8647193722426891, + "reward_change_std": 0.3571057040244341, + "reward_std": 0.8127670250833035, + "rewards/cosine_scaled_reward": -0.0016540400683879852, + "rewards/format_reward": 0.3750000037252903, + "step": 122 + }, + { + "advantage_max": 1.8506823033094406, + "advantage_mean": -7.450579819767711e-09, + "advantage_min": -0.8011151403188705, + "advantage_std": 0.9998099207878113, + "completion_length": 2766.8334045410156, + "epoch": 0.14057142857142857, + "grad_norm": 0.17895764112472534, + "kl": 0.0018435120582580566, + "lambda_div_used": 0.6, + "learning_rate": 9.428149347714143e-07, + "loss": 0.0001, + "reward": -0.023045064299367368, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.023045064299367368, + "reward_after_std": 0.6246655620634556, + "reward_before_mean": 0.3796454730909318, + "reward_before_std": 0.5567172318696976, + "reward_change_max": 0.0005916282534599304, + "reward_change_mean": -0.40269055776298046, + "reward_change_min": -0.757130891084671, + "reward_change_std": 0.27373781334608793, + "reward_std": 0.6246655881404877, + "rewards/cosine_scaled_reward": -0.06017725728452206, + "rewards/format_reward": 0.5000000037252903, + "step": 123 + }, + { + "advantage_max": 1.891770288348198, + "advantage_mean": -2.0489097307674342e-08, + "advantage_min": -0.8343314006924629, + "advantage_std": 0.9998250231146812, + "completion_length": 2253.8125534057617, + "epoch": 0.1417142857142857, + "grad_norm": 0.21318885684013367, + "kl": 0.006275177001953125, + "lambda_div_used": 0.6, + "learning_rate": 9.412727182773486e-07, + "loss": 0.0003, + "reward": 0.07496419548988342, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.07496419548988342, + "reward_after_std": 0.7322327829897404, + "reward_before_mean": 0.5071105966344476, + "reward_before_std": 0.6644273046404123, + "reward_change_max": 0.0, + "reward_change_mean": -0.43214639462530613, + "reward_change_min": -0.7557884342968464, + "reward_change_std": 0.290377726778388, + "reward_std": 0.7322328351438046, + "rewards/cosine_scaled_reward": -0.017278052866458893, + "rewards/format_reward": 0.5416666716337204, + "step": 124 + }, + { + "advantage_max": 1.7362398356199265, + "advantage_mean": 3.3217173234234565e-08, + "advantage_min": -1.0559132620692253, + "advantage_std": 0.9997788593173027, + "completion_length": 2715.3125076293945, + "epoch": 0.14285714285714285, + "grad_norm": 0.1941165179014206, + "kl": 0.0016214847564697266, + "lambda_div_used": 0.6, + "learning_rate": 9.397114317029974e-07, + "loss": 0.0001, + "reward": 0.020387548953294754, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.020387548953294754, + "reward_after_std": 0.6161116883158684, + "reward_before_mean": 0.4553663581609726, + "reward_before_std": 0.5809119660407305, + "reward_change_max": 0.0006070658564567566, + "reward_change_mean": -0.4349788036197424, + "reward_change_min": -0.7705775275826454, + "reward_change_std": 0.31106082256883383, + "reward_std": 0.6161117069423199, + "rewards/cosine_scaled_reward": 0.0401831679046154, + "rewards/format_reward": 0.3750000037252903, + "step": 125 + }, + { + "advantage_max": 1.8594186007976532, + "advantage_mean": 1.2417635364414537e-08, + "advantage_min": -0.8952602446079254, + "advantage_std": 0.9997757375240326, + "completion_length": 2842.9583587646484, + "epoch": 0.144, + "grad_norm": 0.17380203306674957, + "kl": 0.0012089014053344727, + "lambda_div_used": 0.6, + "learning_rate": 9.381311511432658e-07, + "loss": 0.0, + "reward": -0.0640541547909379, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.0640541547909379, + "reward_after_std": 0.6584181226789951, + "reward_before_mean": 0.310584613122046, + "reward_before_std": 0.6167250759899616, + "reward_change_max": 4.930049180984497e-05, + "reward_change_mean": -0.37463877256959677, + "reward_change_min": -0.6681321784853935, + "reward_change_std": 0.258643782697618, + "reward_std": 0.6584181413054466, + "rewards/cosine_scaled_reward": -0.06345769576728344, + "rewards/format_reward": 0.4375000037252903, + "step": 126 + }, + { + "advantage_max": 1.7131058424711227, + "advantage_mean": -1.8626452602532595e-09, + "advantage_min": -1.0991980135440826, + "advantage_std": 0.9997560158371925, + "completion_length": 2911.8958892822266, + "epoch": 0.14514285714285713, + "grad_norm": 0.18738742172718048, + "kl": 0.0014913082122802734, + "lambda_div_used": 0.6, + "learning_rate": 9.36531953618799e-07, + "loss": 0.0001, + "reward": -0.37144492409424856, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.37144492409424856, + "reward_after_std": 0.4604080505669117, + "reward_before_mean": -0.11855738663871307, + "reward_before_std": 0.4947989024221897, + "reward_change_max": 0.001102253794670105, + "reward_change_mean": -0.2528875581920147, + "reward_change_min": -0.5120432004332542, + "reward_change_std": 0.21158719062805176, + "reward_std": 0.4604080617427826, + "rewards/cosine_scaled_reward": -0.25719536282122135, + "rewards/format_reward": 0.39583334885537624, + "step": 127 + }, + { + "advantage_max": 1.7644283175468445, + "advantage_mean": 3.725290298461914e-09, + "advantage_min": -0.978776179254055, + "advantage_std": 0.9997963681817055, + "completion_length": 2819.916702270508, + "epoch": 0.1462857142857143, + "grad_norm": 0.17408989369869232, + "kl": 0.0031111836433410645, + "lambda_div_used": 0.6, + "learning_rate": 9.34913917072228e-07, + "loss": 0.0001, + "reward": 0.17869412526488304, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.17869412526488304, + "reward_after_std": 0.796859186142683, + "reward_before_mean": 0.6695016473531723, + "reward_before_std": 0.8020838983356953, + "reward_change_max": 0.0008061006665229797, + "reward_change_mean": -0.49080754444003105, + "reward_change_min": -0.8655724823474884, + "reward_change_std": 0.3641576422378421, + "reward_std": 0.7968592196702957, + "rewards/cosine_scaled_reward": 0.11600081622600555, + "rewards/format_reward": 0.43750000558793545, + "step": 128 + }, + { + "advantage_max": 1.8531613051891327, + "advantage_mean": 6.146729142342267e-08, + "advantage_min": -0.9235868975520134, + "advantage_std": 0.9997715502977371, + "completion_length": 3387.125030517578, + "epoch": 0.14742857142857144, + "grad_norm": 0.14450214803218842, + "kl": 0.0020759105682373047, + "lambda_div_used": 0.6, + "learning_rate": 9.332771203643714e-07, + "loss": 0.0001, + "reward": -0.349605955183506, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.349605955183506, + "reward_after_std": 0.7235031314194202, + "reward_before_mean": -0.1519492152146995, + "reward_before_std": 0.7243870310485363, + "reward_change_max": 0.00021888315677642822, + "reward_change_mean": -0.1976567441597581, + "reward_change_min": -0.4697972945868969, + "reward_change_std": 0.1836132798343897, + "reward_std": 0.7235031351447105, + "rewards/cosine_scaled_reward": -0.159307939640712, + "rewards/format_reward": 0.1666666679084301, + "step": 129 + }, + { + "advantage_max": 1.775284931063652, + "advantage_mean": 5.3395829702207465e-08, + "advantage_min": -0.9660038575530052, + "advantage_std": 0.9997857511043549, + "completion_length": 2631.7500076293945, + "epoch": 0.14857142857142858, + "grad_norm": 0.19728270173072815, + "kl": 0.001338362693786621, + "lambda_div_used": 0.6, + "learning_rate": 9.316216432703916e-07, + "loss": 0.0001, + "reward": -0.13717409409582615, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.13717409409582615, + "reward_after_std": 0.6975272782146931, + "reward_before_mean": 0.1896309917792678, + "reward_before_std": 0.6852379832416773, + "reward_change_max": 0.000896163284778595, + "reward_change_mean": -0.326805068179965, + "reward_change_min": -0.6144087947905064, + "reward_change_std": 0.2421941850334406, + "reward_std": 0.69752730242908, + "rewards/cosine_scaled_reward": -0.11351784318685532, + "rewards/format_reward": 0.41666667349636555, + "step": 130 + }, + { + "advantage_max": 1.7923941612243652, + "advantage_mean": -2.2972623803241277e-08, + "advantage_min": -0.9002665691077709, + "advantage_std": 0.9998530149459839, + "completion_length": 2729.0417404174805, + "epoch": 0.14971428571428572, + "grad_norm": 0.21225287020206451, + "kl": 0.0025815963745117188, + "lambda_div_used": 0.6, + "learning_rate": 9.299475664759068e-07, + "loss": 0.0001, + "reward": 0.23085041716694832, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.23085041716694832, + "reward_after_std": 0.8556590303778648, + "reward_before_mean": 0.7364845387637615, + "reward_before_std": 0.8376536886207759, + "reward_change_max": 0.0, + "reward_change_mean": -0.5056341355666518, + "reward_change_min": -0.8724869564175606, + "reward_change_std": 0.37287682946771383, + "reward_std": 0.8556590564548969, + "rewards/cosine_scaled_reward": 0.1286589317023754, + "rewards/format_reward": 0.47916667349636555, + "step": 131 + }, + { + "advantage_max": 1.8120517283678055, + "advantage_mean": 1.3038516932795119e-08, + "advantage_min": -0.886481486260891, + "advantage_std": 0.9997847452759743, + "completion_length": 2587.0833587646484, + "epoch": 0.15085714285714286, + "grad_norm": 0.18867288529872894, + "kl": 0.001291036605834961, + "lambda_div_used": 0.6, + "learning_rate": 9.282549715730579e-07, + "loss": 0.0001, + "reward": -0.0929887518286705, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.0929887518286705, + "reward_after_std": 0.6387858018279076, + "reward_before_mean": 0.2717039901763201, + "reward_before_std": 0.6333675868809223, + "reward_change_max": 0.0013419762253761292, + "reward_change_mean": -0.3646927550435066, + "reward_change_min": -0.7294860184192657, + "reward_change_std": 0.2695095627568662, + "reward_std": 0.6387858055531979, + "rewards/cosine_scaled_reward": -0.07248133979737759, + "rewards/format_reward": 0.4166666679084301, + "step": 132 + }, + { + "advantage_max": 1.8003092557191849, + "advantage_mean": 3.476937815438674e-08, + "advantage_min": -0.9143991321325302, + "advantage_std": 0.9997724890708923, + "completion_length": 3191.979217529297, + "epoch": 0.152, + "grad_norm": 0.19934667646884918, + "kl": 0.002385854721069336, + "lambda_div_used": 0.6, + "learning_rate": 9.265439410565328e-07, + "loss": 0.0001, + "reward": -0.41168433986604214, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.41168433986604214, + "reward_after_std": 0.5410910733044147, + "reward_before_mean": -0.2069263681769371, + "reward_before_std": 0.535653218626976, + "reward_change_max": 0.00023845583200454712, + "reward_change_mean": -0.2047579661011696, + "reward_change_min": -0.42901289463043213, + "reward_change_std": 0.17053746804594994, + "reward_std": 0.541091077029705, + "rewards/cosine_scaled_reward": -0.2388798501342535, + "rewards/format_reward": 0.27083333767950535, + "step": 133 + }, + { + "advantage_max": 1.8175580650568008, + "advantage_mean": 5.4016709771786964e-08, + "advantage_min": -0.931298740208149, + "advantage_std": 0.9997761398553848, + "completion_length": 2434.8958892822266, + "epoch": 0.15314285714285714, + "grad_norm": 0.23071566224098206, + "kl": 0.0030927658081054688, + "lambda_div_used": 0.6, + "learning_rate": 9.248145583195447e-07, + "loss": 0.0001, + "reward": 0.06319417338818312, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.06319417338818312, + "reward_after_std": 0.620172493159771, + "reward_before_mean": 0.5191458743065596, + "reward_before_std": 0.5582793261855841, + "reward_change_max": 0.0, + "reward_change_mean": -0.4559516739100218, + "reward_change_min": -0.7631092332303524, + "reward_change_std": 0.2891054190695286, + "reward_std": 0.6201725006103516, + "rewards/cosine_scaled_reward": -0.021677076816558838, + "rewards/format_reward": 0.5625000074505806, + "step": 134 + }, + { + "advantage_max": 1.8472566604614258, + "advantage_mean": 3.10440866346795e-08, + "advantage_min": -0.7691843509674072, + "advantage_std": 0.9998567774891853, + "completion_length": 1922.6458549499512, + "epoch": 0.15428571428571428, + "grad_norm": 0.2417742908000946, + "kl": 0.0037393569946289062, + "lambda_div_used": 0.6, + "learning_rate": 9.230669076497687e-07, + "loss": 0.0001, + "reward": 0.40880877897143364, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.40880877897143364, + "reward_after_std": 0.8995259329676628, + "reward_before_mean": 0.997649796307087, + "reward_before_std": 0.7795259989798069, + "reward_change_max": 0.0006740763783454895, + "reward_change_mean": -0.5888410690240562, + "reward_change_min": -1.0147056505084038, + "reward_change_std": 0.39061957970261574, + "reward_std": 0.8995259515941143, + "rewards/cosine_scaled_reward": 0.1654915688559413, + "rewards/format_reward": 0.6666666716337204, + "step": 135 + }, + { + "advantage_max": 1.7726252377033234, + "advantage_mean": 1.2728075149404106e-08, + "advantage_min": -0.8942654505372047, + "advantage_std": 0.999861553311348, + "completion_length": 2828.6875610351562, + "epoch": 0.15542857142857142, + "grad_norm": 0.18230877816677094, + "kl": 0.0026178359985351562, + "lambda_div_used": 0.6, + "learning_rate": 9.213010742252327e-07, + "loss": 0.0001, + "reward": 0.11790861561894417, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.11790861561894417, + "reward_after_std": 0.9995119795203209, + "reward_before_mean": 0.5282619073987007, + "reward_before_std": 1.0388812385499477, + "reward_change_max": 0.00042983144521713257, + "reward_change_mean": -0.4103532899171114, + "reward_change_min": -0.8829576633870602, + "reward_change_std": 0.35299498960375786, + "reward_std": 0.9995119869709015, + "rewards/cosine_scaled_reward": 0.04538094159215689, + "rewards/format_reward": 0.4375000111758709, + "step": 136 + }, + { + "advantage_max": 1.7562323808670044, + "advantage_mean": 1.1486312123665243e-08, + "advantage_min": -0.9233672171831131, + "advantage_std": 0.9998031109571457, + "completion_length": 2931.354217529297, + "epoch": 0.15657142857142858, + "grad_norm": 0.20975585281848907, + "kl": 0.002144336700439453, + "lambda_div_used": 0.6, + "learning_rate": 9.195171441101668e-07, + "loss": 0.0001, + "reward": -0.2550590895116329, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.2550590895116329, + "reward_after_std": 0.7029409743845463, + "reward_before_mean": 0.0069596245884895325, + "reward_before_std": 0.7332433424890041, + "reward_change_max": 0.0006171539425849915, + "reward_change_mean": -0.2620187192223966, + "reward_change_min": -0.6090663559734821, + "reward_change_std": 0.23964617308229208, + "reward_std": 0.7029409967362881, + "rewards/cosine_scaled_reward": -0.15277018956840038, + "rewards/format_reward": 0.31250001303851604, + "step": 137 + }, + { + "advantage_max": 1.860204964876175, + "advantage_mean": -1.552203920951456e-09, + "advantage_min": -0.8499182164669037, + "advantage_std": 0.9998690113425255, + "completion_length": 2530.3542098999023, + "epoch": 0.15771428571428572, + "grad_norm": 0.25806108117103577, + "kl": 0.0020530223846435547, + "lambda_div_used": 0.6, + "learning_rate": 9.177152042508077e-07, + "loss": 0.0001, + "reward": 0.12369046686217189, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.12369046686217189, + "reward_after_std": 0.9462379738688469, + "reward_before_mean": 0.5434232788684312, + "reward_before_std": 0.9309011660516262, + "reward_change_max": 0.0009085908532142639, + "reward_change_mean": -0.41973283141851425, + "reward_change_min": -0.8432405553758144, + "reward_change_std": 0.32539301738142967, + "reward_std": 0.9462379962205887, + "rewards/cosine_scaled_reward": -0.040788375306874514, + "rewards/format_reward": 0.6250000111758709, + "step": 138 + }, + { + "advantage_max": 1.8141157180070877, + "advantage_mean": 5.556891616298465e-08, + "advantage_min": -0.9482329413294792, + "advantage_std": 0.9998308569192886, + "completion_length": 3186.166717529297, + "epoch": 0.15885714285714286, + "grad_norm": 0.1826111525297165, + "kl": 0.00322723388671875, + "lambda_div_used": 0.6, + "learning_rate": 9.158953424711624e-07, + "loss": 0.0001, + "reward": -0.2945699542760849, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.2945699542760849, + "reward_after_std": 0.7745798341929913, + "reward_before_mean": -0.07500334223732352, + "reward_before_std": 0.7766986936330795, + "reward_change_max": 0.0012882798910140991, + "reward_change_mean": -0.21956659480929375, + "reward_change_min": -0.441211748868227, + "reward_change_std": 0.19897049106657505, + "reward_std": 0.7745798826217651, + "rewards/cosine_scaled_reward": -0.193751678802073, + "rewards/format_reward": 0.3125000074505806, + "step": 139 + }, + { + "advantage_max": 1.8250874429941177, + "advantage_mean": -2.6697914323747796e-08, + "advantage_min": -0.9064978882670403, + "advantage_std": 0.9998640343546867, + "completion_length": 2811.4375610351562, + "epoch": 0.16, + "grad_norm": 0.30310213565826416, + "kl": 0.005560874938964844, + "lambda_div_used": 0.6, + "learning_rate": 9.140576474687263e-07, + "loss": 0.0002, + "reward": 0.05557395005598664, + "reward_advantage_correlation": 0.9999999999999997, + "reward_after_mean": 0.05557395005598664, + "reward_after_std": 0.861349169164896, + "reward_before_mean": 0.45549297146499157, + "reward_before_std": 0.827749565243721, + "reward_change_max": 0.0005374252796173096, + "reward_change_mean": -0.3999190628528595, + "reward_change_min": -0.7423599883913994, + "reward_change_std": 0.31112305261194706, + "reward_std": 0.8613492026925087, + "rewards/cosine_scaled_reward": 0.008996479329653084, + "rewards/format_reward": 0.4375000149011612, + "step": 140 + }, + { + "advantage_max": 1.8470164686441422, + "advantage_mean": 1.3659397946064189e-08, + "advantage_min": -0.8646907731890678, + "advantage_std": 0.9998833239078522, + "completion_length": 2427.791717529297, + "epoch": 0.16114285714285714, + "grad_norm": 0.2331998348236084, + "kl": 0.00403594970703125, + "lambda_div_used": 0.6, + "learning_rate": 9.122022088101613e-07, + "loss": 0.0002, + "reward": 0.1419997257180512, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.1419997257180512, + "reward_after_std": 1.0160457417368889, + "reward_before_mean": 0.5559970289468765, + "reward_before_std": 1.0078918114304543, + "reward_change_max": 0.0005534589290618896, + "reward_change_mean": -0.41399733535945415, + "reward_change_min": -0.8622670099139214, + "reward_change_std": 0.3442181684076786, + "reward_std": 1.01604575663805, + "rewards/cosine_scaled_reward": -0.013668144005350769, + "rewards/format_reward": 0.5833333469927311, + "step": 141 + }, + { + "advantage_max": 1.7220842242240906, + "advantage_mean": 2.3593506925934093e-08, + "advantage_min": -1.0945493131875992, + "advantage_std": 0.9998142942786217, + "completion_length": 2786.6250762939453, + "epoch": 0.16228571428571428, + "grad_norm": 0.20720338821411133, + "kl": 0.003205537796020508, + "lambda_div_used": 0.6, + "learning_rate": 9.103291169269299e-07, + "loss": 0.0001, + "reward": 0.00605709757655859, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.00605709757655859, + "reward_after_std": 0.7766943480819464, + "reward_before_mean": 0.41078952327370644, + "reward_before_std": 0.8526837788522243, + "reward_change_max": 5.05298376083374e-05, + "reward_change_mean": -0.40473244059830904, + "reward_change_min": -0.8016922771930695, + "reward_change_std": 0.34947027266025543, + "reward_std": 0.7766943946480751, + "rewards/cosine_scaled_reward": -0.05502191558480263, + "rewards/format_reward": 0.5208333469927311, + "step": 142 + }, + { + "advantage_max": 1.7795784920454025, + "advantage_mean": -7.140139590688932e-09, + "advantage_min": -0.9856926649808884, + "advantage_std": 0.999704547226429, + "completion_length": 2434.5416870117188, + "epoch": 0.16342857142857142, + "grad_norm": 0.33381637930870056, + "kl": 0.004268646240234375, + "lambda_div_used": 0.6, + "learning_rate": 9.084384631108882e-07, + "loss": 0.0002, + "reward": -0.1253709946759045, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.1253709946759045, + "reward_after_std": 0.5376052083447576, + "reward_before_mean": 0.24621488712728024, + "reward_before_std": 0.5251807440072298, + "reward_change_max": 0.0004133358597755432, + "reward_change_mean": -0.3715858841314912, + "reward_change_min": -0.6634500622749329, + "reward_change_std": 0.2629025443457067, + "reward_std": 0.537605220451951, + "rewards/cosine_scaled_reward": -0.13730923272669315, + "rewards/format_reward": 0.5208333432674408, + "step": 143 + }, + { + "advantage_max": 1.8112081289291382, + "advantage_mean": 4.0357312713901194e-08, + "advantage_min": -0.8791662603616714, + "advantage_std": 0.9998252764344215, + "completion_length": 2935.9167098999023, + "epoch": 0.16457142857142856, + "grad_norm": 0.2265225052833557, + "kl": 0.0033435821533203125, + "lambda_div_used": 0.6, + "learning_rate": 9.065303395098358e-07, + "loss": 0.0001, + "reward": -0.005481253378093243, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.005481253378093243, + "reward_after_std": 0.7782573513686657, + "reward_before_mean": 0.3812122130766511, + "reward_before_std": 0.767244616523385, + "reward_change_max": 3.51443886756897e-05, + "reward_change_mean": -0.38669345434755087, + "reward_change_min": -0.7510642148554325, + "reward_change_std": 0.30695721600204706, + "reward_std": 0.7782573662698269, + "rewards/cosine_scaled_reward": 0.003106111893430352, + "rewards/format_reward": 0.3750000074505806, + "step": 144 + }, + { + "advantage_max": 1.8799023926258087, + "advantage_mean": 4.967053790494447e-08, + "advantage_min": -0.7879588454961777, + "advantage_std": 0.9997758939862251, + "completion_length": 2171.104179382324, + "epoch": 0.1657142857142857, + "grad_norm": 0.2688426971435547, + "kl": 0.004181623458862305, + "lambda_div_used": 0.6, + "learning_rate": 9.046048391230247e-07, + "loss": 0.0002, + "reward": 0.1414489287417382, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.1414489287417382, + "reward_after_std": 0.6489234548062086, + "reward_before_mean": 0.6365663278847933, + "reward_before_std": 0.5827234704047441, + "reward_change_max": 0.0009598508477210999, + "reward_change_mean": -0.49511735793203115, + "reward_change_min": -0.8301094174385071, + "reward_change_std": 0.32179426960647106, + "reward_std": 0.6489234566688538, + "rewards/cosine_scaled_reward": 0.016199816949665546, + "rewards/format_reward": 0.6041666716337204, + "step": 145 + }, + { + "advantage_max": 1.8417692929506302, + "advantage_mean": 5.393909985329515e-09, + "advantage_min": -0.9344904869794846, + "advantage_std": 0.9997825846076012, + "completion_length": 2455.4583587646484, + "epoch": 0.16685714285714287, + "grad_norm": 0.2010597586631775, + "kl": 0.00258481502532959, + "lambda_div_used": 0.6, + "learning_rate": 9.026620557966279e-07, + "loss": 0.0001, + "reward": 0.019809929188340902, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.019809929188340902, + "reward_after_std": 0.6886911056935787, + "reward_before_mean": 0.4365716055035591, + "reward_before_std": 0.64860606379807, + "reward_change_max": 0.0, + "reward_change_mean": -0.4167616907507181, + "reward_change_min": -0.7325540669262409, + "reward_change_std": 0.2858335985802114, + "reward_std": 0.6886911205947399, + "rewards/cosine_scaled_reward": -0.08379753306508064, + "rewards/format_reward": 0.6041666828095913, + "step": 146 + }, + { + "advantage_max": 1.7888115048408508, + "advantage_mean": 4.035731260287889e-08, + "advantage_min": -0.9190972521901131, + "advantage_std": 0.9998320192098618, + "completion_length": 2554.45841217041, + "epoch": 0.168, + "grad_norm": 0.38465583324432373, + "kl": 0.004227638244628906, + "lambda_div_used": 0.6, + "learning_rate": 9.007020842191634e-07, + "loss": 0.0002, + "reward": 0.04151174915023148, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.04151174915023148, + "reward_after_std": 0.8868411891162395, + "reward_before_mean": 0.4299083799123764, + "reward_before_std": 0.9076585825532675, + "reward_change_max": 2.2679567337036133e-05, + "reward_change_mean": -0.3883966105058789, + "reward_change_min": -0.809001874178648, + "reward_change_std": 0.32688095327466726, + "reward_std": 0.886841207742691, + "rewards/cosine_scaled_reward": -0.024629172403365374, + "rewards/format_reward": 0.47916667349636555, + "step": 147 + }, + { + "advantage_max": 1.8225539922714233, + "advantage_mean": 1.1175871450497255e-08, + "advantage_min": -0.8709155693650246, + "advantage_std": 0.999858982861042, + "completion_length": 1892.8125762939453, + "epoch": 0.16914285714285715, + "grad_norm": 0.23031729459762573, + "kl": 0.0027790069580078125, + "lambda_div_used": 0.6, + "learning_rate": 8.987250199168808e-07, + "loss": 0.0001, + "reward": 0.13457794743590057, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.13457794743590057, + "reward_after_std": 0.8694209642708302, + "reward_before_mean": 0.5781211638823152, + "reward_before_std": 0.8416643925011158, + "reward_change_max": 0.0011159330606460571, + "reward_change_mean": -0.44354322366416454, + "reward_change_min": -0.8777766264975071, + "reward_change_std": 0.33100760076195, + "reward_std": 0.8694210052490234, + "rewards/cosine_scaled_reward": -0.0755227617919445, + "rewards/format_reward": 0.7291666716337204, + "step": 148 + }, + { + "advantage_max": 1.7685239017009735, + "advantage_mean": -7.450580818968433e-09, + "advantage_min": -0.9461576044559479, + "advantage_std": 0.9998336955904961, + "completion_length": 2667.6250610351562, + "epoch": 0.1702857142857143, + "grad_norm": 0.18191303312778473, + "kl": 0.003093719482421875, + "lambda_div_used": 0.6, + "learning_rate": 8.967309592491052e-07, + "loss": 0.0001, + "reward": -0.005377430468797684, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.005377430468797684, + "reward_after_std": 0.8236372619867325, + "reward_before_mean": 0.37823933735489845, + "reward_before_std": 0.8831204213202, + "reward_change_max": 0.0030940771102905273, + "reward_change_mean": -0.3836167808622122, + "reward_change_min": -0.8212070725858212, + "reward_change_std": 0.35686618462204933, + "reward_std": 0.8236372880637646, + "rewards/cosine_scaled_reward": -0.06088033691048622, + "rewards/format_reward": 0.5000000037252903, + "step": 149 + }, + { + "advantage_max": 1.6964322626590729, + "advantage_mean": 1.614292610696566e-08, + "advantage_min": -1.0401954725384712, + "advantage_std": 0.9998602271080017, + "completion_length": 2585.541732788086, + "epoch": 0.17142857142857143, + "grad_norm": 0.20473583042621613, + "kl": 0.004099845886230469, + "lambda_div_used": 0.6, + "learning_rate": 8.9471999940354e-07, + "loss": 0.0002, + "reward": 0.0429554358124733, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.0429554358124733, + "reward_after_std": 0.9450533464550972, + "reward_before_mean": 0.43351098895072937, + "reward_before_std": 1.051012322306633, + "reward_change_max": 0.0030746981501579285, + "reward_change_mean": -0.3905555624514818, + "reward_change_min": -0.9020018689334393, + "reward_change_std": 0.39117319881916046, + "reward_std": 0.9450533874332905, + "rewards/cosine_scaled_reward": -0.05407784227281809, + "rewards/format_reward": 0.5416666753590107, + "step": 150 + }, + { + "advantage_max": 1.7657774835824966, + "advantage_mean": -1.8626452158443385e-08, + "advantage_min": -1.0298929959535599, + "advantage_std": 0.9998343884944916, + "completion_length": 2374.2083892822266, + "epoch": 0.17257142857142857, + "grad_norm": 0.21564297378063202, + "kl": 0.004437446594238281, + "lambda_div_used": 0.6, + "learning_rate": 8.926922383915315e-07, + "loss": 0.0002, + "reward": 0.1870010308921337, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.1870010308921337, + "reward_after_std": 0.81415681168437, + "reward_before_mean": 0.6816153451800346, + "reward_before_std": 0.8424667343497276, + "reward_change_max": 0.0006392896175384521, + "reward_change_mean": -0.4946143254637718, + "reward_change_min": -0.8876711316406727, + "reward_change_std": 0.3735849279910326, + "reward_std": 0.8141568228602409, + "rewards/cosine_scaled_reward": 0.028307669796049595, + "rewards/format_reward": 0.6250000149011612, + "step": 151 + }, + { + "advantage_max": 1.8536403626203537, + "advantage_mean": 3.4769377377230626e-08, + "advantage_min": -0.8554203286767006, + "advantage_std": 0.9997300058603287, + "completion_length": 2729.583354949951, + "epoch": 0.1737142857142857, + "grad_norm": 0.21470296382904053, + "kl": 0.005055665969848633, + "lambda_div_used": 0.6, + "learning_rate": 8.906477750432903e-07, + "loss": 0.0002, + "reward": -0.314574146643281, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.314574146643281, + "reward_after_std": 0.5909437406808138, + "reward_before_mean": -0.06655025156214833, + "reward_before_std": 0.5612615784630179, + "reward_change_max": 0.0003990978002548218, + "reward_change_mean": -0.24802388530224562, + "reward_change_min": -0.44809194654226303, + "reward_change_std": 0.17653987370431423, + "reward_std": 0.5909437648952007, + "rewards/cosine_scaled_reward": -0.21035847393795848, + "rewards/format_reward": 0.35416667722165585, + "step": 152 + }, + { + "advantage_max": 1.8638170510530472, + "advantage_mean": -1.4745941578908628e-08, + "advantage_min": -0.8921016380190849, + "advantage_std": 0.9998102560639381, + "completion_length": 2721.5000381469727, + "epoch": 0.17485714285714285, + "grad_norm": 0.22840584814548492, + "kl": 0.006916046142578125, + "lambda_div_used": 0.6, + "learning_rate": 8.88586709003076e-07, + "loss": 0.0003, + "reward": -0.3279695939272642, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.3279695939272642, + "reward_after_std": 0.6396501064300537, + "reward_before_mean": -0.09922650083899498, + "reward_before_std": 0.6324530653655529, + "reward_change_max": 0.0033247023820877075, + "reward_change_mean": -0.22874312056228518, + "reward_change_min": -0.46635738760232925, + "reward_change_std": 0.1982931261882186, + "reward_std": 0.6396501064300537, + "rewards/cosine_scaled_reward": -0.23711324855685234, + "rewards/format_reward": 0.3750000074505806, + "step": 153 + }, + { + "advantage_max": 1.7728360295295715, + "advantage_mean": 4.594524849466097e-08, + "advantage_min": -0.9388241320848465, + "advantage_std": 0.9998360797762871, + "completion_length": 3298.4584045410156, + "epoch": 0.176, + "grad_norm": 0.18289536237716675, + "kl": 0.0028214454650878906, + "lambda_div_used": 0.6, + "learning_rate": 8.865091407243394e-07, + "loss": 0.0001, + "reward": -0.10140408016741276, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.10140408016741276, + "reward_after_std": 0.8765006735920906, + "reward_before_mean": 0.21658623684197664, + "reward_before_std": 0.9439742900431156, + "reward_change_max": 0.001234501600265503, + "reward_change_mean": -0.3179903104901314, + "reward_change_min": -0.7761300541460514, + "reward_change_std": 0.3139498056843877, + "reward_std": 0.8765007182955742, + "rewards/cosine_scaled_reward": -0.05837355088442564, + "rewards/format_reward": 0.33333334140479565, + "step": 154 + }, + { + "advantage_max": 1.8248510509729385, + "advantage_mean": 7.76102093702491e-09, + "advantage_min": -0.8832328766584396, + "advantage_std": 0.9998307749629021, + "completion_length": 2646.291717529297, + "epoch": 0.17714285714285713, + "grad_norm": 0.21617059409618378, + "kl": 0.0036039352416992188, + "lambda_div_used": 0.6, + "learning_rate": 8.844151714648274e-07, + "loss": 0.0001, + "reward": 0.1411822196096182, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.1411822196096182, + "reward_after_std": 0.7428156174719334, + "reward_before_mean": 0.6130893367808312, + "reward_before_std": 0.6941012628376484, + "reward_change_max": 0.0009594261646270752, + "reward_change_mean": -0.47190713416785, + "reward_change_min": -0.8309516459703445, + "reward_change_std": 0.322382427752018, + "reward_std": 0.7428156286478043, + "rewards/cosine_scaled_reward": 0.06696132896468043, + "rewards/format_reward": 0.47916666977107525, + "step": 155 + }, + { + "advantage_max": 1.722646802663803, + "advantage_mean": 7.710575078423432e-08, + "advantage_min": -1.010897733271122, + "advantage_std": 0.9998083412647247, + "completion_length": 2569.708366394043, + "epoch": 0.1782857142857143, + "grad_norm": 0.16158613562583923, + "kl": 0.0024747848510742188, + "lambda_div_used": 0.6, + "learning_rate": 8.823049032816478e-07, + "loss": 0.0001, + "reward": 0.026534391567111015, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.026534391567111015, + "reward_after_std": 0.7706789597868919, + "reward_before_mean": 0.44334378745406866, + "reward_before_std": 0.8541289437562227, + "reward_change_max": 0.0033616721630096436, + "reward_change_mean": -0.4168093828484416, + "reward_change_min": -0.8273204155266285, + "reward_change_std": 0.3676574770361185, + "reward_std": 0.7706789746880531, + "rewards/cosine_scaled_reward": -0.017911457223817706, + "rewards/format_reward": 0.4791666716337204, + "step": 156 + }, + { + "advantage_max": 1.8470103442668915, + "advantage_mean": -1.1486312068154092e-08, + "advantage_min": -0.8701950237154961, + "advantage_std": 0.9997879564762115, + "completion_length": 2702.687530517578, + "epoch": 0.17942857142857144, + "grad_norm": 0.2701130211353302, + "kl": 0.004548072814941406, + "lambda_div_used": 0.6, + "learning_rate": 8.801784390262943e-07, + "loss": 0.0002, + "reward": -0.20367415808141232, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.20367415808141232, + "reward_after_std": 0.6155711524188519, + "reward_before_mean": 0.10635744594037533, + "reward_before_std": 0.6107806749641895, + "reward_change_max": 0.0011181086301803589, + "reward_change_mean": -0.31003160774707794, + "reward_change_min": -0.5878064446151257, + "reward_change_std": 0.24444873072206974, + "reward_std": 0.6155711822211742, + "rewards/cosine_scaled_reward": -0.17598796007223427, + "rewards/format_reward": 0.4583333469927311, + "step": 157 + }, + { + "advantage_max": 1.7345101982355118, + "advantage_mean": 9.934107647602275e-09, + "advantage_min": -1.0180958062410355, + "advantage_std": 0.9998627975583076, + "completion_length": 2785.5834350585938, + "epoch": 0.18057142857142858, + "grad_norm": 0.2903349697589874, + "kl": 0.003916740417480469, + "lambda_div_used": 0.6, + "learning_rate": 8.780358823396352e-07, + "loss": 0.0002, + "reward": 0.1706813657656312, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.1706813657656312, + "reward_after_std": 0.8751989975571632, + "reward_before_mean": 0.6363602974452078, + "reward_before_std": 0.9112723432481289, + "reward_change_max": 0.004944115877151489, + "reward_change_mean": -0.4656789507716894, + "reward_change_min": -0.87837153673172, + "reward_change_std": 0.37687744572758675, + "reward_std": 0.8751990273594856, + "rewards/cosine_scaled_reward": 0.09943014103919268, + "rewards/format_reward": 0.43750000558793545, + "step": 158 + }, + { + "advantage_max": 1.8704514354467392, + "advantage_mean": 8.940696871739817e-08, + "advantage_min": -0.7633762657642365, + "advantage_std": 0.9997115284204483, + "completion_length": 2753.2291717529297, + "epoch": 0.18171428571428572, + "grad_norm": 0.14626438915729523, + "kl": 0.004486083984375, + "lambda_div_used": 0.6, + "learning_rate": 8.758773376468604e-07, + "loss": 0.0002, + "reward": -0.38985342904925346, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.38985342904925346, + "reward_after_std": 0.40392925404012203, + "reward_before_mean": -0.14509601891040802, + "reward_before_std": 0.36197624169290066, + "reward_change_max": 0.001153610646724701, + "reward_change_mean": -0.24475740175694227, + "reward_change_min": -0.45411108434200287, + "reward_change_std": 0.16509783919900656, + "reward_std": 0.4039292652159929, + "rewards/cosine_scaled_reward": -0.2600480148103088, + "rewards/format_reward": 0.375, + "step": 159 + }, + { + "advantage_max": 1.8221765458583832, + "advantage_mean": 3.911554902202852e-08, + "advantage_min": -0.9147254899144173, + "advantage_std": 0.999821625649929, + "completion_length": 2687.145896911621, + "epoch": 0.18285714285714286, + "grad_norm": 0.2758440375328064, + "kl": 0.006741523742675781, + "lambda_div_used": 0.6, + "learning_rate": 8.737029101523929e-07, + "loss": 0.0003, + "reward": -0.13885139022022486, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.13885139022022486, + "reward_after_std": 0.7137699276208878, + "reward_before_mean": 0.18189829215407372, + "reward_before_std": 0.6966813541948795, + "reward_change_max": 0.004106998443603516, + "reward_change_mean": -0.3207496590912342, + "reward_change_min": -0.6812379769980907, + "reward_change_std": 0.27391286846250296, + "reward_std": 0.713769931346178, + "rewards/cosine_scaled_reward": -0.1173841985873878, + "rewards/format_reward": 0.41666667349636555, + "step": 160 + }, + { + "advantage_max": 1.7642860263586044, + "advantage_mean": 2.5456150853919723e-08, + "advantage_min": -0.9674390330910683, + "advantage_std": 0.9998326897621155, + "completion_length": 2567.875030517578, + "epoch": 0.184, + "grad_norm": 0.2193051278591156, + "kl": 0.006969451904296875, + "lambda_div_used": 0.6, + "learning_rate": 8.715127058347614e-07, + "loss": 0.0003, + "reward": 0.04702313430607319, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.04702313430607319, + "reward_after_std": 0.7199640087783337, + "reward_before_mean": 0.4766552746295929, + "reward_before_std": 0.709110327064991, + "reward_change_max": 0.0007743611931800842, + "reward_change_mean": -0.42963212728500366, + "reward_change_min": -0.8557399734854698, + "reward_change_std": 0.3297593742609024, + "reward_std": 0.7199640311300755, + "rewards/cosine_scaled_reward": -0.022089052945375443, + "rewards/format_reward": 0.5208333432674408, + "step": 161 + }, + { + "advantage_max": 1.6750542670488358, + "advantage_mean": 3.787378499708893e-08, + "advantage_min": -1.1022668778896332, + "advantage_std": 0.9998056441545486, + "completion_length": 3112.0208435058594, + "epoch": 0.18514285714285714, + "grad_norm": 0.18071846663951874, + "kl": 0.00931549072265625, + "lambda_div_used": 0.6, + "learning_rate": 8.693068314414344e-07, + "loss": 0.0004, + "reward": -0.08226488158106804, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.08226488158106804, + "reward_after_std": 0.717128112912178, + "reward_before_mean": 0.283852843567729, + "reward_before_std": 0.7811130806803703, + "reward_change_max": 0.0, + "reward_change_mean": -0.36611773632466793, + "reward_change_min": -0.7776030413806438, + "reward_change_std": 0.3201664984226227, + "reward_std": 0.7171281166374683, + "rewards/cosine_scaled_reward": -0.0455735728610307, + "rewards/format_reward": 0.3750000037252903, + "step": 162 + }, + { + "advantage_max": 1.8762730807065964, + "advantage_mean": -3.725290076417309e-09, + "advantage_min": -0.8003713823854923, + "advantage_std": 0.9998403191566467, + "completion_length": 2399.6041870117188, + "epoch": 0.18628571428571428, + "grad_norm": 0.21399880945682526, + "kl": 0.004780769348144531, + "lambda_div_used": 0.6, + "learning_rate": 8.670853944836176e-07, + "loss": 0.0002, + "reward": 0.26743081398308277, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.26743081398308277, + "reward_after_std": 0.7890615798532963, + "reward_before_mean": 0.7968443520367146, + "reward_before_std": 0.6803100085817277, + "reward_change_max": 0.0009098872542381287, + "reward_change_mean": -0.5294135119765997, + "reward_change_min": -0.8948076628148556, + "reward_change_std": 0.34071714151650667, + "reward_std": 0.7890615947544575, + "rewards/cosine_scaled_reward": 0.08592214062809944, + "rewards/format_reward": 0.6250000055879354, + "step": 163 + }, + { + "advantage_max": 1.8100349009037018, + "advantage_mean": 2.545615029880821e-08, + "advantage_min": -0.9734407514333725, + "advantage_std": 0.9997857436537743, + "completion_length": 2345.1667251586914, + "epoch": 0.18742857142857142, + "grad_norm": 0.2537487745285034, + "kl": 0.004994392395019531, + "lambda_div_used": 0.6, + "learning_rate": 8.648485032310144e-07, + "loss": 0.0002, + "reward": 0.16789760813117027, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.16789760813117027, + "reward_after_std": 0.6567693762481213, + "reward_before_mean": 0.676574507728219, + "reward_before_std": 0.6022562142461538, + "reward_change_max": 0.0014617964625358582, + "reward_change_mean": -0.508676890283823, + "reward_change_min": -0.8728748597204685, + "reward_change_std": 0.3502998370677233, + "reward_std": 0.6567693911492825, + "rewards/cosine_scaled_reward": 0.03620391618460417, + "rewards/format_reward": 0.6041666753590107, + "step": 164 + }, + { + "advantage_max": 1.8189441114664078, + "advantage_mean": 1.6142925884921056e-08, + "advantage_min": -1.040967583656311, + "advantage_std": 0.9997537732124329, + "completion_length": 2431.6250610351562, + "epoch": 0.18857142857142858, + "grad_norm": 0.24487285315990448, + "kl": 0.005176544189453125, + "lambda_div_used": 0.6, + "learning_rate": 8.625962667065487e-07, + "loss": 0.0002, + "reward": -0.33520201966166496, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.33520201966166496, + "reward_after_std": 0.4438677877187729, + "reward_before_mean": -0.06665671244263649, + "reward_before_std": 0.4154568985104561, + "reward_change_max": 0.0014463141560554504, + "reward_change_mean": -0.2685453128069639, + "reward_change_min": -0.4497813992202282, + "reward_change_std": 0.1854629199951887, + "reward_std": 0.4438677951693535, + "rewards/cosine_scaled_reward": -0.25207835622131824, + "rewards/format_reward": 0.4375000111758709, + "step": 165 + }, + { + "advantage_max": 1.8155362904071808, + "advantage_mean": 5.898376925772553e-09, + "advantage_min": -0.8460855633020401, + "advantage_std": 0.999803401529789, + "completion_length": 2481.645881652832, + "epoch": 0.18971428571428572, + "grad_norm": 0.17104172706604004, + "kl": 0.003920555114746094, + "lambda_div_used": 0.6, + "learning_rate": 8.603287946810513e-07, + "loss": 0.0002, + "reward": -0.06894381903111935, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.06894381903111935, + "reward_after_std": 0.7122244127094746, + "reward_before_mean": 0.29707395657896996, + "reward_before_std": 0.7052637934684753, + "reward_change_max": 0.0003136247396469116, + "reward_change_mean": -0.3660177676938474, + "reward_change_min": -0.7036398574709892, + "reward_change_std": 0.28611435275524855, + "reward_std": 0.7122244350612164, + "rewards/cosine_scaled_reward": -0.11187970079481602, + "rewards/format_reward": 0.5208333432674408, + "step": 166 + }, + { + "advantage_max": 1.822963908314705, + "advantage_mean": -1.3659398390153399e-08, + "advantage_min": -0.856605276465416, + "advantage_std": 0.9998670220375061, + "completion_length": 2174.6667404174805, + "epoch": 0.19085714285714286, + "grad_norm": 0.1990378051996231, + "kl": 0.0035414695739746094, + "lambda_div_used": 0.6, + "learning_rate": 8.580461976679099e-07, + "loss": 0.0001, + "reward": 0.13649043417535722, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.13649043417535722, + "reward_after_std": 0.9280563369393349, + "reward_before_mean": 0.5691012926399708, + "reward_before_std": 0.9118999019265175, + "reward_change_max": 0.00039912760257720947, + "reward_change_mean": -0.432610847055912, + "reward_change_min": -0.8885245770215988, + "reward_change_std": 0.34525672532618046, + "reward_std": 0.9280563518404961, + "rewards/cosine_scaled_reward": -0.09044937463477254, + "rewards/format_reward": 0.7500000111758709, + "step": 167 + }, + { + "advantage_max": 1.8217055052518845, + "advantage_mean": 1.5832484961952886e-08, + "advantage_min": -0.9166247323155403, + "advantage_std": 0.9998238533735275, + "completion_length": 2714.3959197998047, + "epoch": 0.192, + "grad_norm": 0.2229907512664795, + "kl": 0.0042667388916015625, + "lambda_div_used": 0.6, + "learning_rate": 8.557485869176825e-07, + "loss": 0.0002, + "reward": -0.14375681872479618, + "reward_advantage_correlation": 0.9999999999999997, + "reward_after_mean": -0.14375681872479618, + "reward_after_std": 0.731827724725008, + "reward_before_mean": 0.17551885545253754, + "reward_before_std": 0.7568799871951342, + "reward_change_max": 0.0009675100445747375, + "reward_change_mean": -0.31927567534148693, + "reward_change_min": -0.7318533398211002, + "reward_change_std": 0.28689664974808693, + "reward_std": 0.7318277433514595, + "rewards/cosine_scaled_reward": -0.15182391228154302, + "rewards/format_reward": 0.47916668094694614, + "step": 168 + }, + { + "advantage_max": 1.8260702639818192, + "advantage_mean": -3.228585088166369e-08, + "advantage_min": -0.9273155555129051, + "advantage_std": 0.9998803958296776, + "completion_length": 1702.958381652832, + "epoch": 0.19314285714285714, + "grad_norm": 0.20557190477848053, + "kl": 0.0043087005615234375, + "lambda_div_used": 0.6, + "learning_rate": 8.534360744126753e-07, + "loss": 0.0002, + "reward": 0.6049684919416904, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.6049684919416904, + "reward_after_std": 0.9428669586777687, + "reward_before_mean": 1.2980639263987541, + "reward_before_std": 0.870427917689085, + "reward_change_max": 0.0, + "reward_change_mean": -0.6930954158306122, + "reward_change_min": -1.1992580592632294, + "reward_change_std": 0.47029072418808937, + "reward_std": 0.9428669735789299, + "rewards/cosine_scaled_reward": 0.2219486115500331, + "rewards/format_reward": 0.854166679084301, + "step": 169 + }, + { + "advantage_max": 1.8410781174898148, + "advantage_mean": -2.0178655718572358e-08, + "advantage_min": -0.8170875944197178, + "advantage_std": 0.999827153980732, + "completion_length": 2252.854217529297, + "epoch": 0.19428571428571428, + "grad_norm": 0.24229690432548523, + "kl": 0.0041675567626953125, + "lambda_div_used": 0.6, + "learning_rate": 8.511087728614862e-07, + "loss": 0.0002, + "reward": 0.09339876628291677, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.09339876628291677, + "reward_after_std": 0.6783297061920166, + "reward_before_mean": 0.5516661715810187, + "reward_before_std": 0.592248173430562, + "reward_change_max": 0.0025824084877967834, + "reward_change_mean": -0.458267405629158, + "reward_change_min": -0.825487308204174, + "reward_change_std": 0.3272961787879467, + "reward_std": 0.6783297136425972, + "rewards/cosine_scaled_reward": 0.015416416805237532, + "rewards/format_reward": 0.5208333414047956, + "step": 170 + }, + { + "advantage_max": 1.8135200291872025, + "advantage_mean": 9.002785184009099e-09, + "advantage_min": -0.9001466482877731, + "advantage_std": 0.9998125210404396, + "completion_length": 2336.0208740234375, + "epoch": 0.19542857142857142, + "grad_norm": 0.22073253989219666, + "kl": 0.0040073394775390625, + "lambda_div_used": 0.6, + "learning_rate": 8.487667956935087e-07, + "loss": 0.0002, + "reward": -0.02373066544532776, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.02373066544532776, + "reward_after_std": 0.743452426046133, + "reward_before_mean": 0.3574244696646929, + "reward_before_std": 0.735035166144371, + "reward_change_max": 0.004195690155029297, + "reward_change_mean": -0.38115513836964965, + "reward_change_min": -0.7801596075296402, + "reward_change_std": 0.2982376590371132, + "reward_std": 0.7434524372220039, + "rewards/cosine_scaled_reward": -0.07128777727484703, + "rewards/format_reward": 0.5000000111758709, + "step": 171 + }, + { + "advantage_max": 1.835044652223587, + "advantage_mean": -2.3593505649177615e-08, + "advantage_min": -0.9643918573856354, + "advantage_std": 0.999810203909874, + "completion_length": 2526.7083435058594, + "epoch": 0.19657142857142856, + "grad_norm": 0.24751603603363037, + "kl": 0.005962371826171875, + "lambda_div_used": 0.6, + "learning_rate": 8.464102570534061e-07, + "loss": 0.0002, + "reward": 0.2530949041247368, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.2530949041247368, + "reward_after_std": 0.7152044381946325, + "reward_before_mean": 0.7942209746688604, + "reward_before_std": 0.642445981502533, + "reward_change_max": 0.0018026381731033325, + "reward_change_mean": -0.5411260761320591, + "reward_change_min": -0.8798088431358337, + "reward_change_std": 0.3599870717152953, + "reward_std": 0.7152044512331486, + "rewards/cosine_scaled_reward": 0.15752714965492487, + "rewards/format_reward": 0.47916667349636555, + "step": 172 + }, + { + "advantage_max": 1.8556355088949203, + "advantage_mean": -2.220446049250313e-16, + "advantage_min": -0.8953605890274048, + "advantage_std": 0.9997614771127701, + "completion_length": 1534.0625343322754, + "epoch": 0.1977142857142857, + "grad_norm": 0.27481091022491455, + "kl": 0.004489898681640625, + "lambda_div_used": 0.6, + "learning_rate": 8.440392717955475e-07, + "loss": 0.0002, + "reward": 0.0394736472517252, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.0394736472517252, + "reward_after_std": 0.588263314217329, + "reward_before_mean": 0.48301520850509405, + "reward_before_std": 0.5054080621339381, + "reward_change_max": 0.0009268000721931458, + "reward_change_mean": -0.4435415752232075, + "reward_change_min": -0.653259627521038, + "reward_change_std": 0.2701799310743809, + "reward_std": 0.5882633421570063, + "rewards/cosine_scaled_reward": -0.12307572877034545, + "rewards/format_reward": 0.7291666679084301, + "step": 173 + }, + { + "advantage_max": 1.9079472124576569, + "advantage_mean": -3.4148496252939253e-09, + "advantage_min": -0.8583328798413277, + "advantage_std": 0.9998527467250824, + "completion_length": 1573.0208892822266, + "epoch": 0.19885714285714284, + "grad_norm": 0.3282460868358612, + "kl": 0.015633583068847656, + "lambda_div_used": 0.6, + "learning_rate": 8.416539554784089e-07, + "loss": 0.0006, + "reward": 0.3546122731640935, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.3546122731640935, + "reward_after_std": 0.7721526771783829, + "reward_before_mean": 0.9311862445902079, + "reward_before_std": 0.6311989836394787, + "reward_change_max": 0.0, + "reward_change_mean": -0.5765739753842354, + "reward_change_min": -0.9307240545749664, + "reward_change_std": 0.3313662502914667, + "reward_std": 0.7721527107059956, + "rewards/cosine_scaled_reward": 0.03850978892296553, + "rewards/format_reward": 0.8541666716337204, + "step": 174 + }, + { + "advantage_max": 1.8203274607658386, + "advantage_mean": -1.3038516932795119e-08, + "advantage_min": -0.8509106487035751, + "advantage_std": 0.999811090528965, + "completion_length": 2614.9583587646484, + "epoch": 0.2, + "grad_norm": 0.1870286613702774, + "kl": 0.0050258636474609375, + "lambda_div_used": 0.6, + "learning_rate": 8.392544243589427e-07, + "loss": 0.0002, + "reward": 0.13974158838391304, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.13974158838391304, + "reward_after_std": 0.6508477814495564, + "reward_before_mean": 0.6339031849056482, + "reward_before_std": 0.5851920321583748, + "reward_change_max": 0.0010417401790618896, + "reward_change_mean": -0.49416152387857437, + "reward_change_min": -0.8415251597762108, + "reward_change_std": 0.3433941416442394, + "reward_std": 0.6508478038012981, + "rewards/cosine_scaled_reward": 0.06695156544446945, + "rewards/format_reward": 0.5, + "step": 175 + }, + { + "advantage_max": 1.8568035066127777, + "advantage_mean": 6.829699361610153e-09, + "advantage_min": -0.8530491329729557, + "advantage_std": 0.999842494726181, + "completion_length": 1939.8750534057617, + "epoch": 0.20114285714285715, + "grad_norm": 0.22778016328811646, + "kl": 0.0038318634033203125, + "lambda_div_used": 0.6, + "learning_rate": 8.368407953869103e-07, + "loss": 0.0002, + "reward": 0.2435381426475942, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.2435381426475942, + "reward_after_std": 0.804627712816, + "reward_before_mean": 0.7599546858109534, + "reward_before_std": 0.7316133556887507, + "reward_change_max": 0.0, + "reward_change_mean": -0.5164165645837784, + "reward_change_min": -0.9660816714167595, + "reward_change_std": 0.3689718898385763, + "reward_std": 0.8046277277171612, + "rewards/cosine_scaled_reward": 0.015394015703350306, + "rewards/format_reward": 0.7291666716337204, + "step": 176 + }, + { + "advantage_max": 1.8439493477344513, + "advantage_mean": 7.799827006493842e-09, + "advantage_min": -0.9431622736155987, + "advantage_std": 0.9998452067375183, + "completion_length": 2418.000068664551, + "epoch": 0.2022857142857143, + "grad_norm": 0.21349747478961945, + "kl": 0.004181861877441406, + "lambda_div_used": 0.6, + "learning_rate": 8.344131861991828e-07, + "loss": 0.0002, + "reward": 0.23039765562862158, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.23039765562862158, + "reward_after_std": 0.7614034824073315, + "reward_before_mean": 0.7456597234122455, + "reward_before_std": 0.6677466258406639, + "reward_change_max": 0.0008978694677352905, + "reward_change_mean": -0.5152620803564787, + "reward_change_min": -0.8554310090839863, + "reward_change_std": 0.3227596618235111, + "reward_std": 0.7614035047590733, + "rewards/cosine_scaled_reward": 0.07074652705341578, + "rewards/format_reward": 0.6041666697710752, + "step": 177 + }, + { + "advantage_max": 1.8696839362382889, + "advantage_mean": 1.4280280069556284e-08, + "advantage_min": -0.8557280600070953, + "advantage_std": 0.9998672232031822, + "completion_length": 1876.7500305175781, + "epoch": 0.20342857142857143, + "grad_norm": 0.21730564534664154, + "kl": 0.00606536865234375, + "lambda_div_used": 0.6, + "learning_rate": 8.319717151140072e-07, + "loss": 0.0002, + "reward": 0.17197159988427302, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.17197159988427302, + "reward_after_std": 0.9298111759126186, + "reward_before_mean": 0.6148823341354728, + "reward_before_std": 0.8897008560597897, + "reward_change_max": 0.0001780092716217041, + "reward_change_mean": -0.4429107364267111, + "reward_change_min": -0.8260507620871067, + "reward_change_std": 0.3189305029809475, + "reward_std": 0.9298111908137798, + "rewards/cosine_scaled_reward": -0.07797550270333886, + "rewards/format_reward": 0.7708333414047956, + "step": 178 + }, + { + "advantage_max": 1.7686524093151093, + "advantage_mean": 1.241763691872677e-09, + "advantage_min": -1.0116918981075287, + "advantage_std": 0.999799333512783, + "completion_length": 2257.770866394043, + "epoch": 0.20457142857142857, + "grad_norm": 0.23837533593177795, + "kl": 0.00466156005859375, + "lambda_div_used": 0.6, + "learning_rate": 8.295165011252396e-07, + "loss": 0.0002, + "reward": -0.06246851943433285, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.06246851943433285, + "reward_after_std": 0.6824757643043995, + "reward_before_mean": 0.3122526276856661, + "reward_before_std": 0.6739065647125244, + "reward_change_max": 0.000336281955242157, + "reward_change_mean": -0.3747211927548051, + "reward_change_min": -0.6586055941879749, + "reward_change_std": 0.2646934259682894, + "reward_std": 0.6824758015573025, + "rewards/cosine_scaled_reward": -0.10429035313427448, + "rewards/format_reward": 0.520833333954215, + "step": 179 + }, + { + "advantage_max": 1.8220677822828293, + "advantage_mean": -2.980232316485143e-08, + "advantage_min": -0.9002445340156555, + "advantage_std": 0.9998351335525513, + "completion_length": 1576.7083740234375, + "epoch": 0.2057142857142857, + "grad_norm": 0.259470134973526, + "kl": 0.006572723388671875, + "lambda_div_used": 0.6, + "learning_rate": 8.270476638965461e-07, + "loss": 0.0003, + "reward": 0.44151231832802296, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.44151231832802296, + "reward_after_std": 0.8216301538050175, + "reward_before_mean": 1.0700802188366652, + "reward_before_std": 0.7572079785168171, + "reward_change_max": 0.0004591718316078186, + "reward_change_mean": -0.628567922860384, + "reward_change_min": -1.0779259391129017, + "reward_change_std": 0.4193782526999712, + "reward_std": 0.821630172431469, + "rewards/cosine_scaled_reward": 0.12879010662436485, + "rewards/format_reward": 0.8125000074505806, + "step": 180 + }, + { + "advantage_max": 1.8709893822669983, + "advantage_mean": 3.197540909827268e-08, + "advantage_min": -0.8805593922734261, + "advantage_std": 0.999825045466423, + "completion_length": 2620.750030517578, + "epoch": 0.20685714285714285, + "grad_norm": 0.19660595059394836, + "kl": 0.00618743896484375, + "lambda_div_used": 0.6, + "learning_rate": 8.245653237555705e-07, + "loss": 0.0002, + "reward": 0.13136461284011602, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.13136461284011602, + "reward_after_std": 0.8224067687988281, + "reward_before_mean": 0.5772053925320506, + "reward_before_std": 0.7477911319583654, + "reward_change_max": 0.0013022571802139282, + "reward_change_mean": -0.44584078434854746, + "reward_change_min": -0.7741091437637806, + "reward_change_std": 0.3098747590556741, + "reward_std": 0.8224067911505699, + "rewards/cosine_scaled_reward": 0.038602693006396294, + "rewards/format_reward": 0.5000000074505806, + "step": 181 + }, + { + "advantage_max": 1.8176920861005783, + "advantage_mean": 2.204130183924846e-08, + "advantage_min": -0.9655944332480431, + "advantage_std": 0.9998245909810066, + "completion_length": 1977.2291946411133, + "epoch": 0.208, + "grad_norm": 0.1666197031736374, + "kl": 0.0028371810913085938, + "lambda_div_used": 0.6, + "learning_rate": 8.220696016880687e-07, + "loss": 0.0001, + "reward": 0.06960967741906643, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.06960967741906643, + "reward_after_std": 0.771950475871563, + "reward_before_mean": 0.49913039430975914, + "reward_before_std": 0.7607616037130356, + "reward_change_max": 0.0009039789438247681, + "reward_change_mean": -0.4295206815004349, + "reward_change_min": -0.7781954184174538, + "reward_change_std": 0.3130391491577029, + "reward_std": 0.7719505093991756, + "rewards/cosine_scaled_reward": -0.09418481879401952, + "rewards/format_reward": 0.6875000074505806, + "step": 182 + }, + { + "advantage_max": 1.780486524105072, + "advantage_mean": -4.967054045845742e-09, + "advantage_min": -1.0759043172001839, + "advantage_std": 0.9998541921377182, + "completion_length": 1652.2709045410156, + "epoch": 0.20914285714285713, + "grad_norm": 0.24588677287101746, + "kl": 0.006328582763671875, + "lambda_div_used": 0.6, + "learning_rate": 8.195606193320136e-07, + "loss": 0.0003, + "reward": 0.34637500741519034, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.34637500741519034, + "reward_after_std": 0.828385803848505, + "reward_before_mean": 0.9232957623898983, + "reward_before_std": 0.8138195388019085, + "reward_change_max": 0.0002490729093551636, + "reward_change_mean": -0.5769207254052162, + "reward_change_min": -0.9477719999849796, + "reward_change_std": 0.38944604992866516, + "reward_std": 0.8283858075737953, + "rewards/cosine_scaled_reward": 0.04498119559139013, + "rewards/format_reward": 0.8333333544433117, + "step": 183 + }, + { + "advantage_max": 1.8851233571767807, + "advantage_mean": 1.8626449271863521e-09, + "advantage_min": -0.9318333119153976, + "advantage_std": 0.9998067617416382, + "completion_length": 1994.8541831970215, + "epoch": 0.2102857142857143, + "grad_norm": 0.2858082056045532, + "kl": 0.0061016082763671875, + "lambda_div_used": 0.6, + "learning_rate": 8.170384989716657e-07, + "loss": 0.0002, + "reward": -0.03480223537189886, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.03480223537189886, + "reward_after_std": 0.575218565762043, + "reward_before_mean": 0.36644532158970833, + "reward_before_std": 0.47599741257727146, + "reward_change_max": 0.0002749115228652954, + "reward_change_mean": -0.40124754048883915, + "reward_change_min": -0.6298751458525658, + "reward_change_std": 0.2488370854407549, + "reward_std": 0.5752185806632042, + "rewards/cosine_scaled_reward": -0.15011069364845753, + "rewards/format_reward": 0.6666666753590107, + "step": 184 + }, + { + "advantage_max": 1.793496459722519, + "advantage_mean": 8.692344288796505e-09, + "advantage_min": -1.0042466521263123, + "advantage_std": 0.9997899830341339, + "completion_length": 1939.1250534057617, + "epoch": 0.21142857142857144, + "grad_norm": 0.2225847840309143, + "kl": 0.0044708251953125, + "lambda_div_used": 0.6, + "learning_rate": 8.145033635316128e-07, + "loss": 0.0002, + "reward": -0.03830873221158981, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.03830873221158981, + "reward_after_std": 0.541305024176836, + "reward_before_mean": 0.3769722692668438, + "reward_before_std": 0.5038091838359833, + "reward_change_max": 0.0, + "reward_change_mean": -0.41528095677495, + "reward_change_min": -0.6677366159856319, + "reward_change_std": 0.26928949914872646, + "reward_std": 0.5413050428032875, + "rewards/cosine_scaled_reward": -0.14484722539782524, + "rewards/format_reward": 0.666666679084301, + "step": 185 + }, + { + "advantage_max": 1.8013376891613007, + "advantage_mean": 3.97364305904091e-08, + "advantage_min": -0.9621232002973557, + "advantage_std": 0.999778263270855, + "completion_length": 2184.291702270508, + "epoch": 0.21257142857142858, + "grad_norm": 0.21466225385665894, + "kl": 0.005809783935546875, + "lambda_div_used": 0.6, + "learning_rate": 8.119553365707802e-07, + "loss": 0.0002, + "reward": -0.1045905682258308, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.1045905682258308, + "reward_after_std": 0.6125866509974003, + "reward_before_mean": 0.26320910919457674, + "reward_before_std": 0.6129262074828148, + "reward_change_max": 0.0007858425378799438, + "reward_change_mean": -0.3677996820770204, + "reward_change_min": -0.6774780340492725, + "reward_change_std": 0.2679179043043405, + "reward_std": 0.6125866509974003, + "rewards/cosine_scaled_reward": -0.14964544959366322, + "rewards/format_reward": 0.562500013038516, + "step": 186 + }, + { + "advantage_max": 1.853604942560196, + "advantage_mean": 1.0554989715583218e-08, + "advantage_min": -0.9163635894656181, + "advantage_std": 0.9997911527752876, + "completion_length": 1584.1458587646484, + "epoch": 0.21371428571428572, + "grad_norm": 0.30856335163116455, + "kl": 0.007468223571777344, + "lambda_div_used": 0.6, + "learning_rate": 8.093945422764069e-07, + "loss": 0.0003, + "reward": 0.07727741380222142, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.07727741380222142, + "reward_after_std": 0.5403100177645683, + "reward_before_mean": 0.5557239428162575, + "reward_before_std": 0.4463806804269552, + "reward_change_max": 0.0005810186266899109, + "reward_change_mean": -0.478446539491415, + "reward_change_min": -0.7569740489125252, + "reward_change_std": 0.28474703803658485, + "reward_std": 0.5403100252151489, + "rewards/cosine_scaled_reward": -0.13880470301955938, + "rewards/format_reward": 0.8333333358168602, + "step": 187 + }, + { + "advantage_max": 1.826738864183426, + "advantage_mean": 1.071021038523412e-08, + "advantage_min": -0.9134683609008789, + "advantage_std": 0.9997814372181892, + "completion_length": 2628.9583587646484, + "epoch": 0.21485714285714286, + "grad_norm": 0.2049739956855774, + "kl": 0.0063419342041015625, + "lambda_div_used": 0.6, + "learning_rate": 8.068211054579943e-07, + "loss": 0.0003, + "reward": -0.13351251278072596, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.13351251278072596, + "reward_after_std": 0.6621253341436386, + "reward_before_mean": 0.20025232713669538, + "reward_before_std": 0.6175753083080053, + "reward_change_max": 0.0006356909871101379, + "reward_change_mean": -0.3337648417800665, + "reward_change_min": -0.5791598781943321, + "reward_change_std": 0.2304175542667508, + "reward_std": 0.6621253415942192, + "rewards/cosine_scaled_reward": -0.10820717085152864, + "rewards/format_reward": 0.4166666679084301, + "step": 188 + }, + { + "advantage_max": 1.8772830069065094, + "advantage_mean": 1.1175871450497255e-08, + "advantage_min": -0.7826530821621418, + "advantage_std": 0.9998155608773232, + "completion_length": 1632.5208854675293, + "epoch": 0.216, + "grad_norm": 0.24881505966186523, + "kl": 0.0060138702392578125, + "lambda_div_used": 0.6, + "learning_rate": 8.04235151541222e-07, + "loss": 0.0002, + "reward": 0.09520456660538912, + "reward_advantage_correlation": 0.9999999999999997, + "reward_after_mean": 0.09520456660538912, + "reward_after_std": 0.7265063393861055, + "reward_before_mean": 0.540639003738761, + "reward_before_std": 0.6537737995386124, + "reward_change_max": 0.0, + "reward_change_mean": -0.44543439242988825, + "reward_change_min": -0.7373391389846802, + "reward_change_std": 0.2890320150181651, + "reward_std": 0.7265063729137182, + "rewards/cosine_scaled_reward": -0.10468053352087736, + "rewards/format_reward": 0.7500000037252903, + "step": 189 + }, + { + "advantage_max": 1.9335829019546509, + "advantage_mean": 3.7252901874396116e-09, + "advantage_min": -0.7080438584089279, + "advantage_std": 0.9998445361852646, + "completion_length": 1214.8333740234375, + "epoch": 0.21714285714285714, + "grad_norm": 0.2329777032136917, + "kl": 0.004730224609375, + "lambda_div_used": 0.6, + "learning_rate": 8.01636806561836e-07, + "loss": 0.0002, + "reward": 0.33617127127945423, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.33617127127945423, + "reward_after_std": 0.7745129093527794, + "reward_before_mean": 0.9009418655186892, + "reward_before_std": 0.6023400258272886, + "reward_change_max": 0.0, + "reward_change_mean": -0.5647705905139446, + "reward_change_min": -0.8931130617856979, + "reward_change_std": 0.3234950266778469, + "reward_std": 0.7745129317045212, + "rewards/cosine_scaled_reward": -0.01827908866107464, + "rewards/format_reward": 0.9375, + "step": 190 + }, + { + "advantage_max": 1.9248203337192535, + "advantage_mean": 1.2417640249395845e-09, + "advantage_min": -0.7405965775251389, + "advantage_std": 0.9998944103717804, + "completion_length": 1295.3541984558105, + "epoch": 0.21828571428571428, + "grad_norm": 0.2606213688850403, + "kl": 0.006084442138671875, + "lambda_div_used": 0.6, + "learning_rate": 7.990261971595048e-07, + "loss": 0.0002, + "reward": 0.4639074660371989, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.4639074660371989, + "reward_after_std": 1.0598181448876858, + "reward_before_mean": 1.0392481312155724, + "reward_before_std": 0.9292659349739552, + "reward_change_max": 0.0, + "reward_change_mean": -0.5753406658768654, + "reward_change_min": -1.0478120222687721, + "reward_change_std": 0.3785528726875782, + "reward_std": 1.0598181709647179, + "rewards/cosine_scaled_reward": 0.0821240646764636, + "rewards/format_reward": 0.8750000111758709, + "step": 191 + }, + { + "advantage_max": 1.851755753159523, + "advantage_mean": 4.346172255420555e-09, + "advantage_min": -0.9270682707428932, + "advantage_std": 0.9998274967074394, + "completion_length": 2081.604217529297, + "epoch": 0.21942857142857142, + "grad_norm": 0.205403134226799, + "kl": 0.005340576171875, + "lambda_div_used": 0.6, + "learning_rate": 7.964034505716476e-07, + "loss": 0.0002, + "reward": 0.044922725297510624, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.044922725297510624, + "reward_after_std": 0.7124022357165813, + "reward_before_mean": 0.46672992315143347, + "reward_before_std": 0.6790667753666639, + "reward_change_max": 0.00206892192363739, + "reward_change_mean": -0.4218071922659874, + "reward_change_min": -0.7962087951600552, + "reward_change_std": 0.29674767330288887, + "reward_std": 0.7124022506177425, + "rewards/cosine_scaled_reward": -0.11038506031036377, + "rewards/format_reward": 0.6875000242143869, + "step": 192 + }, + { + "advantage_max": 1.8514807373285294, + "advantage_mean": 1.986821618338297e-08, + "advantage_min": -0.8810537457466125, + "advantage_std": 0.9998353496193886, + "completion_length": 2794.187545776367, + "epoch": 0.22057142857142858, + "grad_norm": 0.2158997654914856, + "kl": 0.00675201416015625, + "lambda_div_used": 0.6, + "learning_rate": 7.93768694627233e-07, + "loss": 0.0003, + "reward": -0.12474042293615639, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.12474042293615639, + "reward_after_std": 0.8645276762545109, + "reward_before_mean": 0.1677975022175815, + "reward_before_std": 0.8510575741529465, + "reward_change_max": 0.0004886612296104431, + "reward_change_mean": -0.2925379406660795, + "reward_change_min": -0.6505083031952381, + "reward_change_std": 0.25192677415907383, + "reward_std": 0.8645276948809624, + "rewards/cosine_scaled_reward": -0.13485124241560698, + "rewards/format_reward": 0.43750000186264515, + "step": 193 + }, + { + "advantage_max": 1.8288027048110962, + "advantage_mean": -2.359350592673337e-08, + "advantage_min": -0.8574704006314278, + "advantage_std": 0.9998577386140823, + "completion_length": 2348.520851135254, + "epoch": 0.22171428571428572, + "grad_norm": 0.19087940454483032, + "kl": 0.005785942077636719, + "lambda_div_used": 0.6, + "learning_rate": 7.911220577405484e-07, + "loss": 0.0002, + "reward": 0.3952038553543389, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.3952038553543389, + "reward_after_std": 0.9202344082295895, + "reward_before_mean": 0.9806380420923233, + "reward_before_std": 0.8940357845276594, + "reward_change_max": 0.0010172724723815918, + "reward_change_mean": -0.5854342035017908, + "reward_change_min": -1.0540791526436806, + "reward_change_std": 0.43106903275474906, + "reward_std": 0.9202344380319118, + "rewards/cosine_scaled_reward": 0.13615235313773155, + "rewards/format_reward": 0.7083333395421505, + "step": 194 + }, + { + "advantage_max": 1.8929940164089203, + "advantage_mean": -8.381903393583912e-09, + "advantage_min": -0.8231491893529892, + "advantage_std": 0.999856561422348, + "completion_length": 1668.1042175292969, + "epoch": 0.22285714285714286, + "grad_norm": 0.21075581014156342, + "kl": 0.006229400634765625, + "lambda_div_used": 0.6, + "learning_rate": 7.884636689049422e-07, + "loss": 0.0002, + "reward": 0.21669870428740978, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.21669870428740978, + "reward_after_std": 0.8580862581729889, + "reward_before_mean": 0.7015649350360036, + "reward_before_std": 0.7897386755794287, + "reward_change_max": 0.0, + "reward_change_mean": -0.4848662167787552, + "reward_change_min": -0.9058547914028168, + "reward_change_std": 0.32796779833734035, + "reward_std": 0.8580862581729889, + "rewards/cosine_scaled_reward": -0.05546754505485296, + "rewards/format_reward": 0.8125000055879354, + "step": 195 + }, + { + "advantage_max": 1.8233862221240997, + "advantage_mean": 1.83160115962977e-08, + "advantage_min": -1.0622686967253685, + "advantage_std": 0.9998345524072647, + "completion_length": 2790.8334350585938, + "epoch": 0.224, + "grad_norm": 0.22471381723880768, + "kl": 0.0061893463134765625, + "lambda_div_used": 0.6, + "learning_rate": 7.857936576865356e-07, + "loss": 0.0002, + "reward": -0.0495893070474267, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.0495893070474267, + "reward_after_std": 0.7146314792335033, + "reward_before_mean": 0.323355401866138, + "reward_before_std": 0.7049362137913704, + "reward_change_max": 0.002869725227355957, + "reward_change_mean": -0.37294470332562923, + "reward_change_min": -0.6940531842410564, + "reward_change_std": 0.28026892617344856, + "reward_std": 0.7146315313875675, + "rewards/cosine_scaled_reward": -0.10915563208982348, + "rewards/format_reward": 0.5416666772216558, + "step": 196 + }, + { + "advantage_max": 1.844907984137535, + "advantage_mean": -2.6697914323747796e-08, + "advantage_min": -0.9053036347031593, + "advantage_std": 0.9998667389154434, + "completion_length": 1176.5000305175781, + "epoch": 0.22514285714285714, + "grad_norm": 0.2676119804382324, + "kl": 0.006092071533203125, + "lambda_div_used": 0.6, + "learning_rate": 7.831121542179086e-07, + "loss": 0.0002, + "reward": 0.4123532408848405, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.4123532408848405, + "reward_after_std": 0.920974288135767, + "reward_before_mean": 0.9987042904831469, + "reward_before_std": 0.8744922038167715, + "reward_change_max": 0.0, + "reward_change_mean": -0.5863510705530643, + "reward_change_min": -1.008712936192751, + "reward_change_std": 0.3892252929508686, + "reward_std": 0.9209743067622185, + "rewards/cosine_scaled_reward": 0.06185212981654331, + "rewards/format_reward": 0.8750000055879354, + "step": 197 + }, + { + "advantage_max": 1.8616387248039246, + "advantage_mean": 5.27749466350258e-09, + "advantage_min": -0.8514187633991241, + "advantage_std": 0.9998800158500671, + "completion_length": 1831.145896911621, + "epoch": 0.22628571428571428, + "grad_norm": 0.266026109457016, + "kl": 0.0087127685546875, + "lambda_div_used": 0.6, + "learning_rate": 7.804192891917571e-07, + "loss": 0.0003, + "reward": 0.21275895088911057, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.21275895088911057, + "reward_after_std": 0.9901725947856903, + "reward_before_mean": 0.6730580711737275, + "reward_before_std": 0.9783835522830486, + "reward_change_max": 0.0005161240696907043, + "reward_change_mean": -0.46029909048229456, + "reward_change_min": -0.8294646516442299, + "reward_change_std": 0.3421561336144805, + "reward_std": 0.9901726320385933, + "rewards/cosine_scaled_reward": -0.007220972329378128, + "rewards/format_reward": 0.6875000111758709, + "step": 198 + }, + { + "advantage_max": 1.869808241724968, + "advantage_mean": -6.829699250587851e-09, + "advantage_min": -0.9215875342488289, + "advantage_std": 0.9998573064804077, + "completion_length": 1522.3333892822266, + "epoch": 0.22742857142857142, + "grad_norm": 0.24699227511882782, + "kl": 0.006649017333984375, + "lambda_div_used": 0.6, + "learning_rate": 7.777151938545235e-07, + "loss": 0.0003, + "reward": 0.2865274213254452, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.2865274213254452, + "reward_after_std": 0.829144075512886, + "reward_before_mean": 0.8162419125437737, + "reward_before_std": 0.7331322841346264, + "reward_change_max": 0.0, + "reward_change_mean": -0.5297144390642643, + "reward_change_min": -0.86338210105896, + "reward_change_std": 0.3267682734876871, + "reward_std": 0.829144112765789, + "rewards/cosine_scaled_reward": -0.06062907166779041, + "rewards/format_reward": 0.9375000149011612, + "step": 199 + }, + { + "advantage_max": 1.8471790850162506, + "advantage_mean": 8.692344288796505e-09, + "advantage_min": -0.9032345339655876, + "advantage_std": 0.99985521286726, + "completion_length": 1256.3958740234375, + "epoch": 0.22857142857142856, + "grad_norm": 0.23767714202404022, + "kl": 0.006656646728515625, + "lambda_div_used": 0.6, + "learning_rate": 7.75e-07, + "loss": 0.0003, + "reward": 0.37511107651516795, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.37511107651516795, + "reward_after_std": 0.8456577584147453, + "reward_before_mean": 0.95660699903965, + "reward_before_std": 0.7694406695663929, + "reward_change_max": 0.0009586066007614136, + "reward_change_mean": -0.5814958810806274, + "reward_change_min": -0.9831447154283524, + "reward_change_std": 0.37729886546730995, + "reward_std": 0.8456577733159065, + "rewards/cosine_scaled_reward": 0.019970136578194797, + "rewards/format_reward": 0.916666679084301, + "step": 200 + }, + { + "advantage_max": 1.8327146768569946, + "advantage_mean": -1.738468857759301e-08, + "advantage_min": -0.9730896577239037, + "advantage_std": 0.9998484998941422, + "completion_length": 1927.604232788086, + "epoch": 0.2297142857142857, + "grad_norm": 0.25266122817993164, + "kl": 0.0050640106201171875, + "lambda_div_used": 0.6, + "learning_rate": 7.72273839962904e-07, + "loss": 0.0002, + "reward": 0.6098271571099758, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.6098271571099758, + "reward_after_std": 0.86701575294137, + "reward_before_mean": 1.3185914978384972, + "reward_before_std": 0.7631920166313648, + "reward_change_max": 0.0, + "reward_change_mean": -0.7087643798440695, + "reward_change_min": -1.1464748308062553, + "reward_change_std": 0.45284009352326393, + "reward_std": 0.8670158125460148, + "rewards/cosine_scaled_reward": 0.2842957489192486, + "rewards/format_reward": 0.7500000149011612, + "step": 201 + }, + { + "advantage_max": 1.9493790417909622, + "advantage_mean": 2.483526384544632e-09, + "advantage_min": -0.7426571026444435, + "advantage_std": 0.9998154044151306, + "completion_length": 1572.9792175292969, + "epoch": 0.23085714285714284, + "grad_norm": 0.2120855450630188, + "kl": 0.0058650970458984375, + "lambda_div_used": 0.6, + "learning_rate": 7.695368466124296e-07, + "loss": 0.0002, + "reward": 0.481074046343565, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.481074046343565, + "reward_after_std": 0.6121453009545803, + "reward_before_mean": 1.1622148640453815, + "reward_before_std": 0.3702765265479684, + "reward_change_max": 0.0005568414926528931, + "reward_change_mean": -0.6811407832428813, + "reward_change_min": -0.9480189867317677, + "reward_change_std": 0.3608372900635004, + "reward_std": 0.6121453046798706, + "rewards/cosine_scaled_reward": 0.19569074362516403, + "rewards/format_reward": 0.7708333395421505, + "step": 202 + }, + { + "advantage_max": 1.860578492283821, + "advantage_mean": -3.1044056214568627e-10, + "advantage_min": -0.9007564336061478, + "advantage_std": 0.9998569637537003, + "completion_length": 1608.2708892822266, + "epoch": 0.232, + "grad_norm": 0.2628607451915741, + "kl": 0.008453369140625, + "lambda_div_used": 0.6, + "learning_rate": 7.667891533457718e-07, + "loss": 0.0003, + "reward": 0.25236531626433134, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.25236531626433134, + "reward_after_std": 0.8142339959740639, + "reward_before_mean": 0.7688097134232521, + "reward_before_std": 0.7407113090157509, + "reward_change_max": 0.0, + "reward_change_mean": -0.516444344073534, + "reward_change_min": -0.9059380665421486, + "reward_change_std": 0.3518046382814646, + "reward_std": 0.814234022051096, + "rewards/cosine_scaled_reward": -0.021845156326889992, + "rewards/format_reward": 0.8125000149011612, + "step": 203 + }, + { + "advantage_max": 1.856035828590393, + "advantage_mean": -1.0554989660072067e-08, + "advantage_min": -0.8498663008213043, + "advantage_std": 0.9998530447483063, + "completion_length": 1364.708381652832, + "epoch": 0.23314285714285715, + "grad_norm": 0.3182263970375061, + "kl": 0.00948333740234375, + "lambda_div_used": 0.6, + "learning_rate": 7.640308940816239e-07, + "loss": 0.0004, + "reward": 0.45779778249561787, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.45779778249561787, + "reward_after_std": 0.8125110119581223, + "reward_before_mean": 1.0898098573088646, + "reward_before_std": 0.7080512810498476, + "reward_change_max": 0.0, + "reward_change_mean": -0.632012028247118, + "reward_change_min": -1.0780591741204262, + "reward_change_std": 0.4018330071121454, + "reward_std": 0.8125110454857349, + "rewards/cosine_scaled_reward": 0.07615489140152931, + "rewards/format_reward": 0.9375000074505806, + "step": 204 + }, + { + "advantage_max": 1.831876128911972, + "advantage_mean": -1.241764135961887e-09, + "advantage_min": -0.8645824491977692, + "advantage_std": 0.9998550340533257, + "completion_length": 1722.0000381469727, + "epoch": 0.2342857142857143, + "grad_norm": 0.2667372226715088, + "kl": 0.0058498382568359375, + "lambda_div_used": 0.6, + "learning_rate": 7.612622032536507e-07, + "loss": 0.0002, + "reward": 0.5514694144949317, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.5514694144949317, + "reward_after_std": 0.8580240793526173, + "reward_before_mean": 1.2323165144771338, + "reward_before_std": 0.7656144704669714, + "reward_change_max": 0.0, + "reward_change_mean": -0.6808471139520407, + "reward_change_min": -1.1130521781742573, + "reward_change_std": 0.45496470108628273, + "reward_std": 0.8580241277813911, + "rewards/cosine_scaled_reward": 0.22032492235302925, + "rewards/format_reward": 0.7916666753590107, + "step": 205 + }, + { + "advantage_max": 1.7926378697156906, + "advantage_mean": 2.1730859334212482e-09, + "advantage_min": -0.8720656037330627, + "advantage_std": 0.9998156204819679, + "completion_length": 2273.604217529297, + "epoch": 0.23542857142857143, + "grad_norm": 0.2223757952451706, + "kl": 0.0063800811767578125, + "lambda_div_used": 0.6, + "learning_rate": 7.584832158039378e-07, + "loss": 0.0003, + "reward": -0.14681187830865383, + "reward_advantage_correlation": 0.9999999999999997, + "reward_after_mean": -0.14681187830865383, + "reward_after_std": 0.6314323507249355, + "reward_before_mean": 0.18663795664906502, + "reward_before_std": 0.5911546461284161, + "reward_change_max": 0.0013406351208686829, + "reward_change_mean": -0.33344983868300915, + "reward_change_min": -0.6240692362189293, + "reward_change_std": 0.24192245677113533, + "reward_std": 0.6314323656260967, + "rewards/cosine_scaled_reward": -0.19834769575390965, + "rewards/format_reward": 0.5833333358168602, + "step": 206 + }, + { + "advantage_max": 1.8526040315628052, + "advantage_mean": -1.676380612103401e-08, + "advantage_min": -0.8606942147016525, + "advantage_std": 0.9998470023274422, + "completion_length": 1725.4167022705078, + "epoch": 0.23657142857142857, + "grad_norm": 0.32430991530418396, + "kl": 0.00893402099609375, + "lambda_div_used": 0.6, + "learning_rate": 7.556940671764124e-07, + "loss": 0.0004, + "reward": 0.19386290735565126, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.19386290735565126, + "reward_after_std": 0.7678088247776031, + "reward_before_mean": 0.6845569107681513, + "reward_before_std": 0.7046758942306042, + "reward_change_max": 0.000944972038269043, + "reward_change_mean": -0.4906939994543791, + "reward_change_min": -0.8508261777460575, + "reward_change_std": 0.31916314363479614, + "reward_std": 0.767808835953474, + "rewards/cosine_scaled_reward": -0.05355490278452635, + "rewards/format_reward": 0.791666679084301, + "step": 207 + }, + { + "advantage_max": 1.839926853775978, + "advantage_mean": -4.03573130469681e-09, + "advantage_min": -0.9513271749019623, + "advantage_std": 0.9998404234647751, + "completion_length": 1415.0625228881836, + "epoch": 0.2377142857142857, + "grad_norm": 0.19833095371723175, + "kl": 0.006793975830078125, + "lambda_div_used": 0.6, + "learning_rate": 7.528948933102438e-07, + "loss": 0.0003, + "reward": 0.408255933172768, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.408255933172768, + "reward_after_std": 0.7480129301548004, + "reward_before_mean": 1.027973834425211, + "reward_before_std": 0.6748812645673752, + "reward_change_max": 0.0, + "reward_change_mean": -0.6197178699076176, + "reward_change_min": -0.9957869723439217, + "reward_change_std": 0.3788326345384121, + "reward_std": 0.7480129413306713, + "rewards/cosine_scaled_reward": 0.04523690603673458, + "rewards/format_reward": 0.9375000074505806, + "step": 208 + }, + { + "advantage_max": 1.8184748589992523, + "advantage_mean": 1.2107194080623884e-08, + "advantage_min": -1.0376386195421219, + "advantage_std": 0.9998557269573212, + "completion_length": 1503.3958702087402, + "epoch": 0.23885714285714285, + "grad_norm": 0.32442209124565125, + "kl": 0.00881195068359375, + "lambda_div_used": 0.6, + "learning_rate": 7.500858306332172e-07, + "loss": 0.0004, + "reward": 0.2773610055446625, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.2773610055446625, + "reward_after_std": 0.744460966438055, + "reward_before_mean": 0.8234799510391895, + "reward_before_std": 0.6716284602880478, + "reward_change_max": 0.003235235810279846, + "reward_change_mean": -0.546118900179863, + "reward_change_min": -0.8975829593837261, + "reward_change_std": 0.3473086543381214, + "reward_std": 0.7444609776139259, + "rewards/cosine_scaled_reward": 0.01590662496164441, + "rewards/format_reward": 0.7916666753590107, + "step": 209 + }, + { + "advantage_max": 1.7693659961223602, + "advantage_mean": -3.1044084525255755e-09, + "advantage_min": -1.111761063337326, + "advantage_std": 0.9998294115066528, + "completion_length": 1683.0000305175781, + "epoch": 0.24, + "grad_norm": 0.1759955734014511, + "kl": 0.006259918212890625, + "lambda_div_used": 0.6, + "learning_rate": 7.472670160550848e-07, + "loss": 0.0003, + "reward": 0.21515829270356335, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.21515829270356335, + "reward_after_std": 0.7110634557902813, + "reward_before_mean": 0.7379409018903971, + "reward_before_std": 0.675159614533186, + "reward_change_max": 0.0, + "reward_change_mean": -0.5227825716137886, + "reward_change_min": -0.8667676635086536, + "reward_change_std": 0.33463000506162643, + "reward_std": 0.7110634669661522, + "rewards/cosine_scaled_reward": -0.03727956488728523, + "rewards/format_reward": 0.8125000149011612, + "step": 210 + }, + { + "advantage_max": 1.945738285779953, + "advantage_mean": 2.793967829317623e-08, + "advantage_min": -0.7639892995357513, + "advantage_std": 0.999823622405529, + "completion_length": 1397.020866394043, + "epoch": 0.24114285714285713, + "grad_norm": 0.2246849238872528, + "kl": 0.007781982421875, + "lambda_div_used": 0.6, + "learning_rate": 7.444385869608921e-07, + "loss": 0.0003, + "reward": 0.29541015811264515, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.29541015811264515, + "reward_after_std": 0.6922118216753006, + "reward_before_mean": 0.8543685399927199, + "reward_before_std": 0.5085874684154987, + "reward_change_max": 0.0004105120897293091, + "reward_change_mean": -0.5589583879336715, + "reward_change_min": -0.812763299793005, + "reward_change_std": 0.3149664308875799, + "reward_std": 0.6922118328511715, + "rewards/cosine_scaled_reward": 0.031350934877991676, + "rewards/format_reward": 0.7916666716337204, + "step": 211 + }, + { + "advantage_max": 1.8842217326164246, + "advantage_mean": -1.7384689132704523e-08, + "advantage_min": -0.9026188924908638, + "advantage_std": 0.999860368669033, + "completion_length": 1130.6667022705078, + "epoch": 0.2422857142857143, + "grad_norm": 0.2639113962650299, + "kl": 0.007846832275390625, + "lambda_div_used": 0.6, + "learning_rate": 7.416006812042827e-07, + "loss": 0.0003, + "reward": 0.506486542057246, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.506486542057246, + "reward_after_std": 0.8082983270287514, + "reward_before_mean": 1.1632536239922047, + "reward_before_std": 0.6659324653446674, + "reward_change_max": 0.0, + "reward_change_mean": -0.656767075881362, + "reward_change_min": -0.9595914296805859, + "reward_change_std": 0.38042050041258335, + "reward_std": 0.8082983493804932, + "rewards/cosine_scaled_reward": 0.1337101130047813, + "rewards/format_reward": 0.8958333395421505, + "step": 212 + }, + { + "advantage_max": 1.9224452078342438, + "advantage_mean": 1.862645149230957e-09, + "advantage_min": -0.7717320919036865, + "advantage_std": 0.9998671188950539, + "completion_length": 1562.6042022705078, + "epoch": 0.24342857142857144, + "grad_norm": 0.3185400366783142, + "kl": 0.010986328125, + "lambda_div_used": 0.6, + "learning_rate": 7.387534371007797e-07, + "loss": 0.0004, + "reward": 0.3678184101881925, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.3678184101881925, + "reward_after_std": 0.9026115350425243, + "reward_before_mean": 0.9258643062785268, + "reward_before_std": 0.7626326903700829, + "reward_change_max": 0.0, + "reward_change_mean": -0.5580459162592888, + "reward_change_min": -0.9071638658642769, + "reward_change_std": 0.3532382473349571, + "reward_std": 0.9026115611195564, + "rewards/cosine_scaled_reward": 0.046265478784334846, + "rewards/format_reward": 0.8333333395421505, + "step": 213 + }, + { + "advantage_max": 1.8441757261753082, + "advantage_mean": -1.2883295541499251e-08, + "advantage_min": -0.9523207470774651, + "advantage_std": 0.9998089149594307, + "completion_length": 1889.5625228881836, + "epoch": 0.24457142857142858, + "grad_norm": 0.3396517336368561, + "kl": 0.008754730224609375, + "lambda_div_used": 0.6, + "learning_rate": 7.358969934210438e-07, + "loss": 0.0004, + "reward": 0.17065197927877307, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.17065197927877307, + "reward_after_std": 0.6326207704842091, + "reward_before_mean": 0.6843567192554474, + "reward_before_std": 0.5557381771504879, + "reward_change_max": 5.299597978591919e-05, + "reward_change_mean": -0.5137047506868839, + "reward_change_min": -0.8281576447188854, + "reward_change_std": 0.3360777208581567, + "reward_std": 0.63262078166008, + "rewards/cosine_scaled_reward": -0.04323832131922245, + "rewards/format_reward": 0.7708333432674408, + "step": 214 + }, + { + "advantage_max": 1.8047952502965927, + "advantage_mean": -6.984919156960423e-09, + "advantage_min": -0.949731320142746, + "advantage_std": 0.9998000115156174, + "completion_length": 1343.3750305175781, + "epoch": 0.24571428571428572, + "grad_norm": 0.28229522705078125, + "kl": 0.006084442138671875, + "lambda_div_used": 0.6, + "learning_rate": 7.330314893841101e-07, + "loss": 0.0002, + "reward": 0.11037498325458728, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.11037498325458728, + "reward_after_std": 0.5453722551465034, + "reward_before_mean": 0.6039579696953297, + "reward_before_std": 0.4900472089648247, + "reward_change_max": 0.0009583085775375366, + "reward_change_mean": -0.49358299002051353, + "reward_change_min": -0.7654079273343086, + "reward_change_std": 0.29697378166019917, + "reward_std": 0.5453722700476646, + "rewards/cosine_scaled_reward": -0.12510435469448566, + "rewards/format_reward": 0.8541666716337204, + "step": 215 + }, + { + "advantage_max": 1.8079268336296082, + "advantage_mean": -4.3461718668424965e-09, + "advantage_min": -0.9888656884431839, + "advantage_std": 0.9998346269130707, + "completion_length": 1141.7291831970215, + "epoch": 0.24685714285714286, + "grad_norm": 0.2945360243320465, + "kl": 0.008228302001953125, + "lambda_div_used": 0.6, + "learning_rate": 7.301570646506027e-07, + "loss": 0.0003, + "reward": 0.5667337765917182, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.5667337765917182, + "reward_after_std": 0.7148325145244598, + "reward_before_mean": 1.2823062911629677, + "reward_before_std": 0.599778238683939, + "reward_change_max": 0.0007646828889846802, + "reward_change_mean": -0.7155724987387657, + "reward_change_min": -1.0847780294716358, + "reward_change_std": 0.4218271281570196, + "reward_std": 0.7148325331509113, + "rewards/cosine_scaled_reward": 0.2036531288176775, + "rewards/format_reward": 0.875, + "step": 216 + }, + { + "advantage_max": 1.9577355980873108, + "advantage_mean": -3.476937759927523e-08, + "advantage_min": -0.6729211881756783, + "advantage_std": 0.9998743459582329, + "completion_length": 1361.7500381469727, + "epoch": 0.248, + "grad_norm": 0.23953408002853394, + "kl": 0.0068817138671875, + "lambda_div_used": 0.6, + "learning_rate": 7.27273859315928e-07, + "loss": 0.0003, + "reward": 0.3585895048454404, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.3585895048454404, + "reward_after_std": 0.9671962819993496, + "reward_before_mean": 0.894859915599227, + "reward_before_std": 0.817153150215745, + "reward_change_max": 0.0, + "reward_change_mean": -0.5362704452127218, + "reward_change_min": -0.9340154454112053, + "reward_change_std": 0.35429083555936813, + "reward_std": 0.9671962969005108, + "rewards/cosine_scaled_reward": 0.020346628269180655, + "rewards/format_reward": 0.8541666679084301, + "step": 217 + }, + { + "advantage_max": 1.8746764808893204, + "advantage_mean": 6.208815683805824e-10, + "advantage_min": -0.8116071596741676, + "advantage_std": 0.9998414590954781, + "completion_length": 1506.9792251586914, + "epoch": 0.24914285714285714, + "grad_norm": 0.2781512141227722, + "kl": 0.006923675537109375, + "lambda_div_used": 0.6, + "learning_rate": 7.243820139034464e-07, + "loss": 0.0003, + "reward": 0.06333183636888862, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.06333183636888862, + "reward_after_std": 0.6848684512078762, + "reward_before_mean": 0.49841352133080363, + "reward_before_std": 0.6076563782989979, + "reward_change_max": 0.00116710364818573, + "reward_change_mean": -0.4350816644728184, + "reward_change_min": -0.7371813729405403, + "reward_change_std": 0.27808909490704536, + "reward_std": 0.6848684698343277, + "rewards/cosine_scaled_reward": -0.17787658236920834, + "rewards/format_reward": 0.854166679084301, + "step": 218 + }, + { + "advantage_max": 1.8024124205112457, + "advantage_mean": -1.6763806787167823e-08, + "advantage_min": -0.9114049077033997, + "advantage_std": 0.9998286366462708, + "completion_length": 1521.5833587646484, + "epoch": 0.2502857142857143, + "grad_norm": 0.29169994592666626, + "kl": 0.008913993835449219, + "lambda_div_used": 0.6, + "learning_rate": 7.214816693576234e-07, + "loss": 0.0004, + "reward": 0.35595322732115164, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.35595322732115164, + "reward_after_std": 0.7003892213106155, + "reward_before_mean": 0.959366999566555, + "reward_before_std": 0.6458369642496109, + "reward_change_max": 0.0, + "reward_change_mean": -0.6034137718379498, + "reward_change_min": -0.9950582347810268, + "reward_change_std": 0.3782676439732313, + "reward_std": 0.7003892250359058, + "rewards/cosine_scaled_reward": 0.05260014161467552, + "rewards/format_reward": 0.8541666697710752, + "step": 219 + }, + { + "advantage_max": 1.8146659433841705, + "advantage_mean": 1.614292521878724e-08, + "advantage_min": -1.0002572685480118, + "advantage_std": 0.999789372086525, + "completion_length": 1465.7708587646484, + "epoch": 0.25142857142857145, + "grad_norm": 0.3001108765602112, + "kl": 0.007877349853515625, + "lambda_div_used": 0.6, + "learning_rate": 7.185729670371604e-07, + "loss": 0.0003, + "reward": -0.0560973163228482, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.0560973163228482, + "reward_after_std": 0.5878814682364464, + "reward_before_mean": 0.33406518027186394, + "reward_before_std": 0.5440696775913239, + "reward_change_max": 0.0018651336431503296, + "reward_change_mean": -0.3901625070720911, + "reward_change_min": -0.6622885875403881, + "reward_change_std": 0.25839220359921455, + "reward_std": 0.5878814794123173, + "rewards/cosine_scaled_reward": -0.24963408894836903, + "rewards/format_reward": 0.8333333488553762, + "step": 220 + }, + { + "advantage_max": 1.9332973212003708, + "advantage_mean": -4.2840840319691154e-08, + "advantage_min": -0.6719999387860298, + "advantage_std": 0.9998391345143318, + "completion_length": 1181.5208740234375, + "epoch": 0.25257142857142856, + "grad_norm": 0.23093506693840027, + "kl": 0.005725860595703125, + "lambda_div_used": 0.6, + "learning_rate": 7.156560487081051e-07, + "loss": 0.0002, + "reward": 0.5290301127824932, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.5290301127824932, + "reward_after_std": 0.7129516340792179, + "reward_before_mean": 1.2169622369110584, + "reward_before_std": 0.4925988847389817, + "reward_change_max": 0.0, + "reward_change_mean": -0.6879321187734604, + "reward_change_min": -1.0727623589336872, + "reward_change_std": 0.38971428014338017, + "reward_std": 0.7129516452550888, + "rewards/cosine_scaled_reward": 0.16056441084947437, + "rewards/format_reward": 0.895833333954215, + "step": 221 + }, + { + "advantage_max": 1.8785310834646225, + "advantage_mean": -1.179675268581093e-08, + "advantage_min": -0.8870665952563286, + "advantage_std": 0.9998476952314377, + "completion_length": 1451.041690826416, + "epoch": 0.2537142857142857, + "grad_norm": 0.27896323800086975, + "kl": 0.008668899536132812, + "lambda_div_used": 0.6, + "learning_rate": 7.127310565369415e-07, + "loss": 0.0003, + "reward": 0.2508357478072867, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.2508357478072867, + "reward_after_std": 0.7334035821259022, + "reward_before_mean": 0.7815669402480125, + "reward_before_std": 0.6281927190721035, + "reward_change_max": 0.0011235624551773071, + "reward_change_mean": -0.530731188133359, + "reward_change_min": -0.8242725133895874, + "reward_change_std": 0.3266385905444622, + "reward_std": 0.7334035895764828, + "rewards/cosine_scaled_reward": 0.005366799421608448, + "rewards/format_reward": 0.7708333376795053, + "step": 222 + }, + { + "advantage_max": 1.8369051963090897, + "advantage_mean": 2.6697914656814703e-08, + "advantage_min": -0.8896066732704639, + "advantage_std": 0.9998190328478813, + "completion_length": 1759.4375305175781, + "epoch": 0.25485714285714284, + "grad_norm": 0.20690949261188507, + "kl": 0.007503509521484375, + "lambda_div_used": 0.6, + "learning_rate": 7.097981330836616e-07, + "loss": 0.0003, + "reward": 0.3009045707876794, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.3009045707876794, + "reward_after_std": 0.6722562201321125, + "reward_before_mean": 0.8796945922076702, + "reward_before_std": 0.5771395843476057, + "reward_change_max": 0.0011511072516441345, + "reward_change_mean": -0.5787900015711784, + "reward_change_min": -0.9382209926843643, + "reward_change_std": 0.36548776365816593, + "reward_std": 0.6722562275826931, + "rewards/cosine_scaled_reward": 0.09609728120267391, + "rewards/format_reward": 0.687500013038516, + "step": 223 + }, + { + "advantage_max": 1.8380918353796005, + "advantage_mean": -1.117587122845265e-08, + "advantage_min": -0.8600411862134933, + "advantage_std": 0.9998573735356331, + "completion_length": 1852.3958740234375, + "epoch": 0.256, + "grad_norm": 0.19519250094890594, + "kl": 0.0054874420166015625, + "lambda_div_used": 0.6, + "learning_rate": 7.068574212948169e-07, + "loss": 0.0002, + "reward": 0.2570896605029702, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.2570896605029702, + "reward_after_std": 0.8001469634473324, + "reward_before_mean": 0.7829917408525944, + "reward_before_std": 0.7353864461183548, + "reward_change_max": 0.0004997849464416504, + "reward_change_mean": -0.5259020812809467, + "reward_change_min": -0.9049793109297752, + "reward_change_std": 0.344460004940629, + "reward_std": 0.8001469932496548, + "rewards/cosine_scaled_reward": -0.03558747028000653, + "rewards/format_reward": 0.8541666697710752, + "step": 224 + }, + { + "advantage_max": 1.8619542717933655, + "advantage_mean": -2.4835264955669345e-09, + "advantage_min": -0.92620949447155, + "advantage_std": 0.9998472630977631, + "completion_length": 1862.6875495910645, + "epoch": 0.2571428571428571, + "grad_norm": 0.2828025817871094, + "kl": 0.01148223876953125, + "lambda_div_used": 0.6, + "learning_rate": 7.039090644965509e-07, + "loss": 0.0005, + "reward": 0.1418031924404204, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.1418031924404204, + "reward_after_std": 0.8276291191577911, + "reward_before_mean": 0.5936808623373508, + "reward_before_std": 0.7773008048534393, + "reward_change_max": 0.00016012787818908691, + "reward_change_mean": -0.45187763683497906, + "reward_change_min": -0.8189064487814903, + "reward_change_std": 0.30999560933560133, + "reward_std": 0.8276291415095329, + "rewards/cosine_scaled_reward": -0.057326255831867456, + "rewards/format_reward": 0.7083333507180214, + "step": 225 + }, + { + "advantage_max": 1.9105704128742218, + "advantage_mean": 3.104408341503273e-09, + "advantage_min": -0.7978261336684227, + "advantage_std": 0.9998422935605049, + "completion_length": 1464.1667022705078, + "epoch": 0.2582857142857143, + "grad_norm": 0.21695449948310852, + "kl": 0.0057373046875, + "lambda_div_used": 0.6, + "learning_rate": 7.009532063876148e-07, + "loss": 0.0002, + "reward": 0.4355794661678374, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.4355794661678374, + "reward_after_std": 0.7580258473753929, + "reward_before_mean": 1.0637911558151245, + "reward_before_std": 0.6122870594263077, + "reward_change_max": 0.0, + "reward_change_mean": -0.6282116770744324, + "reward_change_min": -0.9939616173505783, + "reward_change_std": 0.36170411482453346, + "reward_std": 0.7580258622765541, + "rewards/cosine_scaled_reward": 0.05272890208289027, + "rewards/format_reward": 0.9583333432674408, + "step": 226 + }, + { + "advantage_max": 1.880305826663971, + "advantage_mean": -9.313226134732844e-09, + "advantage_min": -0.8974742889404297, + "advantage_std": 0.9998089522123337, + "completion_length": 1090.1666946411133, + "epoch": 0.25942857142857145, + "grad_norm": 0.29648357629776, + "kl": 0.007843017578125, + "lambda_div_used": 0.6, + "learning_rate": 6.979899910323624e-07, + "loss": 0.0003, + "reward": 0.1774475951679051, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.1774475951679051, + "reward_after_std": 0.6307259686291218, + "reward_before_mean": 0.6877003312110901, + "reward_before_std": 0.5237771086394787, + "reward_change_max": 0.0, + "reward_change_mean": -0.5102527514100075, + "reward_change_min": -0.827692836523056, + "reward_change_std": 0.295247383415699, + "reward_std": 0.6307259723544121, + "rewards/cosine_scaled_reward": -0.1561498325318098, + "rewards/format_reward": 1.0, + "step": 227 + }, + { + "advantage_max": 1.8451535552740097, + "advantage_mean": -9.934108202713787e-09, + "advantage_min": -0.9320072717964649, + "advantage_std": 0.9998236298561096, + "completion_length": 1345.3750228881836, + "epoch": 0.26057142857142856, + "grad_norm": 0.2331770807504654, + "kl": 0.00678253173828125, + "lambda_div_used": 0.6, + "learning_rate": 6.950195628537299e-07, + "loss": 0.0003, + "reward": 0.5653695005457848, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.5653695005457848, + "reward_after_std": 0.6357578784227371, + "reward_before_mean": 1.294594332575798, + "reward_before_std": 0.46521945111453533, + "reward_change_max": 0.0007578954100608826, + "reward_change_mean": -0.729224868118763, + "reward_change_min": -1.0465231351554394, + "reward_change_std": 0.40724730119109154, + "reward_std": 0.6357578970491886, + "rewards/cosine_scaled_reward": 0.23063051141798496, + "rewards/format_reward": 0.8333333358168602, + "step": 228 + }, + { + "advantage_max": 1.8811430782079697, + "advantage_mean": -5.432715166620028e-09, + "advantage_min": -0.8746685832738876, + "advantage_std": 0.9998182356357574, + "completion_length": 1425.8542022705078, + "epoch": 0.26171428571428573, + "grad_norm": 0.2102670818567276, + "kl": 0.00843048095703125, + "lambda_div_used": 0.6, + "learning_rate": 6.920420666261961e-07, + "loss": 0.0003, + "reward": 0.21790653312928043, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.21790653312928043, + "reward_after_std": 0.6236037760972977, + "reward_before_mean": 0.7517698602750897, + "reward_before_std": 0.49720613472163677, + "reward_change_max": 0.0, + "reward_change_mean": -0.5338633358478546, + "reward_change_min": -0.8096239566802979, + "reward_change_std": 0.30554310977458954, + "reward_std": 0.6236037984490395, + "rewards/cosine_scaled_reward": -0.061615096405148506, + "rewards/format_reward": 0.8750000074505806, + "step": 229 + }, + { + "advantage_max": 1.8803343623876572, + "advantage_mean": 5.898376453927767e-09, + "advantage_min": -0.9339370280504227, + "advantage_std": 0.9997898861765862, + "completion_length": 1745.7916870117188, + "epoch": 0.26285714285714284, + "grad_norm": 0.20653089880943298, + "kl": 0.00656890869140625, + "lambda_div_used": 0.6, + "learning_rate": 6.890576474687263e-07, + "loss": 0.0003, + "reward": 0.045102519914507866, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.045102519914507866, + "reward_after_std": 0.5806776024401188, + "reward_before_mean": 0.49274973943829536, + "reward_before_std": 0.4847143702208996, + "reward_change_max": 0.0, + "reward_change_mean": -0.44764724373817444, + "reward_change_min": -0.7104385755956173, + "reward_change_std": 0.26539971493184566, + "reward_std": 0.5806776247918606, + "rewards/cosine_scaled_reward": -0.14945847261697054, + "rewards/format_reward": 0.791666679084301, + "step": 230 + }, + { + "advantage_max": 1.925698772072792, + "advantage_mean": -3.1044085080367267e-09, + "advantage_min": -0.7799288704991341, + "advantage_std": 0.9998774752020836, + "completion_length": 1532.3125381469727, + "epoch": 0.264, + "grad_norm": 0.22522780299186707, + "kl": 0.008289337158203125, + "lambda_div_used": 0.6, + "learning_rate": 6.860664508377001e-07, + "loss": 0.0003, + "reward": 0.49276258889585733, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.49276258889585733, + "reward_after_std": 0.9082431048154831, + "reward_before_mean": 1.1163499159738421, + "reward_before_std": 0.7488042302429676, + "reward_change_max": 0.0, + "reward_change_mean": -0.6235873140394688, + "reward_change_min": -0.9923263862729073, + "reward_change_std": 0.361046951264143, + "reward_std": 0.9082431308925152, + "rewards/cosine_scaled_reward": 0.09984159865416586, + "rewards/format_reward": 0.9166666716337204, + "step": 231 + }, + { + "advantage_max": 1.7980208545923233, + "advantage_mean": 1.6763806787167823e-08, + "advantage_min": -1.0510611981153488, + "advantage_std": 0.9998587220907211, + "completion_length": 2133.2917251586914, + "epoch": 0.2651428571428571, + "grad_norm": 0.259899377822876, + "kl": 0.00949859619140625, + "lambda_div_used": 0.6, + "learning_rate": 6.83068622519821e-07, + "loss": 0.0004, + "reward": -0.0644418615847826, + "reward_advantage_correlation": 0.9999999999999997, + "reward_after_mean": -0.0644418615847826, + "reward_after_std": 0.7792413085699081, + "reward_before_mean": 0.28795089083723724, + "reward_before_std": 0.7958154529333115, + "reward_change_max": 0.0005818530917167664, + "reward_change_mean": -0.35239275731146336, + "reward_change_min": -0.7125130780041218, + "reward_change_std": 0.28704385086894035, + "reward_std": 0.7792413383722305, + "rewards/cosine_scaled_reward": -0.1997745493426919, + "rewards/format_reward": 0.6875000242143869, + "step": 232 + }, + { + "advantage_max": 1.9218673408031464, + "advantage_mean": -6.51925802230835e-09, + "advantage_min": -0.7960270717740059, + "advantage_std": 0.9998332411050797, + "completion_length": 1141.5208892822266, + "epoch": 0.2662857142857143, + "grad_norm": 0.22768016159534454, + "kl": 0.005718231201171875, + "lambda_div_used": 0.6, + "learning_rate": 6.800643086250121e-07, + "loss": 0.0002, + "reward": 0.2717605981742963, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.2717605981742963, + "reward_after_std": 0.7569721080362797, + "reward_before_mean": 0.8070176914334297, + "reward_before_std": 0.6270077042281628, + "reward_change_max": 0.0, + "reward_change_mean": -0.53525710105896, + "reward_change_min": -0.835021048784256, + "reward_change_std": 0.3094108831137419, + "reward_std": 0.7569721303880215, + "rewards/cosine_scaled_reward": -0.06524115987122059, + "rewards/format_reward": 0.9375000074505806, + "step": 233 + }, + { + "advantage_max": 1.9480681866407394, + "advantage_mean": 3.104409063148239e-09, + "advantage_min": -0.6695451363921165, + "advantage_std": 0.9997960180044174, + "completion_length": 1621.6250267028809, + "epoch": 0.2674285714285714, + "grad_norm": 0.25094732642173767, + "kl": 0.008075714111328125, + "lambda_div_used": 0.6, + "learning_rate": 6.770536555792944e-07, + "loss": 0.0003, + "reward": 0.15022319613490254, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.15022319613490254, + "reward_after_std": 0.743482006713748, + "reward_before_mean": 0.6215195916593075, + "reward_before_std": 0.6112698167562485, + "reward_change_max": 0.0, + "reward_change_mean": -0.47129638865590096, + "reward_change_min": -0.8236280344426632, + "reward_change_std": 0.3019194579683244, + "reward_std": 0.7434820216149092, + "rewards/cosine_scaled_reward": -0.08507354371249676, + "rewards/format_reward": 0.7916666716337204, + "step": 234 + }, + { + "advantage_max": 1.9268249720335007, + "advantage_mean": -2.452482850134885e-08, + "advantage_min": -0.7404474690556526, + "advantage_std": 0.9998696148395538, + "completion_length": 1109.9167098999023, + "epoch": 0.26857142857142857, + "grad_norm": 0.24515864253044128, + "kl": 0.006595611572265625, + "lambda_div_used": 0.6, + "learning_rate": 6.740368101176495e-07, + "loss": 0.0003, + "reward": 0.5859628189355135, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.5859628189355135, + "reward_after_std": 0.8730616346001625, + "reward_before_mean": 1.2709032725542784, + "reward_before_std": 0.6743786497972906, + "reward_change_max": 0.0007496699690818787, + "reward_change_mean": -0.684940479695797, + "reward_change_min": -1.027261570096016, + "reward_change_std": 0.39483822137117386, + "reward_std": 0.8730616644024849, + "rewards/cosine_scaled_reward": 0.1667016496649012, + "rewards/format_reward": 0.9375000074505806, + "step": 235 + }, + { + "advantage_max": 1.8696034252643585, + "advantage_mean": 1.8626452047421083e-09, + "advantage_min": -0.7846398204565048, + "advantage_std": 0.9998518154025078, + "completion_length": 1792.4584045410156, + "epoch": 0.26971428571428574, + "grad_norm": 0.21424059569835663, + "kl": 0.006832122802734375, + "lambda_div_used": 0.6, + "learning_rate": 6.710139192768694e-07, + "loss": 0.0003, + "reward": 0.21139677381142974, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.21139677381142974, + "reward_after_std": 0.9058232493698597, + "reward_before_mean": 0.685417864471674, + "reward_before_std": 0.8372405208647251, + "reward_change_max": 0.0, + "reward_change_mean": -0.4740210734307766, + "reward_change_min": -0.8449614234268665, + "reward_change_std": 0.3198982533067465, + "reward_std": 0.9058232642710209, + "rewards/cosine_scaled_reward": -0.04270775453187525, + "rewards/format_reward": 0.7708333358168602, + "step": 236 + }, + { + "advantage_max": 1.885415256023407, + "advantage_mean": -4.190951696791956e-09, + "advantage_min": -0.8234371915459633, + "advantage_std": 0.9998277798295021, + "completion_length": 1472.395866394043, + "epoch": 0.27085714285714285, + "grad_norm": 0.24223242700099945, + "kl": 0.00577545166015625, + "lambda_div_used": 0.6, + "learning_rate": 6.679851303883891e-07, + "loss": 0.0002, + "reward": 0.3880492812022567, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.3880492812022567, + "reward_after_std": 0.7263778820633888, + "reward_before_mean": 0.9952268004417419, + "reward_before_std": 0.57838967256248, + "reward_change_max": 0.0, + "reward_change_mean": -0.607177471742034, + "reward_change_min": -0.9197544753551483, + "reward_change_std": 0.3471406549215317, + "reward_std": 0.7263779193162918, + "rewards/cosine_scaled_reward": 0.0601133843883872, + "rewards/format_reward": 0.875, + "step": 237 + }, + { + "advantage_max": 1.9389768987894058, + "advantage_mean": 2.1730859889323995e-09, + "advantage_min": -0.7344888895750046, + "advantage_std": 0.9998518079519272, + "completion_length": 1094.6667175292969, + "epoch": 0.272, + "grad_norm": 0.24234844744205475, + "kl": 0.00812530517578125, + "lambda_div_used": 0.6, + "learning_rate": 6.649505910711058e-07, + "loss": 0.0003, + "reward": 0.4462637463584542, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.4462637463584542, + "reward_after_std": 0.868531309068203, + "reward_before_mean": 1.0553820058703423, + "reward_before_std": 0.7022292437031865, + "reward_change_max": 0.0, + "reward_change_mean": -0.6091182753443718, + "reward_change_min": -0.9754799082875252, + "reward_change_std": 0.3624349180608988, + "reward_std": 0.8685313127934933, + "rewards/cosine_scaled_reward": 0.03810766385868192, + "rewards/format_reward": 0.9791666716337204, + "step": 238 + }, + { + "advantage_max": 1.8280943185091019, + "advantage_mean": 1.3348957383918503e-08, + "advantage_min": -0.9786246418952942, + "advantage_std": 0.9998012036085129, + "completion_length": 1448.6041870117188, + "epoch": 0.27314285714285713, + "grad_norm": 0.22774183750152588, + "kl": 0.005977630615234375, + "lambda_div_used": 0.6, + "learning_rate": 6.619104492241847e-07, + "loss": 0.0002, + "reward": 0.5649529304355383, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.5649529304355383, + "reward_after_std": 0.6479036398231983, + "reward_before_mean": 1.2952700648456812, + "reward_before_std": 0.51944032125175, + "reward_change_max": 0.0005435720086097717, + "reward_change_mean": -0.7303171027451754, + "reward_change_min": -1.0367931462824345, + "reward_change_std": 0.41321991570293903, + "reward_std": 0.6479036472737789, + "rewards/cosine_scaled_reward": 0.262218339368701, + "rewards/format_reward": 0.770833333954215, + "step": 239 + }, + { + "advantage_max": 1.7769131064414978, + "advantage_mean": 4.967053879312289e-09, + "advantage_min": -0.9458809569478035, + "advantage_std": 0.999803252518177, + "completion_length": 1617.9583740234375, + "epoch": 0.2742857142857143, + "grad_norm": 0.2748890519142151, + "kl": 0.009830474853515625, + "lambda_div_used": 0.6, + "learning_rate": 6.588648530198504e-07, + "loss": 0.0004, + "reward": -0.13307038694620132, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.13307038694620132, + "reward_after_std": 0.5256015434861183, + "reward_before_mean": 0.23460055608302355, + "reward_before_std": 0.5078816749155521, + "reward_change_max": 0.0, + "reward_change_mean": -0.367670938372612, + "reward_change_min": -0.6695005521178246, + "reward_change_std": 0.2524768877774477, + "reward_std": 0.5256015658378601, + "rewards/cosine_scaled_reward": -0.27853306010365486, + "rewards/format_reward": 0.7916666772216558, + "step": 240 + }, + { + "advantage_max": 1.88160939514637, + "advantage_mean": -1.1102230246251565e-16, + "advantage_min": -0.8645576983690262, + "advantage_std": 0.9998082891106606, + "completion_length": 1639.1667404174805, + "epoch": 0.2754285714285714, + "grad_norm": 0.26753294467926025, + "kl": 0.008953094482421875, + "lambda_div_used": 0.6, + "learning_rate": 6.558139508961654e-07, + "loss": 0.0004, + "reward": -0.031316899927333, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.031316899927333, + "reward_after_std": 0.5845673941075802, + "reward_before_mean": 0.37243654020130634, + "reward_before_std": 0.4885371960699558, + "reward_change_max": 0.0, + "reward_change_mean": -0.40375345945358276, + "reward_change_min": -0.650720402598381, + "reward_change_std": 0.24138164147734642, + "reward_std": 0.5845674015581608, + "rewards/cosine_scaled_reward": -0.25128174014389515, + "rewards/format_reward": 0.8750000111758709, + "step": 241 + }, + { + "advantage_max": 1.879174917936325, + "advantage_mean": -2.4835267176115394e-09, + "advantage_min": -0.8415133915841579, + "advantage_std": 0.9997778609395027, + "completion_length": 1141.4583549499512, + "epoch": 0.2765714285714286, + "grad_norm": 0.32908982038497925, + "kl": 0.011310577392578125, + "lambda_div_used": 0.6, + "learning_rate": 6.527578915497951e-07, + "loss": 0.0005, + "reward": 0.08597287524025887, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.08597287524025887, + "reward_after_std": 0.5845495834946632, + "reward_before_mean": 0.554448792245239, + "reward_before_std": 0.4750943690305576, + "reward_change_max": 0.0019485801458358765, + "reward_change_mean": -0.46847590431571007, + "reward_change_min": -0.7834129631519318, + "reward_change_std": 0.2873268108814955, + "reward_std": 0.5845495834946632, + "rewards/cosine_scaled_reward": -0.17069227993488312, + "rewards/format_reward": 0.8958333507180214, + "step": 242 + }, + { + "advantage_max": 1.9060489684343338, + "advantage_mean": -2.9491878938969762e-09, + "advantage_min": -0.8143318668007851, + "advantage_std": 0.9998864382505417, + "completion_length": 1576.0208587646484, + "epoch": 0.2777142857142857, + "grad_norm": 0.21973414719104767, + "kl": 0.0056400299072265625, + "lambda_div_used": 0.6, + "learning_rate": 6.496968239287603e-07, + "loss": 0.0002, + "reward": 0.404103375505656, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.404103375505656, + "reward_after_std": 0.9556259214878082, + "reward_before_mean": 0.9712791014462709, + "reward_before_std": 0.8410424143075943, + "reward_change_max": 0.00015924125909805298, + "reward_change_mean": -0.5671757366508245, + "reward_change_min": -0.9959732592105865, + "reward_change_std": 0.3573100995272398, + "reward_std": 0.9556259475648403, + "rewards/cosine_scaled_reward": 0.05855620512738824, + "rewards/format_reward": 0.8541666772216558, + "step": 243 + }, + { + "advantage_max": 1.839727371931076, + "advantage_mean": -1.7229468518564772e-08, + "advantage_min": -0.9795639514923096, + "advantage_std": 0.9998864531517029, + "completion_length": 1683.9375305175781, + "epoch": 0.27885714285714286, + "grad_norm": 0.2994227111339569, + "kl": 0.007843017578125, + "lambda_div_used": 0.6, + "learning_rate": 6.466308972251785e-07, + "loss": 0.0003, + "reward": 0.41605983674526215, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.41605983674526215, + "reward_after_std": 0.9633042551577091, + "reward_before_mean": 0.9971684440970421, + "reward_before_std": 0.9178141728043556, + "reward_change_max": 0.0008343681693077087, + "reward_change_mean": -0.5811086297035217, + "reward_change_min": -0.9604966826736927, + "reward_change_std": 0.3883028831332922, + "reward_std": 0.9633043184876442, + "rewards/cosine_scaled_reward": 0.09233421226963401, + "rewards/format_reward": 0.8125000055879354, + "step": 244 + }, + { + "advantage_max": 1.8073162287473679, + "advantage_mean": -6.208817238118058e-09, + "advantage_min": -0.9385262057185173, + "advantage_std": 0.9998811557888985, + "completion_length": 1858.3125610351562, + "epoch": 0.28, + "grad_norm": 0.23238350450992584, + "kl": 0.007617950439453125, + "lambda_div_used": 0.6, + "learning_rate": 6.435602608679916e-07, + "loss": 0.0003, + "reward": 0.29909435706213117, + "reward_advantage_correlation": 0.9999999999999997, + "reward_after_mean": 0.29909435706213117, + "reward_after_std": 0.9968580417335033, + "reward_before_mean": 0.8094470165669918, + "reward_before_std": 0.9918197207152843, + "reward_change_max": 0.0008577778935432434, + "reward_change_mean": -0.5103526413440704, + "reward_change_min": -1.0967907197773457, + "reward_change_std": 0.4003028068691492, + "reward_std": 0.996858075261116, + "rewards/cosine_scaled_reward": 0.019306830130517483, + "rewards/format_reward": 0.7708333469927311, + "step": 245 + }, + { + "advantage_max": 1.8786978721618652, + "advantage_mean": -1.3348957161873898e-08, + "advantage_min": -0.8943793699145317, + "advantage_std": 0.9998432323336601, + "completion_length": 1381.7500457763672, + "epoch": 0.28114285714285714, + "grad_norm": 0.23658344149589539, + "kl": 0.008426666259765625, + "lambda_div_used": 0.6, + "learning_rate": 6.404850645156841e-07, + "loss": 0.0003, + "reward": 0.27080630185082555, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.27080630185082555, + "reward_after_std": 0.7295896746218204, + "reward_before_mean": 0.814372431486845, + "reward_before_std": 0.6347568519413471, + "reward_change_max": 0.0006069615483283997, + "reward_change_mean": -0.5435661226511002, + "reward_change_min": -0.8393444195389748, + "reward_change_std": 0.32673146948218346, + "reward_std": 0.7295896857976913, + "rewards/cosine_scaled_reward": -0.040730470209382474, + "rewards/format_reward": 0.895833333954215, + "step": 246 + }, + { + "advantage_max": 1.8464445322752, + "advantage_mean": 1.8626452047421083e-09, + "advantage_min": -0.9952505975961685, + "advantage_std": 0.999820850789547, + "completion_length": 1874.3750534057617, + "epoch": 0.2822857142857143, + "grad_norm": 0.2620001435279846, + "kl": 0.0073604583740234375, + "lambda_div_used": 0.6, + "learning_rate": 6.374054580489873e-07, + "loss": 0.0003, + "reward": 0.04168115835636854, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.04168115835636854, + "reward_after_std": 0.6140828654170036, + "reward_before_mean": 0.48145512863993645, + "reward_before_std": 0.5548105500638485, + "reward_change_max": 0.0, + "reward_change_mean": -0.4397739823907614, + "reward_change_min": -0.7346926927566528, + "reward_change_std": 0.2879637535661459, + "reward_std": 0.6140829026699066, + "rewards/cosine_scaled_reward": -0.15510577894747257, + "rewards/format_reward": 0.7916666828095913, + "step": 247 + }, + { + "advantage_max": 1.928673580288887, + "advantage_mean": -3.663202191583892e-08, + "advantage_min": -0.7065148465335369, + "advantage_std": 0.9998361095786095, + "completion_length": 1277.9375267028809, + "epoch": 0.2834285714285714, + "grad_norm": 0.27247554063796997, + "kl": 0.0064296722412109375, + "lambda_div_used": 0.6, + "learning_rate": 6.343215915635761e-07, + "loss": 0.0003, + "reward": 0.6549166552722454, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.6549166552722454, + "reward_after_std": 0.7043044194579124, + "reward_before_mean": 1.4125083833932877, + "reward_before_std": 0.4449807833880186, + "reward_change_max": 0.0, + "reward_change_mean": -0.7575917467474937, + "reward_change_min": -1.0937542356550694, + "reward_change_std": 0.4141359133645892, + "reward_std": 0.7043044343590736, + "rewards/cosine_scaled_reward": 0.27917084423825145, + "rewards/format_reward": 0.8541666716337204, + "step": 248 + }, + { + "advantage_max": 1.8969251960515976, + "advantage_mean": -2.5456151742098143e-08, + "advantage_min": -0.7646898031234741, + "advantage_std": 0.9998411610722542, + "completion_length": 1365.4792022705078, + "epoch": 0.2845714285714286, + "grad_norm": 0.22245629131793976, + "kl": 0.00811767578125, + "lambda_div_used": 0.6, + "learning_rate": 6.31233615362752e-07, + "loss": 0.0003, + "reward": 0.5752901515224949, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.5752901515224949, + "reward_after_std": 0.7512060441076756, + "reward_before_mean": 1.2835528128780425, + "reward_before_std": 0.5859524626284838, + "reward_change_max": 0.0, + "reward_change_mean": -0.7082626335322857, + "reward_change_min": -1.0432622693479061, + "reward_change_std": 0.403097290545702, + "reward_std": 0.7512060552835464, + "rewards/cosine_scaled_reward": 0.2146930517628789, + "rewards/format_reward": 0.8541666697710752, + "step": 249 + }, + { + "advantage_max": 1.878608837723732, + "advantage_mean": -4.967053768289986e-09, + "advantage_min": -0.8505470529198647, + "advantage_std": 0.9998346194624901, + "completion_length": 1293.0625534057617, + "epoch": 0.2857142857142857, + "grad_norm": 0.4508991539478302, + "kl": 0.009288787841796875, + "lambda_div_used": 0.6, + "learning_rate": 6.281416799501187e-07, + "loss": 0.0004, + "reward": 0.303747734404169, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.303747734404169, + "reward_after_std": 0.7406436167657375, + "reward_before_mean": 0.863355714827776, + "reward_before_std": 0.628277663141489, + "reward_change_max": 0.0006925761699676514, + "reward_change_mean": -0.5596079789102077, + "reward_change_min": -0.8469617292284966, + "reward_change_std": 0.3360453639179468, + "reward_std": 0.7406436540186405, + "rewards/cosine_scaled_reward": -0.047488822834566236, + "rewards/format_reward": 0.9583333432674408, + "step": 250 + }, + { + "advantage_max": 1.872517004609108, + "advantage_mean": 7.450580818968433e-09, + "advantage_min": -0.9513845965266228, + "advantage_std": 0.9998447820544243, + "completion_length": 1014.9792098999023, + "epoch": 0.28685714285714287, + "grad_norm": 0.3035866916179657, + "kl": 0.011281967163085938, + "lambda_div_used": 0.6, + "learning_rate": 6.25045936022246e-07, + "loss": 0.0005, + "reward": 0.3110234094783664, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.3110234094783664, + "reward_after_std": 0.7761142998933792, + "reward_before_mean": 0.8676313664764166, + "reward_before_std": 0.6817162893712521, + "reward_change_max": 0.0, + "reward_change_mean": -0.5566079169511795, + "reward_change_min": -0.8789765313267708, + "reward_change_std": 0.3465210497379303, + "reward_std": 0.7761143408715725, + "rewards/cosine_scaled_reward": -0.024517669342458248, + "rewards/format_reward": 0.916666679084301, + "step": 251 + }, + { + "advantage_max": 1.8302258551120758, + "advantage_mean": 1.4280280291600889e-08, + "advantage_min": -0.8962804041802883, + "advantage_std": 0.9998032450675964, + "completion_length": 1662.7291984558105, + "epoch": 0.288, + "grad_norm": 0.2449835240840912, + "kl": 0.009859085083007812, + "lambda_div_used": 0.6, + "learning_rate": 6.219465344613258e-07, + "loss": 0.0004, + "reward": 0.0912809963338077, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.0912809963338077, + "reward_after_std": 0.5700275152921677, + "reward_before_mean": 0.5716589528601617, + "reward_before_std": 0.47574319737032056, + "reward_change_max": 0.0004999637603759766, + "reward_change_mean": -0.4803779609501362, + "reward_change_min": -0.7776023708283901, + "reward_change_std": 0.29937842302024364, + "reward_std": 0.5700275264680386, + "rewards/cosine_scaled_reward": -0.11000386101659387, + "rewards/format_reward": 0.7916666716337204, + "step": 252 + }, + { + "advantage_max": 1.9588626027107239, + "advantage_mean": 2.980232283178452e-08, + "advantage_min": -0.6859035342931747, + "advantage_std": 0.9997961297631264, + "completion_length": 1476.979206085205, + "epoch": 0.28914285714285715, + "grad_norm": 0.2885993421077728, + "kl": 0.00954437255859375, + "lambda_div_used": 0.6, + "learning_rate": 6.188436263278172e-07, + "loss": 0.0004, + "reward": 0.023032560478895903, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.023032560478895903, + "reward_after_std": 0.6161117516458035, + "reward_before_mean": 0.44603691063821316, + "reward_before_std": 0.4674696382135153, + "reward_change_max": 0.0, + "reward_change_mean": -0.423004312440753, + "reward_change_min": -0.6526531614363194, + "reward_change_std": 0.236457671970129, + "reward_std": 0.6161117758601904, + "rewards/cosine_scaled_reward": -0.18323156610131264, + "rewards/format_reward": 0.8125000074505806, + "step": 253 + }, + { + "advantage_max": 1.9421354234218597, + "advantage_mean": 7.450580818968433e-09, + "advantage_min": -0.7857317849993706, + "advantage_std": 0.9998878464102745, + "completion_length": 1576.2083740234375, + "epoch": 0.29028571428571426, + "grad_norm": 0.28780651092529297, + "kl": 0.00881195068359375, + "lambda_div_used": 0.6, + "learning_rate": 6.157373628530852e-07, + "loss": 0.0004, + "reward": 0.32410276448354125, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.32410276448354125, + "reward_after_std": 1.066658116877079, + "reward_before_mean": 0.8170572388917208, + "reward_before_std": 0.9517768397927284, + "reward_change_max": 0.00031269341707229614, + "reward_change_mean": -0.4929545000195503, + "reward_change_min": -0.8555713668465614, + "reward_change_std": 0.3223307225853205, + "reward_std": 1.0666581466794014, + "rewards/cosine_scaled_reward": -0.00813805649522692, + "rewards/format_reward": 0.8333333432674408, + "step": 254 + }, + { + "advantage_max": 1.8271007537841797, + "advantage_mean": 1.7229468074475562e-08, + "advantage_min": -0.8999714367091656, + "advantage_std": 0.9998132511973381, + "completion_length": 1964.8542022705078, + "epoch": 0.2914285714285714, + "grad_norm": 0.2708861231803894, + "kl": 0.012363433837890625, + "lambda_div_used": 0.6, + "learning_rate": 6.126278954320294e-07, + "loss": 0.0005, + "reward": -0.03212953475303948, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.03212953475303948, + "reward_after_std": 0.7982987090945244, + "reward_before_mean": 0.33184120431542397, + "reward_before_std": 0.7744866535067558, + "reward_change_max": 0.0005225986242294312, + "reward_change_mean": -0.3639707425609231, + "reward_change_min": -0.6983498111367226, + "reward_change_std": 0.28055456932634115, + "reward_std": 0.7982987351715565, + "rewards/cosine_scaled_reward": -0.19866274809464812, + "rewards/format_reward": 0.729166679084301, + "step": 255 + }, + { + "advantage_max": 1.8787826746702194, + "advantage_mean": -6.208817904251873e-10, + "advantage_min": -0.8882212191820145, + "advantage_std": 0.999819703400135, + "completion_length": 1394.7500381469727, + "epoch": 0.2925714285714286, + "grad_norm": 0.2527642548084259, + "kl": 0.007617950439453125, + "lambda_div_used": 0.6, + "learning_rate": 6.095153756157051e-07, + "loss": 0.0003, + "reward": 0.3247837144881487, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.3247837144881487, + "reward_after_std": 0.675804752856493, + "reward_before_mean": 0.9066541530191898, + "reward_before_std": 0.5500271543860435, + "reward_change_max": 0.0, + "reward_change_mean": -0.5818704478442669, + "reward_change_min": -0.9174501821398735, + "reward_change_std": 0.33812401071190834, + "reward_std": 0.675804790109396, + "rewards/cosine_scaled_reward": -0.005006253952160478, + "rewards/format_reward": 0.916666679084301, + "step": 256 + }, + { + "advantage_max": 1.8178119510412216, + "advantage_mean": -3.0423205787943886e-08, + "advantage_min": -0.8814026415348053, + "advantage_std": 0.9998690858483315, + "completion_length": 1931.4583892822266, + "epoch": 0.2937142857142857, + "grad_norm": 0.21259228885173798, + "kl": 0.008331298828125, + "lambda_div_used": 0.6, + "learning_rate": 6.06399955103937e-07, + "loss": 0.0003, + "reward": 0.4000783711671829, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.4000783711671829, + "reward_after_std": 0.8711743615567684, + "reward_before_mean": 0.9961195886135101, + "reward_before_std": 0.8301573395729065, + "reward_change_max": 0.0, + "reward_change_mean": -0.5960412230342627, + "reward_change_min": -1.0773316584527493, + "reward_change_std": 0.4147396646440029, + "reward_std": 0.8711744025349617, + "rewards/cosine_scaled_reward": 0.11264310358092189, + "rewards/format_reward": 0.7708333488553762, + "step": 257 + }, + { + "advantage_max": 1.865677148103714, + "advantage_mean": -9.313225912688239e-09, + "advantage_min": -0.8630052953958511, + "advantage_std": 0.9998342022299767, + "completion_length": 1912.4167098999023, + "epoch": 0.2948571428571429, + "grad_norm": 0.23722055554389954, + "kl": 0.00826263427734375, + "lambda_div_used": 0.6, + "learning_rate": 6.032817857379256e-07, + "loss": 0.0003, + "reward": 0.11609090398997068, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.11609090398997068, + "reward_after_std": 0.7722814045846462, + "reward_before_mean": 0.5685428031720221, + "reward_before_std": 0.7202793788164854, + "reward_change_max": 0.001006096601486206, + "reward_change_mean": -0.45245190896093845, + "reward_change_min": -0.8036795221269131, + "reward_change_std": 0.31898451782763004, + "reward_std": 0.7722814530134201, + "rewards/cosine_scaled_reward": -0.10114526422694325, + "rewards/format_reward": 0.7708333414047956, + "step": 258 + }, + { + "advantage_max": 1.9082180708646774, + "advantage_mean": -2.4214387606136256e-08, + "advantage_min": -0.8204468339681625, + "advantage_std": 0.9998528063297272, + "completion_length": 1272.5833625793457, + "epoch": 0.296, + "grad_norm": 0.28688231110572815, + "kl": 0.009775161743164062, + "lambda_div_used": 0.6, + "learning_rate": 6.001610194928464e-07, + "loss": 0.0004, + "reward": 0.4570096703246236, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.4570096703246236, + "reward_after_std": 0.7713707610964775, + "reward_before_mean": 1.09649408608675, + "reward_before_std": 0.6217096205800772, + "reward_change_max": 0.0, + "reward_change_mean": -0.6394844427704811, + "reward_change_min": -0.9701583310961723, + "reward_change_std": 0.374478030949831, + "reward_std": 0.771370779722929, + "rewards/cosine_scaled_reward": 0.07949701510369778, + "rewards/format_reward": 0.9375000074505806, + "step": 259 + }, + { + "advantage_max": 1.828286275267601, + "advantage_mean": -1.9247333282734758e-08, + "advantage_min": -0.9948167651891708, + "advantage_std": 0.9998681172728539, + "completion_length": 1070.5000267028809, + "epoch": 0.29714285714285715, + "grad_norm": 0.3003004193305969, + "kl": 0.007595062255859375, + "lambda_div_used": 0.6, + "learning_rate": 5.97037808470444e-07, + "loss": 0.0003, + "reward": 0.4869587696157396, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.4869587696157396, + "reward_after_std": 0.8302631750702858, + "reward_before_mean": 1.1328422725200653, + "reward_before_std": 0.7522343434393406, + "reward_change_max": 0.0, + "reward_change_mean": -0.6458835043013096, + "reward_change_min": -1.0198750123381615, + "reward_change_std": 0.39399000257253647, + "reward_std": 0.8302631825208664, + "rewards/cosine_scaled_reward": 0.09767113672569394, + "rewards/format_reward": 0.9375, + "step": 260 + }, + { + "advantage_max": 1.8792397528886795, + "advantage_mean": 1.0554989549049765e-08, + "advantage_min": -0.8463708981871605, + "advantage_std": 0.9997694864869118, + "completion_length": 2166.270851135254, + "epoch": 0.29828571428571427, + "grad_norm": 0.20295003056526184, + "kl": 0.007968902587890625, + "lambda_div_used": 0.6, + "learning_rate": 5.939123048916173e-07, + "loss": 0.0003, + "reward": 0.02313760167453438, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.02313760167453438, + "reward_after_std": 0.5842309631407261, + "reward_before_mean": 0.46202369034290314, + "reward_before_std": 0.5016245627775788, + "reward_change_max": 0.0, + "reward_change_mean": -0.43888608552515507, + "reward_change_min": -0.7771935053169727, + "reward_change_std": 0.27092802058905363, + "reward_std": 0.5842309780418873, + "rewards/cosine_scaled_reward": -0.09190484462305903, + "rewards/format_reward": 0.645833333954215, + "step": 261 + }, + { + "advantage_max": 1.828007161617279, + "advantage_mean": -1.955777406692505e-08, + "advantage_min": -0.8678448721766472, + "advantage_std": 0.9998091235756874, + "completion_length": 1643.0625305175781, + "epoch": 0.29942857142857143, + "grad_norm": 0.28205162286758423, + "kl": 0.0093994140625, + "lambda_div_used": 0.6, + "learning_rate": 5.907846610890011e-07, + "loss": 0.0004, + "reward": -0.08022703300230205, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.08022703300230205, + "reward_after_std": 0.5777127407491207, + "reward_before_mean": 0.2998348996043205, + "reward_before_std": 0.5390517674386501, + "reward_change_max": 0.0, + "reward_change_mean": -0.3800619672983885, + "reward_change_min": -0.6844152100384235, + "reward_change_std": 0.2551191672682762, + "reward_std": 0.577712744474411, + "rewards/cosine_scaled_reward": -0.2250825520604849, + "rewards/format_reward": 0.7500000093132257, + "step": 262 + }, + { + "advantage_max": 1.9083099365234375, + "advantage_mean": -1.986821573929376e-08, + "advantage_min": -0.8035422787070274, + "advantage_std": 0.9997920989990234, + "completion_length": 1199.625015258789, + "epoch": 0.30057142857142854, + "grad_norm": 0.24454443156719208, + "kl": 0.006107330322265625, + "lambda_div_used": 0.6, + "learning_rate": 5.87655029499542e-07, + "loss": 0.0002, + "reward": 0.10955283138900995, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.10955283138900995, + "reward_after_std": 0.5718753635883331, + "reward_before_mean": 0.5939783789217472, + "reward_before_std": 0.45454465225338936, + "reward_change_max": 0.0, + "reward_change_mean": -0.4844255559146404, + "reward_change_min": -0.7876281961798668, + "reward_change_std": 0.28957670740783215, + "reward_std": 0.5718753710389137, + "rewards/cosine_scaled_reward": -0.17176082776859403, + "rewards/format_reward": 0.9375, + "step": 263 + }, + { + "advantage_max": 1.8229975551366806, + "advantage_mean": -8.84756468089165e-09, + "advantage_min": -0.9802969917654991, + "advantage_std": 0.999835766851902, + "completion_length": 1350.2916946411133, + "epoch": 0.3017142857142857, + "grad_norm": 0.23903900384902954, + "kl": 0.006744384765625, + "lambda_div_used": 0.6, + "learning_rate": 5.845235626570683e-07, + "loss": 0.0003, + "reward": 0.22060711891390383, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.22060711891390383, + "reward_after_std": 0.6753030680119991, + "reward_before_mean": 0.7518087485805154, + "reward_before_std": 0.6103526018559933, + "reward_change_max": 0.0, + "reward_change_mean": -0.5312016159296036, + "reward_change_min": -0.9094065576791763, + "reward_change_std": 0.33360027708113194, + "reward_std": 0.6753031127154827, + "rewards/cosine_scaled_reward": -0.07201230898499489, + "rewards/format_reward": 0.8958333395421505, + "step": 264 + }, + { + "advantage_max": 1.8557786792516708, + "advantage_mean": -1.7384688466570708e-08, + "advantage_min": -0.8617202043533325, + "advantage_std": 0.9998458698391914, + "completion_length": 1349.3750305175781, + "epoch": 0.3028571428571429, + "grad_norm": 0.259480744600296, + "kl": 0.00733184814453125, + "lambda_div_used": 0.6, + "learning_rate": 5.813904131848564e-07, + "loss": 0.0003, + "reward": 0.5593267162330449, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.5593267162330449, + "reward_after_std": 0.7222743555903435, + "reward_before_mean": 1.2658403292298317, + "reward_before_std": 0.5619801264256239, + "reward_change_max": 0.0, + "reward_change_mean": -0.7065136171877384, + "reward_change_min": -1.0326689183712006, + "reward_change_std": 0.401592755690217, + "reward_std": 0.7222743816673756, + "rewards/cosine_scaled_reward": 0.1537534836679697, + "rewards/format_reward": 0.9583333358168602, + "step": 265 + }, + { + "advantage_max": 1.775598168373108, + "advantage_mean": -1.1796753240922442e-08, + "advantage_min": -1.0651524811983109, + "advantage_std": 0.9998323395848274, + "completion_length": 1781.8333740234375, + "epoch": 0.304, + "grad_norm": 0.2434740513563156, + "kl": 0.00922393798828125, + "lambda_div_used": 0.6, + "learning_rate": 5.78255733788191e-07, + "loss": 0.0004, + "reward": 0.1264530853368342, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.1264530853368342, + "reward_after_std": 0.7009001858532429, + "reward_before_mean": 0.6023802892304957, + "reward_before_std": 0.6759253405034542, + "reward_change_max": 9.702146053314209e-05, + "reward_change_mean": -0.4759272299706936, + "reward_change_min": -0.7876431345939636, + "reward_change_std": 0.3144048321992159, + "reward_std": 0.7009001895785332, + "rewards/cosine_scaled_reward": -0.10505986865609884, + "rewards/format_reward": 0.812500013038516, + "step": 266 + }, + { + "advantage_max": 1.9578577429056168, + "advantage_mean": 2.3593505260599557e-08, + "advantage_min": -0.7362656965851784, + "advantage_std": 0.99981639534235, + "completion_length": 1887.8541946411133, + "epoch": 0.30514285714285716, + "grad_norm": 0.2461211085319519, + "kl": 0.010284423828125, + "lambda_div_used": 0.6, + "learning_rate": 5.751196772469237e-07, + "loss": 0.0004, + "reward": -0.1160497977398336, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.1160497977398336, + "reward_after_std": 0.6473545543849468, + "reward_before_mean": 0.2215914549306035, + "reward_before_std": 0.5311122592538595, + "reward_change_max": 0.0, + "reward_change_mean": -0.33764123916625977, + "reward_change_min": -0.5379331484436989, + "reward_change_std": 0.19541364442557096, + "reward_std": 0.6473545767366886, + "rewards/cosine_scaled_reward": -0.23295428697019815, + "rewards/format_reward": 0.6875000055879354, + "step": 267 + }, + { + "advantage_max": 1.9218407273292542, + "advantage_mean": -2.793967879277659e-08, + "advantage_min": -0.8416651785373688, + "advantage_std": 0.9998409524559975, + "completion_length": 1430.9791870117188, + "epoch": 0.3062857142857143, + "grad_norm": 0.2900254428386688, + "kl": 0.01381683349609375, + "lambda_div_used": 0.6, + "learning_rate": 5.71982396408026e-07, + "loss": 0.0006, + "reward": 0.20641274470835924, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.20641274470835924, + "reward_after_std": 0.7826429642736912, + "reward_before_mean": 0.6979242414236069, + "reward_before_std": 0.6802052427083254, + "reward_change_max": 0.0005292147397994995, + "reward_change_mean": -0.4915114976465702, + "reward_change_min": -0.8061510771512985, + "reward_change_std": 0.30862017907202244, + "reward_std": 0.7826430015265942, + "rewards/cosine_scaled_reward": -0.06770455720834434, + "rewards/format_reward": 0.8333333395421505, + "step": 268 + }, + { + "advantage_max": 1.86625137925148, + "advantage_mean": -6.829699250587851e-09, + "advantage_min": -0.8370335847139359, + "advantage_std": 0.9998442083597183, + "completion_length": 1545.520866394043, + "epoch": 0.30742857142857144, + "grad_norm": 0.22027085721492767, + "kl": 0.008152008056640625, + "lambda_div_used": 0.6, + "learning_rate": 5.688440441781398e-07, + "loss": 0.0003, + "reward": 0.21930112247355282, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.21930112247355282, + "reward_after_std": 0.7824936024844646, + "reward_before_mean": 0.7232769569382071, + "reward_before_std": 0.67463708226569, + "reward_change_max": 0.00043030083179473877, + "reward_change_mean": -0.5039758523926139, + "reward_change_min": -0.8153284899890423, + "reward_change_std": 0.31488157622516155, + "reward_std": 0.7824936211109161, + "rewards/cosine_scaled_reward": -0.044611528515815735, + "rewards/format_reward": 0.8125000074505806, + "step": 269 + }, + { + "advantage_max": 1.9493401795625687, + "advantage_mean": -1.862645193639878e-08, + "advantage_min": -0.6977572962641716, + "advantage_std": 0.9998801946640015, + "completion_length": 1448.583381652832, + "epoch": 0.30857142857142855, + "grad_norm": 0.20074597001075745, + "kl": 0.0062274932861328125, + "lambda_div_used": 0.6, + "learning_rate": 5.657047735161255e-07, + "loss": 0.0002, + "reward": 0.5448911990970373, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.5448911990970373, + "reward_after_std": 0.9793307185173035, + "reward_before_mean": 1.1831915229558945, + "reward_before_std": 0.7954367697238922, + "reward_change_max": 0.0, + "reward_change_mean": -0.6383003033697605, + "reward_change_min": -1.0107336267828941, + "reward_change_std": 0.3763220179826021, + "reward_std": 0.9793307483196259, + "rewards/cosine_scaled_reward": 0.11242906516417861, + "rewards/format_reward": 0.9583333358168602, + "step": 270 + }, + { + "advantage_max": 1.9000986367464066, + "advantage_mean": -1.1098261587516589e-08, + "advantage_min": -0.8887056112289429, + "advantage_std": 0.9998585283756256, + "completion_length": 1232.270851135254, + "epoch": 0.3097142857142857, + "grad_norm": 0.2368963658809662, + "kl": 0.008569717407226562, + "lambda_div_used": 0.6, + "learning_rate": 5.625647374256061e-07, + "loss": 0.0003, + "reward": 0.6372459325939417, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.6372459325939417, + "reward_after_std": 0.8309118635952473, + "reward_before_mean": 1.3628186136484146, + "reward_before_std": 0.6561471298336983, + "reward_change_max": 0.0003469809889793396, + "reward_change_mean": -0.725572694092989, + "reward_change_min": -1.076353020966053, + "reward_change_std": 0.4148294348269701, + "reward_std": 0.8309118933975697, + "rewards/cosine_scaled_reward": 0.23349263006821275, + "rewards/format_reward": 0.8958333395421505, + "step": 271 + }, + { + "advantage_max": 1.8306615948677063, + "advantage_mean": 1.3659398168108794e-08, + "advantage_min": -0.8924771621823311, + "advantage_std": 0.9998399615287781, + "completion_length": 1688.9583740234375, + "epoch": 0.31085714285714283, + "grad_norm": 0.22661247849464417, + "kl": 0.00933837890625, + "lambda_div_used": 0.6, + "learning_rate": 5.594240889475106e-07, + "loss": 0.0004, + "reward": 0.17267357744276524, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.17267357744276524, + "reward_after_std": 0.7733958214521408, + "reward_before_mean": 0.6572124548256397, + "reward_before_std": 0.7261578030884266, + "reward_change_max": 0.00031144171953201294, + "reward_change_mean": -0.48453884944319725, + "reward_change_min": -0.8327630683779716, + "reward_change_std": 0.3227216098457575, + "reward_std": 0.7733958400785923, + "rewards/cosine_scaled_reward": -0.08806046470999718, + "rewards/format_reward": 0.8333333358168602, + "step": 272 + }, + { + "advantage_max": 1.8778915852308273, + "advantage_mean": -1.614292477469803e-08, + "advantage_min": -0.9554785639047623, + "advantage_std": 0.9998368248343468, + "completion_length": 1353.0000305175781, + "epoch": 0.312, + "grad_norm": 0.24040770530700684, + "kl": 0.008619308471679688, + "lambda_div_used": 0.6, + "learning_rate": 5.562829811526154e-07, + "loss": 0.0003, + "reward": 0.39927546092076227, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.39927546092076227, + "reward_after_std": 0.736445277929306, + "reward_before_mean": 1.0131700951606035, + "reward_before_std": 0.6133726499974728, + "reward_change_max": 0.0, + "reward_change_mean": -0.613894646987319, + "reward_change_min": -0.9457522034645081, + "reward_change_std": 0.35532393865287304, + "reward_std": 0.7364452891051769, + "rewards/cosine_scaled_reward": 0.07950170524418354, + "rewards/format_reward": 0.8541666772216558, + "step": 273 + }, + { + "advantage_max": 1.882187381386757, + "advantage_mean": -4.967053879312289e-09, + "advantage_min": -0.848646879196167, + "advantage_std": 0.9998696744441986, + "completion_length": 888.4791793823242, + "epoch": 0.31314285714285717, + "grad_norm": 0.26737749576568604, + "kl": 0.007991790771484375, + "lambda_div_used": 0.6, + "learning_rate": 5.531415671340826e-07, + "loss": 0.0003, + "reward": 0.5258622104302049, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.5258622104302049, + "reward_after_std": 0.9269070662558079, + "reward_before_mean": 1.1696142181754112, + "reward_before_std": 0.8161358386278152, + "reward_change_max": 0.0, + "reward_change_mean": -0.643751971423626, + "reward_change_min": -1.0380645543336868, + "reward_change_std": 0.38927505910396576, + "reward_std": 0.926907118409872, + "rewards/cosine_scaled_reward": 0.09522374533116817, + "rewards/format_reward": 0.9791666716337204, + "step": 274 + }, + { + "advantage_max": 1.877179205417633, + "advantage_mean": -3.7252901874396116e-09, + "advantage_min": -0.870720274746418, + "advantage_std": 0.9998558908700943, + "completion_length": 1558.8958740234375, + "epoch": 0.3142857142857143, + "grad_norm": 0.21030379831790924, + "kl": 0.00858306884765625, + "lambda_div_used": 0.6, + "learning_rate": 5.5e-07, + "loss": 0.0003, + "reward": 0.5669933171011508, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.5669933171011508, + "reward_after_std": 0.8992337211966515, + "reward_before_mean": 1.2377357706427574, + "reward_before_std": 0.7513070069253445, + "reward_change_max": 0.0002499818801879883, + "reward_change_mean": -0.6707424111664295, + "reward_change_min": -1.0658031031489372, + "reward_change_std": 0.40347418934106827, + "reward_std": 0.8992337286472321, + "rewards/cosine_scaled_reward": 0.23345119040459394, + "rewards/format_reward": 0.7708333395421505, + "step": 275 + }, + { + "advantage_max": 1.909448117017746, + "advantage_mean": -2.1109978876054925e-08, + "advantage_min": -0.7440906390547752, + "advantage_std": 0.9998734965920448, + "completion_length": 1220.9583740234375, + "epoch": 0.31542857142857145, + "grad_norm": 0.2865343689918518, + "kl": 0.0120086669921875, + "lambda_div_used": 0.6, + "learning_rate": 5.468584328659172e-07, + "loss": 0.0005, + "reward": 0.4875176604837179, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.4875176604837179, + "reward_after_std": 0.8809028156101704, + "reward_before_mean": 1.1188461929559708, + "reward_before_std": 0.7294301693327725, + "reward_change_max": 0.0, + "reward_change_mean": -0.6313285455107689, + "reward_change_min": -1.0723997987806797, + "reward_change_std": 0.39141695387661457, + "reward_std": 0.8809028305113316, + "rewards/cosine_scaled_reward": 0.11150642792927101, + "rewards/format_reward": 0.8958333395421505, + "step": 276 + }, + { + "advantage_max": 1.8797271996736526, + "advantage_mean": 1.5832484323574647e-08, + "advantage_min": -0.8954315483570099, + "advantage_std": 0.999833457171917, + "completion_length": 1340.333351135254, + "epoch": 0.31657142857142856, + "grad_norm": 0.2681465148925781, + "kl": 0.009929656982421875, + "lambda_div_used": 0.6, + "learning_rate": 5.437170188473847e-07, + "loss": 0.0004, + "reward": 0.42410989105701447, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.42410989105701447, + "reward_after_std": 0.7413080781698227, + "reward_before_mean": 1.0489613488316536, + "reward_before_std": 0.6017416436225176, + "reward_change_max": 0.0, + "reward_change_mean": -0.6248514233157039, + "reward_change_min": -0.9577978886663914, + "reward_change_std": 0.3677874878048897, + "reward_std": 0.741308081895113, + "rewards/cosine_scaled_reward": 0.09739732113666832, + "rewards/format_reward": 0.8541666716337204, + "step": 277 + }, + { + "advantage_max": 1.9071197807788849, + "advantage_mean": 1.7384688799637615e-08, + "advantage_min": -0.8232261501252651, + "advantage_std": 0.9998288676142693, + "completion_length": 1538.6666870117188, + "epoch": 0.3177142857142857, + "grad_norm": 0.2890927791595459, + "kl": 0.007373809814453125, + "lambda_div_used": 0.6, + "learning_rate": 5.405759110524894e-07, + "loss": 0.0003, + "reward": 0.3901700456626713, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.3901700456626713, + "reward_after_std": 0.6501722633838654, + "reward_before_mean": 1.0097860351670533, + "reward_before_std": 0.47574078012257814, + "reward_change_max": 0.0, + "reward_change_mean": -0.6196159720420837, + "reward_change_min": -0.9255659654736519, + "reward_change_std": 0.35610699094831944, + "reward_std": 0.6501722633838654, + "rewards/cosine_scaled_reward": 0.07780967373400927, + "rewards/format_reward": 0.8541666679084301, + "step": 278 + }, + { + "advantage_max": 1.943043828010559, + "advantage_mean": -9.934107647602275e-09, + "advantage_min": -0.7475237399339676, + "advantage_std": 0.9998511970043182, + "completion_length": 1437.1666870117188, + "epoch": 0.31885714285714284, + "grad_norm": 0.21998536586761475, + "kl": 0.009319305419921875, + "lambda_div_used": 0.6, + "learning_rate": 5.37435262574394e-07, + "loss": 0.0004, + "reward": 0.2873132990207523, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.2873132990207523, + "reward_after_std": 0.803663358092308, + "reward_before_mean": 0.8204641807824373, + "reward_before_std": 0.6527150310575962, + "reward_change_max": 0.0, + "reward_change_mean": -0.533150888979435, + "reward_change_min": -0.8645448237657547, + "reward_change_std": 0.3154606595635414, + "reward_std": 0.8036633767187595, + "rewards/cosine_scaled_reward": -0.037684588925912976, + "rewards/format_reward": 0.8958333432674408, + "step": 279 + }, + { + "advantage_max": 1.8220892250537872, + "advantage_mean": -3.042320539936583e-08, + "advantage_min": -0.9527215585112572, + "advantage_std": 0.999868743121624, + "completion_length": 1730.9792175292969, + "epoch": 0.32, + "grad_norm": 0.38605308532714844, + "kl": 0.01398468017578125, + "lambda_div_used": 0.6, + "learning_rate": 5.342952264838747e-07, + "loss": 0.0006, + "reward": 0.5508183864876628, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.5508183864876628, + "reward_after_std": 0.9038041196763515, + "reward_before_mean": 1.2189355352893472, + "reward_before_std": 0.8520298786461353, + "reward_change_max": 0.0009787678718566895, + "reward_change_mean": -0.6681171432137489, + "reward_change_min": -1.149309366941452, + "reward_change_std": 0.4476726073771715, + "reward_std": 0.903804138302803, + "rewards/cosine_scaled_reward": 0.1928010657429695, + "rewards/format_reward": 0.8333333395421505, + "step": 280 + }, + { + "advantage_max": 1.9217159450054169, + "advantage_mean": 1.9247333948868572e-08, + "advantage_min": -0.775476261973381, + "advantage_std": 0.9998063147068024, + "completion_length": 2434.8541870117188, + "epoch": 0.3211428571428571, + "grad_norm": 0.2845155894756317, + "kl": 0.012725830078125, + "lambda_div_used": 0.6, + "learning_rate": 5.311559558218603e-07, + "loss": 0.0005, + "reward": -0.05798890208825469, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.05798890208825469, + "reward_after_std": 0.7850078567862511, + "reward_before_mean": 0.28525743540376425, + "reward_before_std": 0.6987994946539402, + "reward_change_max": 0.0, + "reward_change_mean": -0.34324632212519646, + "reward_change_min": -0.635148648172617, + "reward_change_std": 0.23209808766841888, + "reward_std": 0.7850079126656055, + "rewards/cosine_scaled_reward": -0.149037959985435, + "rewards/format_reward": 0.5833333358168602, + "step": 281 + }, + { + "advantage_max": 1.8430883884429932, + "advantage_mean": 8.07146260939362e-09, + "advantage_min": -0.9225859716534615, + "advantage_std": 0.9998578503727913, + "completion_length": 1281.3333702087402, + "epoch": 0.3222857142857143, + "grad_norm": 0.26052939891815186, + "kl": 0.009288787841796875, + "lambda_div_used": 0.6, + "learning_rate": 5.28017603591974e-07, + "loss": 0.0004, + "reward": 0.29589394642971456, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.29589394642971456, + "reward_after_std": 0.7819314710795879, + "reward_before_mean": 0.8467987608164549, + "reward_before_std": 0.6957082394510508, + "reward_change_max": 0.0, + "reward_change_mean": -0.5509047862142324, + "reward_change_min": -0.938206635415554, + "reward_change_std": 0.3565885201096535, + "reward_std": 0.7819315008819103, + "rewards/cosine_scaled_reward": -0.014100641012191772, + "rewards/format_reward": 0.8750000149011612, + "step": 282 + }, + { + "advantage_max": 1.8591952323913574, + "advantage_mean": -1.2417634032146907e-08, + "advantage_min": -0.8575943261384964, + "advantage_std": 0.9998741522431374, + "completion_length": 2172.0625228881836, + "epoch": 0.32342857142857145, + "grad_norm": 0.18784987926483154, + "kl": 0.0093994140625, + "lambda_div_used": 0.6, + "learning_rate": 5.248803227530763e-07, + "loss": 0.0004, + "reward": 0.4316023401916027, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.4316023401916027, + "reward_after_std": 0.9680920131504536, + "reward_before_mean": 1.013820081949234, + "reward_before_std": 0.8663015514612198, + "reward_change_max": 0.00028090178966522217, + "reward_change_mean": -0.5822177156805992, + "reward_change_min": -1.0923729129135609, + "reward_change_std": 0.39628793857991695, + "reward_std": 0.9680920168757439, + "rewards/cosine_scaled_reward": 0.1319100260734558, + "rewards/format_reward": 0.7500000037252903, + "step": 283 + }, + { + "advantage_max": 1.9308781772851944, + "advantage_mean": 1.862645426786713e-09, + "advantage_min": -0.7617431655526161, + "advantage_std": 0.9998421370983124, + "completion_length": 1170.3542022705078, + "epoch": 0.32457142857142857, + "grad_norm": 0.26644590497016907, + "kl": 0.009359359741210938, + "lambda_div_used": 0.6, + "learning_rate": 5.21744266211809e-07, + "loss": 0.0004, + "reward": 0.2046554802218452, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.2046554802218452, + "reward_after_std": 0.8015350475907326, + "reward_before_mean": 0.688809160143137, + "reward_before_std": 0.6861891858279705, + "reward_change_max": 0.0, + "reward_change_mean": -0.4841536581516266, + "reward_change_min": -0.8655692115426064, + "reward_change_std": 0.30566604621708393, + "reward_std": 0.8015350624918938, + "rewards/cosine_scaled_reward": -0.12434543017297983, + "rewards/format_reward": 0.9375000074505806, + "step": 284 + }, + { + "advantage_max": 1.9677094668149948, + "advantage_mean": -2.110997954218874e-08, + "advantage_min": -0.7027824819087982, + "advantage_std": 0.9998104050755501, + "completion_length": 1104.5000381469727, + "epoch": 0.32571428571428573, + "grad_norm": 0.26687729358673096, + "kl": 0.009538650512695312, + "lambda_div_used": 0.6, + "learning_rate": 5.186095868151436e-07, + "loss": 0.0004, + "reward": 0.2683999980799854, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.2683999980799854, + "reward_after_std": 0.6404694318771362, + "reward_before_mean": 0.8243867550045252, + "reward_before_std": 0.4349752515554428, + "reward_change_max": 0.0, + "reward_change_mean": -0.5559867545962334, + "reward_change_min": -0.7854268550872803, + "reward_change_std": 0.300765099003911, + "reward_std": 0.6404694765806198, + "rewards/cosine_scaled_reward": -0.056556637631729245, + "rewards/format_reward": 0.9375000074505806, + "step": 285 + }, + { + "advantage_max": 1.8454220443964005, + "advantage_mean": 3.7252904094842165e-09, + "advantage_min": -1.0381435677409172, + "advantage_std": 0.99984922260046, + "completion_length": 1362.6667175292969, + "epoch": 0.32685714285714285, + "grad_norm": 0.26756566762924194, + "kl": 0.01204681396484375, + "lambda_div_used": 0.6, + "learning_rate": 5.154764373429315e-07, + "loss": 0.0005, + "reward": 0.24126607986545423, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.24126607986545423, + "reward_after_std": 0.777409877628088, + "reward_before_mean": 0.7604061793535948, + "reward_before_std": 0.7200019955635071, + "reward_change_max": 0.00024990737438201904, + "reward_change_mean": -0.5191400870680809, + "reward_change_min": -0.8817098364233971, + "reward_change_std": 0.3420996591448784, + "reward_std": 0.7774099223315716, + "rewards/cosine_scaled_reward": -0.06771359359845519, + "rewards/format_reward": 0.8958333507180214, + "step": 286 + }, + { + "advantage_max": 1.8230135142803192, + "advantage_mean": -4.4703486135055925e-08, + "advantage_min": -0.9058626368641853, + "advantage_std": 0.9998276680707932, + "completion_length": 1362.9583549499512, + "epoch": 0.328, + "grad_norm": 0.30255770683288574, + "kl": 0.01079559326171875, + "lambda_div_used": 0.6, + "learning_rate": 5.123449705004581e-07, + "loss": 0.0004, + "reward": 0.28222215245477855, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.28222215245477855, + "reward_after_std": 0.6900255009531975, + "reward_before_mean": 0.8456722013652325, + "reward_before_std": 0.6139935115352273, + "reward_change_max": 0.00031256675720214844, + "reward_change_mean": -0.5634500700980425, + "reward_change_min": -0.9192318581044674, + "reward_change_std": 0.36021300964057446, + "reward_std": 0.6900255233049393, + "rewards/cosine_scaled_reward": 0.027002752758562565, + "rewards/format_reward": 0.7916666734963655, + "step": 287 + }, + { + "advantage_max": 1.8479500263929367, + "advantage_mean": 1.490116185998147e-08, + "advantage_min": -0.8085650354623795, + "advantage_std": 0.9998614192008972, + "completion_length": 1352.2500381469727, + "epoch": 0.3291428571428571, + "grad_norm": 0.21222001314163208, + "kl": 0.009647369384765625, + "lambda_div_used": 0.6, + "learning_rate": 5.09215338910999e-07, + "loss": 0.0004, + "reward": 0.38232562225311995, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.38232562225311995, + "reward_after_std": 0.8447245806455612, + "reward_before_mean": 0.9674149639904499, + "reward_before_std": 0.7541538216173649, + "reward_change_max": 0.0004512891173362732, + "reward_change_mean": -0.5850893221795559, + "reward_change_min": -1.0319984778761864, + "reward_change_std": 0.38284142315387726, + "reward_std": 0.8447245843708515, + "rewards/cosine_scaled_reward": 0.014957469655200839, + "rewards/format_reward": 0.9375000074505806, + "step": 288 + }, + { + "advantage_max": 1.9135861545801163, + "advantage_mean": 1.5522040874849097e-09, + "advantage_min": -0.7802659943699837, + "advantage_std": 0.9997969791293144, + "completion_length": 1395.208339691162, + "epoch": 0.3302857142857143, + "grad_norm": 0.3205528259277344, + "kl": 0.0114288330078125, + "lambda_div_used": 0.6, + "learning_rate": 5.060876951083828e-07, + "loss": 0.0005, + "reward": 0.2357093554455787, + "reward_advantage_correlation": 0.9999999999999997, + "reward_after_mean": 0.2357093554455787, + "reward_after_std": 0.609809685498476, + "reward_before_mean": 0.7810433376580477, + "reward_before_std": 0.4604929033666849, + "reward_change_max": 0.0005588680505752563, + "reward_change_mean": -0.5453339908272028, + "reward_change_min": -0.8528439849615097, + "reward_change_std": 0.3111587315797806, + "reward_std": 0.6098097078502178, + "rewards/cosine_scaled_reward": -0.026145002455450594, + "rewards/format_reward": 0.8333333358168602, + "step": 289 + }, + { + "advantage_max": 1.811624899506569, + "advantage_mean": -7.45058070794613e-09, + "advantage_min": -0.9065205976366997, + "advantage_std": 0.9998824968934059, + "completion_length": 1016.0625457763672, + "epoch": 0.3314285714285714, + "grad_norm": 0.31949731707572937, + "kl": 0.008113861083984375, + "lambda_div_used": 0.6, + "learning_rate": 5.02962191529556e-07, + "loss": 0.0003, + "reward": 0.5378194686491042, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.5378194686491042, + "reward_after_std": 0.9473526403307915, + "reward_before_mean": 1.1925920248031616, + "reward_before_std": 0.8950722739100456, + "reward_change_max": 0.0, + "reward_change_mean": -0.65477254986763, + "reward_change_min": -1.1149347499012947, + "reward_change_std": 0.4268352948129177, + "reward_std": 0.9473526403307915, + "rewards/cosine_scaled_reward": 0.13796266820281744, + "rewards/format_reward": 0.9166666716337204, + "step": 290 + }, + { + "advantage_max": 1.8208087533712387, + "advantage_mean": -6.208817682207268e-09, + "advantage_min": -0.9952463805675507, + "advantage_std": 0.9998692199587822, + "completion_length": 1115.0833587646484, + "epoch": 0.3325714285714286, + "grad_norm": 0.20173849165439606, + "kl": 0.0072803497314453125, + "lambda_div_used": 0.6, + "learning_rate": 4.998389805071536e-07, + "loss": 0.0003, + "reward": 0.4247822118923068, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.4247822118923068, + "reward_after_std": 0.9100354351103306, + "reward_before_mean": 1.0233391895890236, + "reward_before_std": 0.8653226383030415, + "reward_change_max": 0.0, + "reward_change_mean": -0.5985569916665554, + "reward_change_min": -1.0455361381173134, + "reward_change_std": 0.4032821226865053, + "reward_std": 0.9100354537367821, + "rewards/cosine_scaled_reward": 0.04291958408430219, + "rewards/format_reward": 0.9375000149011612, + "step": 291 + }, + { + "advantage_max": 1.9008750915527344, + "advantage_mean": 6.208817127095756e-09, + "advantage_min": -0.855563573539257, + "advantage_std": 0.9998486787080765, + "completion_length": 1590.5000305175781, + "epoch": 0.33371428571428574, + "grad_norm": 0.2894393503665924, + "kl": 0.0106658935546875, + "lambda_div_used": 0.6, + "learning_rate": 4.967182142620745e-07, + "loss": 0.0004, + "reward": 0.16041000466793776, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.16041000466793776, + "reward_after_std": 0.8050662241876125, + "reward_before_mean": 0.6222329139709473, + "reward_before_std": 0.7005427153781056, + "reward_change_max": 0.0, + "reward_change_mean": -0.46182290092110634, + "reward_change_min": -0.7366148829460144, + "reward_change_std": 0.2943364791572094, + "reward_std": 0.8050662279129028, + "rewards/cosine_scaled_reward": -0.10555021092295647, + "rewards/format_reward": 0.8333333395421505, + "step": 292 + }, + { + "advantage_max": 1.8485536128282547, + "advantage_mean": -4.190951696791956e-09, + "advantage_min": -0.9521011859178543, + "advantage_std": 0.9998137950897217, + "completion_length": 979.8541946411133, + "epoch": 0.33485714285714285, + "grad_norm": 0.241227388381958, + "kl": 0.008237838745117188, + "lambda_div_used": 0.6, + "learning_rate": 4.93600044896063e-07, + "loss": 0.0003, + "reward": 0.3330893259262666, + "reward_advantage_correlation": 0.9999999999999997, + "reward_after_mean": 0.3330893259262666, + "reward_after_std": 0.5936389490962029, + "reward_before_mean": 0.9396286457777023, + "reward_before_std": 0.48794906213879585, + "reward_change_max": 0.0, + "reward_change_mean": -0.6065393388271332, + "reward_change_min": -0.9153870195150375, + "reward_change_std": 0.3452066704630852, + "reward_std": 0.5936389714479446, + "rewards/cosine_scaled_reward": -0.019769012928009033, + "rewards/format_reward": 0.9791666716337204, + "step": 293 + }, + { + "advantage_max": 1.8205612301826477, + "advantage_mean": 9.313225635132483e-09, + "advantage_min": -0.9388800859451294, + "advantage_std": 0.999842956662178, + "completion_length": 1574.6250228881836, + "epoch": 0.336, + "grad_norm": 0.2210991382598877, + "kl": 0.010227203369140625, + "lambda_div_used": 0.6, + "learning_rate": 4.904846243842949e-07, + "loss": 0.0004, + "reward": 0.2603410785086453, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.2603410785086453, + "reward_after_std": 0.780572846531868, + "reward_before_mean": 0.7907075211405754, + "reward_before_std": 0.7266894429922104, + "reward_change_max": 0.00038830190896987915, + "reward_change_mean": -0.5303664095699787, + "reward_change_min": -0.902099747210741, + "reward_change_std": 0.34440297074615955, + "reward_std": 0.7805728502571583, + "rewards/cosine_scaled_reward": -0.031729597598314285, + "rewards/format_reward": 0.8541666772216558, + "step": 294 + }, + { + "advantage_max": 1.910035789012909, + "advantage_mean": 2.8250119410433427e-08, + "advantage_min": -0.7845353484153748, + "advantage_std": 0.9998215660452843, + "completion_length": 1460.7500305175781, + "epoch": 0.33714285714285713, + "grad_norm": 0.22684507071971893, + "kl": 0.0109710693359375, + "lambda_div_used": 0.6, + "learning_rate": 4.873721045679706e-07, + "loss": 0.0004, + "reward": 0.46327299624681473, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.46327299624681473, + "reward_after_std": 0.6574340984225273, + "reward_before_mean": 1.1236709244549274, + "reward_before_std": 0.45060044154524803, + "reward_change_max": 0.0, + "reward_change_mean": -0.660397931933403, + "reward_change_min": -0.9777256101369858, + "reward_change_std": 0.3532063625752926, + "reward_std": 0.6574341058731079, + "rewards/cosine_scaled_reward": 0.11391878468566574, + "rewards/format_reward": 0.8958333358168602, + "step": 295 + }, + { + "advantage_max": 1.8386378586292267, + "advantage_mean": -4.9670543234014986e-09, + "advantage_min": -0.8036707676947117, + "advantage_std": 0.9998711794614792, + "completion_length": 1556.6667098999023, + "epoch": 0.3382857142857143, + "grad_norm": 0.287063330411911, + "kl": 0.012338638305664062, + "lambda_div_used": 0.6, + "learning_rate": 4.842626371469149e-07, + "loss": 0.0005, + "reward": 0.3221674086526036, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.3221674086526036, + "reward_after_std": 0.9312387257814407, + "reward_before_mean": 0.8596511520445347, + "reward_before_std": 0.8932880535721779, + "reward_change_max": 0.0, + "reward_change_mean": -0.5374837517738342, + "reward_change_min": -0.9401872828602791, + "reward_change_std": 0.36972188390791416, + "reward_std": 0.9312387406826019, + "rewards/cosine_scaled_reward": -0.018091095611453056, + "rewards/format_reward": 0.8958333432674408, + "step": 296 + }, + { + "advantage_max": 1.873251423239708, + "advantage_mean": 2.0489097085629737e-08, + "advantage_min": -0.877208910882473, + "advantage_std": 0.9998420774936676, + "completion_length": 2009.8333740234375, + "epoch": 0.3394285714285714, + "grad_norm": 0.22784045338630676, + "kl": 0.0127410888671875, + "lambda_div_used": 0.6, + "learning_rate": 4.811563736721829e-07, + "loss": 0.0005, + "reward": 0.1964537873864174, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.1964537873864174, + "reward_after_std": 0.7367817126214504, + "reward_before_mean": 0.6939435098320246, + "reward_before_std": 0.6230933777987957, + "reward_change_max": 0.0023637935519218445, + "reward_change_mean": -0.497489670291543, + "reward_change_min": -0.8366812095046043, + "reward_change_std": 0.3124427292495966, + "reward_std": 0.7367817237973213, + "rewards/cosine_scaled_reward": -0.03844492509961128, + "rewards/format_reward": 0.770833345130086, + "step": 297 + }, + { + "advantage_max": 1.9013422727584839, + "advantage_mean": -1.6453366002977532e-08, + "advantage_min": -0.8251728340983391, + "advantage_std": 0.9998217076063156, + "completion_length": 1473.8125381469727, + "epoch": 0.3405714285714286, + "grad_norm": 0.2259664386510849, + "kl": 0.009435653686523438, + "lambda_div_used": 0.6, + "learning_rate": 4.780534655386743e-07, + "loss": 0.0004, + "reward": 0.17620949761476368, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.17620949761476368, + "reward_after_std": 0.6479051597416401, + "reward_before_mean": 0.6807891931384802, + "reward_before_std": 0.5183268450200558, + "reward_change_max": 0.0, + "reward_change_mean": -0.5045796744525433, + "reward_change_min": -0.8199915066361427, + "reward_change_std": 0.3016477432101965, + "reward_std": 0.647905170917511, + "rewards/cosine_scaled_reward": -0.08668875135481358, + "rewards/format_reward": 0.8541666828095913, + "step": 298 + }, + { + "advantage_max": 1.8082835525274277, + "advantage_mean": -6.829699250587851e-09, + "advantage_min": -0.9486165568232536, + "advantage_std": 0.9998417943716049, + "completion_length": 1438.270851135254, + "epoch": 0.3417142857142857, + "grad_norm": 0.2638239562511444, + "kl": 0.01007843017578125, + "lambda_div_used": 0.6, + "learning_rate": 4.749540639777539e-07, + "loss": 0.0004, + "reward": 0.09841901052277535, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.09841901052277535, + "reward_after_std": 0.7489665485918522, + "reward_before_mean": 0.5462133912369609, + "reward_before_std": 0.7167009748518467, + "reward_change_max": 0.00038520991802215576, + "reward_change_mean": -0.4477943815290928, + "reward_change_min": -0.828592486679554, + "reward_change_std": 0.3159326184540987, + "reward_std": 0.748966570943594, + "rewards/cosine_scaled_reward": -0.17480998300015926, + "rewards/format_reward": 0.8958333432674408, + "step": 299 + }, + { + "advantage_max": 1.8891912400722504, + "advantage_mean": 8.071462831438225e-09, + "advantage_min": -0.8375274538993835, + "advantage_std": 0.999823771417141, + "completion_length": 1474.1875305175781, + "epoch": 0.34285714285714286, + "grad_norm": 0.34135720133781433, + "kl": 0.01102447509765625, + "lambda_div_used": 0.6, + "learning_rate": 4.7185832004988133e-07, + "loss": 0.0004, + "reward": 0.18832044645387214, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.18832044645387214, + "reward_after_std": 0.682534109801054, + "reward_before_mean": 0.6911065131425858, + "reward_before_std": 0.574971929192543, + "reward_change_max": 0.0005830526351928711, + "reward_change_mean": -0.5027860454283655, + "reward_change_min": -0.8069445490837097, + "reward_change_std": 0.29158598743379116, + "reward_std": 0.6825341135263443, + "rewards/cosine_scaled_reward": -0.07111341133713722, + "rewards/format_reward": 0.8333333432674408, + "step": 300 + }, + { + "advantage_max": 1.8619256168603897, + "advantage_mean": -1.6763806787167823e-08, + "advantage_min": -0.9469154067337513, + "advantage_std": 0.9998360425233841, + "completion_length": 1291.520896911621, + "epoch": 0.344, + "grad_norm": 0.30536800622940063, + "kl": 0.012241363525390625, + "lambda_div_used": 0.6, + "learning_rate": 4.68766384637248e-07, + "loss": 0.0005, + "reward": 0.3281488213688135, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.3281488213688135, + "reward_after_std": 0.7333485223352909, + "reward_before_mean": 0.9018230102956295, + "reward_before_std": 0.6293862089514732, + "reward_change_max": 0.0, + "reward_change_mean": -0.5736742094159126, + "reward_change_min": -0.9062788337469101, + "reward_change_std": 0.33993304893374443, + "reward_std": 0.7333485260605812, + "rewards/cosine_scaled_reward": -0.03867182228714228, + "rewards/format_reward": 0.9791666716337204, + "step": 301 + }, + { + "advantage_max": 1.8721920400857925, + "advantage_mean": -4.967053879312289e-09, + "advantage_min": -0.7614920884370804, + "advantage_std": 0.9998584762215614, + "completion_length": 1661.166748046875, + "epoch": 0.34514285714285714, + "grad_norm": 0.33685818314552307, + "kl": 0.01244354248046875, + "lambda_div_used": 0.6, + "learning_rate": 4.656784084364238e-07, + "loss": 0.0005, + "reward": 0.38028667867183685, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.38028667867183685, + "reward_after_std": 0.7807257771492004, + "reward_before_mean": 0.9740477418527007, + "reward_before_std": 0.6580136283300817, + "reward_change_max": 0.0, + "reward_change_mean": -0.5937610603868961, + "reward_change_min": -1.0487833842635155, + "reward_change_std": 0.3755612950772047, + "reward_std": 0.7807258144021034, + "rewards/cosine_scaled_reward": 0.11202386766672134, + "rewards/format_reward": 0.7500000018626451, + "step": 302 + }, + { + "advantage_max": 1.8254009038209915, + "advantage_mean": -5.587935614226325e-09, + "advantage_min": -0.9064772799611092, + "advantage_std": 0.9998302906751633, + "completion_length": 1085.5208587646484, + "epoch": 0.3462857142857143, + "grad_norm": 0.3099322021007538, + "kl": 0.00725555419921875, + "lambda_div_used": 0.6, + "learning_rate": 4.6259454195101267e-07, + "loss": 0.0003, + "reward": 0.4087141342461109, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.4087141342461109, + "reward_after_std": 0.7723647579550743, + "reward_before_mean": 1.0217244923114777, + "reward_before_std": 0.7018113266676664, + "reward_change_max": 0.0, + "reward_change_mean": -0.6130103468894958, + "reward_change_min": -1.0503919497132301, + "reward_change_std": 0.38757357373833656, + "reward_std": 0.7723647765815258, + "rewards/cosine_scaled_reward": 0.052528894040733576, + "rewards/format_reward": 0.916666679084301, + "step": 303 + }, + { + "advantage_max": 1.8945975750684738, + "advantage_mean": 4.1599077515996896e-08, + "advantage_min": -0.7990177571773529, + "advantage_std": 0.9998310655355453, + "completion_length": 1432.6875305175781, + "epoch": 0.3474285714285714, + "grad_norm": 0.2642192244529724, + "kl": 0.01309967041015625, + "lambda_div_used": 0.6, + "learning_rate": 4.59514935484316e-07, + "loss": 0.0005, + "reward": 0.27664350939448923, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.27664350939448923, + "reward_after_std": 0.7569139413535595, + "reward_before_mean": 0.8116565495729446, + "reward_before_std": 0.6271609403192997, + "reward_change_max": 0.0004661977291107178, + "reward_change_mean": -0.5350130377337337, + "reward_change_min": -0.850684642791748, + "reward_change_std": 0.3104001171886921, + "reward_std": 0.7569139413535595, + "rewards/cosine_scaled_reward": -0.021255063824355602, + "rewards/format_reward": 0.8541666716337204, + "step": 304 + }, + { + "advantage_max": 1.7953919023275375, + "advantage_mean": -1.4280279736489376e-08, + "advantage_min": -0.9705780595541, + "advantage_std": 0.9998304322361946, + "completion_length": 1441.645881652832, + "epoch": 0.3485714285714286, + "grad_norm": 0.35386335849761963, + "kl": 0.01190948486328125, + "lambda_div_used": 0.6, + "learning_rate": 4.5643973913200837e-07, + "loss": 0.0005, + "reward": 0.06580792646855116, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.06580792646855116, + "reward_after_std": 0.6579489149153233, + "reward_before_mean": 0.5137032084167004, + "reward_before_std": 0.6019007451832294, + "reward_change_max": 0.0008752346038818359, + "reward_change_mean": -0.4478952884674072, + "reward_change_min": -0.7521562948822975, + "reward_change_std": 0.28863073140382767, + "reward_std": 0.6579489223659039, + "rewards/cosine_scaled_reward": -0.17023173440247774, + "rewards/format_reward": 0.854166679084301, + "step": 305 + }, + { + "advantage_max": 1.8763258010149002, + "advantage_mean": -3.2285850215529877e-08, + "advantage_min": -0.7838627435266972, + "advantage_std": 0.9998114481568336, + "completion_length": 1086.4375381469727, + "epoch": 0.3497142857142857, + "grad_norm": 0.24181218445301056, + "kl": 0.00975799560546875, + "lambda_div_used": 0.6, + "learning_rate": 4.5336910277482155e-07, + "loss": 0.0004, + "reward": 0.48075782135128975, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.48075782135128975, + "reward_after_std": 0.6983195133507252, + "reward_before_mean": 1.1519020181149244, + "reward_before_std": 0.5561750112101436, + "reward_change_max": 0.0, + "reward_change_mean": -0.6711442433297634, + "reward_change_min": -1.0415485128760338, + "reward_change_std": 0.40287046879529953, + "reward_std": 0.6983195282518864, + "rewards/cosine_scaled_reward": 0.09678435418754816, + "rewards/format_reward": 0.9583333358168602, + "step": 306 + }, + { + "advantage_max": 1.8967533856630325, + "advantage_mean": -2.980232305382913e-08, + "advantage_min": -0.8466514945030212, + "advantage_std": 0.9998496323823929, + "completion_length": 1320.62504196167, + "epoch": 0.35085714285714287, + "grad_norm": 0.29407626390457153, + "kl": 0.01064300537109375, + "lambda_div_used": 0.6, + "learning_rate": 4.503031760712397e-07, + "loss": 0.0004, + "reward": 0.28989268373697996, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.28989268373697996, + "reward_after_std": 0.8201549686491489, + "reward_before_mean": 0.8231144957244396, + "reward_before_std": 0.6994161698967218, + "reward_change_max": 0.0011776536703109741, + "reward_change_mean": -0.5332218129187822, + "reward_change_min": -0.8389204330742359, + "reward_change_std": 0.31923081912100315, + "reward_std": 0.8201549835503101, + "rewards/cosine_scaled_reward": -0.025942761451005936, + "rewards/format_reward": 0.8750000055879354, + "step": 307 + }, + { + "advantage_max": 1.8265314847230911, + "advantage_mean": 2.4214386717957836e-08, + "advantage_min": -1.0207050442695618, + "advantage_std": 0.9998078942298889, + "completion_length": 2310.291763305664, + "epoch": 0.352, + "grad_norm": 0.18933595716953278, + "kl": 0.01651763916015625, + "lambda_div_used": 0.6, + "learning_rate": 4.4724210845020494e-07, + "loss": 0.0007, + "reward": 0.1366605656221509, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.1366605656221509, + "reward_after_std": 0.7060784362256527, + "reward_before_mean": 0.6142788184806705, + "reward_before_std": 0.6519606187939644, + "reward_change_max": 0.00028352439403533936, + "reward_change_mean": -0.4776182733476162, + "reward_change_min": -0.7881277278065681, + "reward_change_std": 0.29920427687466145, + "reward_std": 0.7060784623026848, + "rewards/cosine_scaled_reward": -0.03661058656871319, + "rewards/format_reward": 0.6875000055879354, + "step": 308 + }, + { + "advantage_max": 1.7872188687324524, + "advantage_mean": 2.4835271617007493e-09, + "advantage_min": -0.9430239722132683, + "advantage_std": 0.9998209476470947, + "completion_length": 1762.8333740234375, + "epoch": 0.35314285714285715, + "grad_norm": 0.22261229157447815, + "kl": 0.01409912109375, + "lambda_div_used": 0.6, + "learning_rate": 4.441860491038345e-07, + "loss": 0.0006, + "reward": 0.12087270012125373, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.12087270012125373, + "reward_after_std": 0.7508501298725605, + "reward_before_mean": 0.5837531480938196, + "reward_before_std": 0.7245769258588552, + "reward_change_max": 0.0, + "reward_change_mean": -0.4628804475069046, + "reward_change_min": -0.8756393194198608, + "reward_change_std": 0.3242361284792423, + "reward_std": 0.7508501410484314, + "rewards/cosine_scaled_reward": -0.10395676456391811, + "rewards/format_reward": 0.7916666716337204, + "step": 309 + }, + { + "advantage_max": 1.8359250724315643, + "advantage_mean": -6.208814573582799e-10, + "advantage_min": -0.9323371052742004, + "advantage_std": 0.9998283386230469, + "completion_length": 1442.5625534057617, + "epoch": 0.35428571428571426, + "grad_norm": 0.3380570411682129, + "kl": 0.01674652099609375, + "lambda_div_used": 0.6, + "learning_rate": 4.4113514698014953e-07, + "loss": 0.0007, + "reward": 0.17298853071406484, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.17298853071406484, + "reward_after_std": 0.649883933365345, + "reward_before_mean": 0.6802481282502413, + "reward_before_std": 0.5928199477493763, + "reward_change_max": 0.0, + "reward_change_mean": -0.5072596073150635, + "reward_change_min": -0.874159500002861, + "reward_change_std": 0.3214066829532385, + "reward_std": 0.6498839743435383, + "rewards/cosine_scaled_reward": -0.10779260657727718, + "rewards/format_reward": 0.8958333358168602, + "step": 310 + }, + { + "advantage_max": 1.922322541475296, + "advantage_mean": -4.967053879312289e-09, + "advantage_min": -0.7284784689545631, + "advantage_std": 0.999852791428566, + "completion_length": 1135.8750228881836, + "epoch": 0.3554285714285714, + "grad_norm": 0.22316350042819977, + "kl": 0.008321762084960938, + "lambda_div_used": 0.6, + "learning_rate": 4.3808955077581546e-07, + "loss": 0.0003, + "reward": 0.466993102512788, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.466993102512788, + "reward_after_std": 0.8413489013910294, + "reward_before_mean": 1.0938301347196102, + "reward_before_std": 0.6916535142809153, + "reward_change_max": 0.0, + "reward_change_mean": -0.6268370077013969, + "reward_change_min": -1.0190090090036392, + "reward_change_std": 0.3714268747717142, + "reward_std": 0.8413489013910294, + "rewards/cosine_scaled_reward": 0.04691504535730928, + "rewards/format_reward": 1.0, + "step": 311 + }, + { + "advantage_max": 1.9405202120542526, + "advantage_mean": -4.315127988263612e-08, + "advantage_min": -0.7634863182902336, + "advantage_std": 0.9998544678092003, + "completion_length": 1331.7708587646484, + "epoch": 0.3565714285714286, + "grad_norm": 0.287652850151062, + "kl": 0.011852264404296875, + "lambda_div_used": 0.6, + "learning_rate": 4.350494089288943e-07, + "loss": 0.0005, + "reward": 0.6322616841644049, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.6322616841644049, + "reward_after_std": 0.7453844994306564, + "reward_before_mean": 1.3673997856676579, + "reward_before_std": 0.5059651434421539, + "reward_change_max": 0.0, + "reward_change_mean": -0.7351380791515112, + "reward_change_min": -1.0781624987721443, + "reward_change_std": 0.4042064417153597, + "reward_std": 0.7453845143318176, + "rewards/cosine_scaled_reward": 0.24619986613106448, + "rewards/format_reward": 0.8750000111758709, + "step": 312 + }, + { + "advantage_max": 1.8338307291269302, + "advantage_mean": -8.692344399818808e-09, + "advantage_min": -0.9592939876019955, + "advantage_std": 0.9998330473899841, + "completion_length": 1714.4791984558105, + "epoch": 0.3577142857142857, + "grad_norm": 0.24802806973457336, + "kl": 0.01470947265625, + "lambda_div_used": 0.6, + "learning_rate": 4.3201486961161093e-07, + "loss": 0.0006, + "reward": 0.3089195266366005, + "reward_advantage_correlation": 0.9999999999999996, + "reward_after_mean": 0.3089195266366005, + "reward_after_std": 0.8284649103879929, + "reward_before_mean": 0.8583853915333748, + "reward_before_std": 0.7549590589478612, + "reward_change_max": 0.0007516443729400635, + "reward_change_mean": -0.5494658825919032, + "reward_change_min": -0.9307254888117313, + "reward_change_std": 0.3660744549706578, + "reward_std": 0.8284649439156055, + "rewards/cosine_scaled_reward": 0.0750260278582573, + "rewards/format_reward": 0.7083333488553762, + "step": 313 + }, + { + "advantage_max": 1.8229438215494156, + "advantage_mean": -8.692344732885715e-09, + "advantage_min": -0.9871519505977631, + "advantage_std": 0.9998666346073151, + "completion_length": 1355.0208778381348, + "epoch": 0.3588571428571429, + "grad_norm": 0.2961985766887665, + "kl": 0.01201629638671875, + "lambda_div_used": 0.6, + "learning_rate": 4.2898608072313045e-07, + "loss": 0.0005, + "reward": 0.42520161904394627, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.42520161904394627, + "reward_after_std": 0.840074960142374, + "reward_before_mean": 1.0372523088008165, + "reward_before_std": 0.7808595895767212, + "reward_change_max": 0.0003578066825866699, + "reward_change_mean": -0.6120506376028061, + "reward_change_min": -1.002462938427925, + "reward_change_std": 0.39382885955274105, + "reward_std": 0.8400749824941158, + "rewards/cosine_scaled_reward": 0.13320946041494608, + "rewards/format_reward": 0.7708333395421505, + "step": 314 + }, + { + "advantage_max": 1.8934376388788223, + "advantage_mean": -1.1020650614312899e-08, + "advantage_min": -0.9089367166161537, + "advantage_std": 0.999834805727005, + "completion_length": 1825.6458587646484, + "epoch": 0.36, + "grad_norm": 0.3252292573451996, + "kl": 0.024791717529296875, + "lambda_div_used": 0.6, + "learning_rate": 4.2596318988235037e-07, + "loss": 0.001, + "reward": 0.2225517202168703, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.2225517202168703, + "reward_after_std": 0.6751000694930553, + "reward_before_mean": 0.748973336070776, + "reward_before_std": 0.5620210394263268, + "reward_change_max": 0.0004638954997062683, + "reward_change_mean": -0.5264216009527445, + "reward_change_min": -0.8017611876130104, + "reward_change_std": 0.3129722382873297, + "reward_std": 0.6751000992953777, + "rewards/cosine_scaled_reward": -0.0005133431404829025, + "rewards/format_reward": 0.750000013038516, + "step": 315 + }, + { + "advantage_max": 1.8182682543992996, + "advantage_mean": 4.3461718668424965e-09, + "advantage_min": -0.927369087934494, + "advantage_std": 0.9998475015163422, + "completion_length": 1909.4583892822266, + "epoch": 0.36114285714285715, + "grad_norm": 0.3349359333515167, + "kl": 0.019321441650390625, + "lambda_div_used": 0.6, + "learning_rate": 4.2294634442070553e-07, + "loss": 0.0008, + "reward": 0.019429476466029882, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.019429476466029882, + "reward_after_std": 0.7821727208793163, + "reward_before_mean": 0.4113868260756135, + "reward_before_std": 0.7491345182061195, + "reward_change_max": 0.0010381415486335754, + "reward_change_mean": -0.39195735938847065, + "reward_change_min": -0.6826085112988949, + "reward_change_std": 0.27011805586516857, + "reward_std": 0.7821727506816387, + "rewards/cosine_scaled_reward": -0.1588899241760373, + "rewards/format_reward": 0.7291666828095913, + "step": 316 + }, + { + "advantage_max": 1.8804281800985336, + "advantage_mean": -5.587935725248627e-09, + "advantage_min": -0.7781447246670723, + "advantage_std": 0.9998077526688576, + "completion_length": 1622.0417175292969, + "epoch": 0.36228571428571427, + "grad_norm": 0.3521542549133301, + "kl": 0.017303466796875, + "lambda_div_used": 0.6, + "learning_rate": 4.1993569137498776e-07, + "loss": 0.0007, + "reward": 0.11890967702493072, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.11890967702493072, + "reward_after_std": 0.6751161739230156, + "reward_before_mean": 0.587021853774786, + "reward_before_std": 0.5646471055224538, + "reward_change_max": 0.0004993155598640442, + "reward_change_mean": -0.46811217814683914, + "reward_change_min": -0.7818466536700726, + "reward_change_std": 0.29836034402251244, + "reward_std": 0.6751161776483059, + "rewards/cosine_scaled_reward": -0.0710724163800478, + "rewards/format_reward": 0.7291666697710752, + "step": 317 + }, + { + "advantage_max": 1.8058977276086807, + "advantage_mean": -1.9247333504779363e-08, + "advantage_min": -0.9391337558627129, + "advantage_std": 0.9998605996370316, + "completion_length": 1382.2500267028809, + "epoch": 0.36342857142857143, + "grad_norm": 0.5408955812454224, + "kl": 0.020711898803710938, + "lambda_div_used": 0.6, + "learning_rate": 4.1693137748017915e-07, + "loss": 0.0008, + "reward": 0.19501495765871368, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.19501495765871368, + "reward_after_std": 0.805657759308815, + "reward_before_mean": 0.6848395057022572, + "reward_before_std": 0.7711063474416733, + "reward_change_max": 0.0032318681478500366, + "reward_change_mean": -0.489824540913105, + "reward_change_min": -0.9173102602362633, + "reward_change_std": 0.3421527110040188, + "reward_std": 0.8056577667593956, + "rewards/cosine_scaled_reward": -0.09508026950061321, + "rewards/format_reward": 0.8750000111758709, + "step": 318 + }, + { + "advantage_max": 1.875873938202858, + "advantage_mean": 4.656612984099695e-09, + "advantage_min": -0.8540654145181179, + "advantage_std": 0.9998373687267303, + "completion_length": 1421.8125610351562, + "epoch": 0.36457142857142855, + "grad_norm": 0.3166425824165344, + "kl": 0.011302947998046875, + "lambda_div_used": 0.6, + "learning_rate": 4.1393354916230005e-07, + "loss": 0.0005, + "reward": 0.0982060037786141, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.0982060037786141, + "reward_after_std": 0.711435116827488, + "reward_before_mean": 0.548051243647933, + "reward_before_std": 0.6334625743329525, + "reward_change_max": 1.6868114471435547e-05, + "reward_change_mean": -0.44984522834420204, + "reward_change_min": -0.7615330517292023, + "reward_change_std": 0.2841661609709263, + "reward_std": 0.7114351354539394, + "rewards/cosine_scaled_reward": -0.17389106666087173, + "rewards/format_reward": 0.8958333432674408, + "step": 319 + }, + { + "advantage_max": 1.844655841588974, + "advantage_mean": 6.208817460162663e-09, + "advantage_min": -0.8964797705411911, + "advantage_std": 0.99985171854496, + "completion_length": 1050.1666946411133, + "epoch": 0.3657142857142857, + "grad_norm": 0.32885971665382385, + "kl": 0.016178131103515625, + "lambda_div_used": 0.6, + "learning_rate": 4.1094235253127374e-07, + "loss": 0.0006, + "reward": 0.34835223481059074, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.34835223481059074, + "reward_after_std": 0.8526491969823837, + "reward_before_mean": 0.905394246801734, + "reward_before_std": 0.7732513155788183, + "reward_change_max": 0.0, + "reward_change_mean": -0.5570420175790787, + "reward_change_min": -0.9411030262708664, + "reward_change_std": 0.35189635306596756, + "reward_std": 0.8526492044329643, + "rewards/cosine_scaled_reward": -0.01605289150029421, + "rewards/format_reward": 0.9375000074505806, + "step": 320 + }, + { + "advantage_max": 1.9735696017742157, + "advantage_mean": -3.725290431688677e-08, + "advantage_min": -0.6550575718283653, + "advantage_std": 0.9998459443449974, + "completion_length": 912.5625152587891, + "epoch": 0.3668571428571429, + "grad_norm": 0.252029687166214, + "kl": 0.0068225860595703125, + "lambda_div_used": 0.6, + "learning_rate": 4.079579333738039e-07, + "loss": 0.0003, + "reward": 0.574149573687464, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.574149573687464, + "reward_after_std": 0.7687293998897076, + "reward_before_mean": 1.2689920328557491, + "reward_before_std": 0.5234791114926338, + "reward_change_max": 0.0, + "reward_change_mean": -0.6948424577713013, + "reward_change_min": -1.0286077186465263, + "reward_change_std": 0.37573184818029404, + "reward_std": 0.7687294036149979, + "rewards/cosine_scaled_reward": 0.13449599593877792, + "rewards/format_reward": 1.0, + "step": 321 + }, + { + "advantage_max": 1.8821515142917633, + "advantage_mean": 6.829699028543246e-09, + "advantage_min": -0.8529209233820438, + "advantage_std": 0.9998260661959648, + "completion_length": 1869.2083854675293, + "epoch": 0.368, + "grad_norm": 0.5277259349822998, + "kl": 0.0365142822265625, + "lambda_div_used": 0.6, + "learning_rate": 4.0498043714627006e-07, + "loss": 0.0015, + "reward": -0.018657252425327897, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.018657252425327897, + "reward_after_std": 0.7735125869512558, + "reward_before_mean": 0.3417943734675646, + "reward_before_std": 0.7077783048152924, + "reward_change_max": 0.001621730625629425, + "reward_change_mean": -0.36045162566006184, + "reward_change_min": -0.6681258156895638, + "reward_change_std": 0.24554293975234032, + "reward_std": 0.773512601852417, + "rewards/cosine_scaled_reward": -0.17285283096134663, + "rewards/format_reward": 0.6875000055879354, + "step": 322 + }, + { + "advantage_max": 1.9139571785926819, + "advantage_mean": 3.3306690738754696e-16, + "advantage_min": -0.8632525950670242, + "advantage_std": 0.9998295903205872, + "completion_length": 1469.3958892822266, + "epoch": 0.36914285714285716, + "grad_norm": 0.43059584498405457, + "kl": 0.01897430419921875, + "lambda_div_used": 0.6, + "learning_rate": 4.020100089676376e-07, + "loss": 0.0008, + "reward": 0.16783785168081522, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.16783785168081522, + "reward_after_std": 0.6461106389760971, + "reward_before_mean": 0.665590648073703, + "reward_before_std": 0.5015575010329485, + "reward_change_max": 0.0, + "reward_change_mean": -0.49775280244648457, + "reward_change_min": -0.7560123316943645, + "reward_change_std": 0.2950308583676815, + "reward_std": 0.6461106389760971, + "rewards/cosine_scaled_reward": -0.06303800735622644, + "rewards/format_reward": 0.7916666716337204, + "step": 323 + }, + { + "advantage_max": 1.8660519123077393, + "advantage_mean": -2.048909675256283e-08, + "advantage_min": -0.8690266758203506, + "advantage_std": 0.9998775646090508, + "completion_length": 1304.645866394043, + "epoch": 0.3702857142857143, + "grad_norm": 0.4782713055610657, + "kl": 0.018848419189453125, + "lambda_div_used": 0.6, + "learning_rate": 3.9904679361238526e-07, + "loss": 0.0008, + "reward": 0.309486435726285, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.309486435726285, + "reward_after_std": 0.909374438226223, + "reward_before_mean": 0.8317293685395271, + "reward_before_std": 0.8388629630208015, + "reward_change_max": 0.0, + "reward_change_mean": -0.5222429521381855, + "reward_change_min": -0.950531929731369, + "reward_change_std": 0.3524559233337641, + "reward_std": 0.9093744680285454, + "rewards/cosine_scaled_reward": 0.009614669252187014, + "rewards/format_reward": 0.8125000037252903, + "step": 324 + }, + { + "advantage_max": 1.8715059906244278, + "advantage_mean": -1.6763806787167823e-08, + "advantage_min": -0.8543071523308754, + "advantage_std": 0.9998787268996239, + "completion_length": 1794.1666946411133, + "epoch": 0.37142857142857144, + "grad_norm": 0.29662418365478516, + "kl": 0.016979217529296875, + "lambda_div_used": 0.6, + "learning_rate": 3.9609093550344907e-07, + "loss": 0.0007, + "reward": 0.2978088464587927, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.2978088464587927, + "reward_after_std": 0.8947106897830963, + "reward_before_mean": 0.8223866783082485, + "reward_before_std": 0.7958251684904099, + "reward_change_max": 0.0008146986365318298, + "reward_change_mean": -0.5245778262615204, + "reward_change_min": -0.9071747809648514, + "reward_change_std": 0.3426001761108637, + "reward_std": 0.8947107121348381, + "rewards/cosine_scaled_reward": 0.046609988203272223, + "rewards/format_reward": 0.729166679084301, + "step": 325 + }, + { + "advantage_max": 1.8180214762687683, + "advantage_mean": -2.6697914268236644e-08, + "advantage_min": -0.9289712607860565, + "advantage_std": 0.9998547285795212, + "completion_length": 1444.2292098999023, + "epoch": 0.37257142857142855, + "grad_norm": 0.3878525495529175, + "kl": 0.02288818359375, + "lambda_div_used": 0.6, + "learning_rate": 3.931425787051832e-07, + "loss": 0.0009, + "reward": 0.26968370797112584, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.26968370797112584, + "reward_after_std": 0.8076260313391685, + "reward_before_mean": 0.80234469845891, + "reward_before_std": 0.7833601534366608, + "reward_change_max": 0.0002892613410949707, + "reward_change_mean": -0.5326609779149294, + "reward_change_min": -0.9844692200422287, + "reward_change_std": 0.37313529662787914, + "reward_std": 0.8076260536909103, + "rewards/cosine_scaled_reward": 0.005338998977094889, + "rewards/format_reward": 0.7916666716337204, + "step": 326 + }, + { + "advantage_max": 1.9538254141807556, + "advantage_mean": -3.476937671109681e-08, + "advantage_min": -0.742390725761652, + "advantage_std": 0.9998945817351341, + "completion_length": 1554.2500381469727, + "epoch": 0.3737142857142857, + "grad_norm": 0.3174353539943695, + "kl": 0.012559890747070312, + "lambda_div_used": 0.6, + "learning_rate": 3.902018669163384e-07, + "loss": 0.0005, + "reward": 0.49121918249875307, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.49121918249875307, + "reward_after_std": 1.0039337873458862, + "reward_before_mean": 1.0879922366584651, + "reward_before_std": 0.8203750047832727, + "reward_change_max": 0.0, + "reward_change_mean": -0.596773061901331, + "reward_change_min": -0.9268834516406059, + "reward_change_std": 0.33764065988361835, + "reward_std": 1.003933809697628, + "rewards/cosine_scaled_reward": 0.12732943054288626, + "rewards/format_reward": 0.833333333954215, + "step": 327 + }, + { + "advantage_max": 1.8848908692598343, + "advantage_mean": 3.166496809203778e-08, + "advantage_min": -0.8913846984505653, + "advantage_std": 0.9998078942298889, + "completion_length": 1600.4792098999023, + "epoch": 0.37485714285714283, + "grad_norm": 0.3274012506008148, + "kl": 0.017192840576171875, + "lambda_div_used": 0.6, + "learning_rate": 3.872689434630585e-07, + "loss": 0.0007, + "reward": 0.1585409319959581, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.1585409319959581, + "reward_after_std": 0.7077501621097326, + "reward_before_mean": 0.6421646438539028, + "reward_before_std": 0.6207192353904247, + "reward_change_max": 0.00017159432172775269, + "reward_change_mean": -0.48362372256815434, + "reward_change_min": -0.842924177646637, + "reward_change_std": 0.3040748070925474, + "reward_std": 0.7077501658350229, + "rewards/cosine_scaled_reward": -0.06433434877544641, + "rewards/format_reward": 0.770833333954215, + "step": 328 + }, + { + "advantage_max": 1.8619430512189865, + "advantage_mean": -9.468446360294536e-09, + "advantage_min": -0.9495992064476013, + "advantage_std": 0.9998591840267181, + "completion_length": 929.3542060852051, + "epoch": 0.376, + "grad_norm": 0.30674153566360474, + "kl": 0.0108184814453125, + "lambda_div_used": 0.6, + "learning_rate": 3.843439512918949e-07, + "loss": 0.0004, + "reward": 0.4454690790735185, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.4454690790735185, + "reward_after_std": 0.8043604120612144, + "reward_before_mean": 1.0710258733015507, + "reward_before_std": 0.7005043588578701, + "reward_change_max": 0.0, + "reward_change_mean": -0.625556755810976, + "reward_change_min": -0.9712866432964802, + "reward_change_std": 0.37744545564055443, + "reward_std": 0.8043604269623756, + "rewards/cosine_scaled_reward": 0.0667628962546587, + "rewards/format_reward": 0.9375000074505806, + "step": 329 + }, + { + "advantage_max": 1.9051975160837173, + "advantage_mean": -7.062529561174813e-09, + "advantage_min": -0.834363654255867, + "advantage_std": 0.9998826459050179, + "completion_length": 1213.083366394043, + "epoch": 0.37714285714285717, + "grad_norm": 0.4489211440086365, + "kl": 0.019287109375, + "lambda_div_used": 0.6, + "learning_rate": 3.8142703296283953e-07, + "loss": 0.0008, + "reward": 0.23261515237390995, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.23261515237390995, + "reward_after_std": 1.0043475143611431, + "reward_before_mean": 0.6893374051433057, + "reward_before_std": 0.9294507168233395, + "reward_change_max": 0.0012516528367996216, + "reward_change_mean": -0.45672223158180714, + "reward_change_min": -0.8383929058909416, + "reward_change_std": 0.31927052699029446, + "reward_std": 1.0043475553393364, + "rewards/cosine_scaled_reward": -0.0719979761634022, + "rewards/format_reward": 0.8333333432674408, + "step": 330 + }, + { + "advantage_max": 1.80654077231884, + "advantage_mean": -3.042320462220971e-08, + "advantage_min": -0.9978099688887596, + "advantage_std": 0.9997223243117332, + "completion_length": 1831.7708625793457, + "epoch": 0.3782857142857143, + "grad_norm": 0.3333419859409332, + "kl": 0.01973724365234375, + "lambda_div_used": 0.6, + "learning_rate": 3.785183306423767e-07, + "loss": 0.0008, + "reward": -0.053731471532955766, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.053731471532955766, + "reward_after_std": 0.4776611737906933, + "reward_before_mean": 0.3651720993220806, + "reward_before_std": 0.4157779663801193, + "reward_change_max": 0.00035496801137924194, + "reward_change_mean": -0.4189036013558507, + "reward_change_min": -0.6600331999361515, + "reward_change_std": 0.2541554179042578, + "reward_std": 0.4776612054556608, + "rewards/cosine_scaled_reward": -0.1507472936064005, + "rewards/format_reward": 0.6666666679084301, + "step": 331 + }, + { + "advantage_max": 1.8885580450296402, + "advantage_mean": -2.8560560139112567e-08, + "advantage_min": -0.7655205801129341, + "advantage_std": 0.9998506605625153, + "completion_length": 1258.0416870117188, + "epoch": 0.37942857142857145, + "grad_norm": 0.6487593650817871, + "kl": 0.017627716064453125, + "lambda_div_used": 0.6, + "learning_rate": 3.7561798609655373e-07, + "loss": 0.0007, + "reward": 0.21511683403514326, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.21511683403514326, + "reward_after_std": 0.8157045319676399, + "reward_before_mean": 0.7045507121365517, + "reward_before_std": 0.7418462634086609, + "reward_change_max": 0.00038327276706695557, + "reward_change_mean": -0.48943387530744076, + "reward_change_min": -0.854445330798626, + "reward_change_std": 0.3147754594683647, + "reward_std": 0.8157045654952526, + "rewards/cosine_scaled_reward": -0.08522465638816357, + "rewards/format_reward": 0.8750000055879354, + "step": 332 + }, + { + "advantage_max": 1.885246142745018, + "advantage_mean": -3.0423204400165105e-08, + "advantage_min": -0.8369051367044449, + "advantage_std": 0.9998035356402397, + "completion_length": 1188.6458587646484, + "epoch": 0.38057142857142856, + "grad_norm": 0.3059176504611969, + "kl": 0.012359619140625, + "lambda_div_used": 0.6, + "learning_rate": 3.72726140684072e-07, + "loss": 0.0005, + "reward": 0.38543284498155117, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.38543284498155117, + "reward_after_std": 0.6527124047279358, + "reward_before_mean": 1.0101193934679031, + "reward_before_std": 0.5245114741846919, + "reward_change_max": 0.0, + "reward_change_mean": -0.6246866025030613, + "reward_change_min": -0.9763289764523506, + "reward_change_std": 0.3626061100512743, + "reward_std": 0.6527124308049679, + "rewards/cosine_scaled_reward": 0.025893021374940872, + "rewards/format_reward": 0.9583333432674408, + "step": 333 + }, + { + "advantage_max": 1.857540786266327, + "advantage_mean": 8.071462720415923e-09, + "advantage_min": -0.799895916134119, + "advantage_std": 0.9998114556074142, + "completion_length": 2156.4167251586914, + "epoch": 0.38171428571428573, + "grad_norm": 0.33751410245895386, + "kl": 0.02600860595703125, + "lambda_div_used": 0.6, + "learning_rate": 3.6984293534939737e-07, + "loss": 0.001, + "reward": -0.1896799481473863, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.1896799481473863, + "reward_after_std": 0.6050053462386131, + "reward_before_mean": 0.12263576500117779, + "reward_before_std": 0.5591881051659584, + "reward_change_max": 0.0009383410215377808, + "reward_change_mean": -0.31231571082025766, + "reward_change_min": -0.5923841707408428, + "reward_change_std": 0.21185853891074657, + "reward_std": 0.6050053536891937, + "rewards/cosine_scaled_reward": -0.23034879751503468, + "rewards/format_reward": 0.5833333395421505, + "step": 334 + }, + { + "advantage_max": 1.8346337229013443, + "advantage_mean": -9.934107647602275e-09, + "advantage_min": -0.9422204568982124, + "advantage_std": 0.999869279563427, + "completion_length": 1255.1250534057617, + "epoch": 0.38285714285714284, + "grad_norm": 0.20903566479682922, + "kl": 0.008930206298828125, + "lambda_div_used": 0.6, + "learning_rate": 3.6696851061588994e-07, + "loss": 0.0004, + "reward": 0.4035352533683181, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.4035352533683181, + "reward_after_std": 0.8650985062122345, + "reward_before_mean": 0.9930033460259438, + "reward_before_std": 0.7928112968802452, + "reward_change_max": 0.00031920522451400757, + "reward_change_mean": -0.5894680954515934, + "reward_change_min": -0.9780095741152763, + "reward_change_std": 0.37678316980600357, + "reward_std": 0.8650985211133957, + "rewards/cosine_scaled_reward": 0.03816831856966019, + "rewards/format_reward": 0.9166666716337204, + "step": 335 + }, + { + "advantage_max": 1.6843600422143936, + "advantage_mean": -2.0489098251363913e-08, + "advantage_min": -1.1067031100392342, + "advantage_std": 0.9998784810304642, + "completion_length": 1705.6042098999023, + "epoch": 0.384, + "grad_norm": 0.3693605065345764, + "kl": 0.018991470336914062, + "lambda_div_used": 0.6, + "learning_rate": 3.641030065789562e-07, + "loss": 0.0008, + "reward": 0.3400895514059812, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.3400895514059812, + "reward_after_std": 0.96578698605299, + "reward_before_mean": 0.8908323775976896, + "reward_before_std": 1.0428907200694084, + "reward_change_max": 0.0005368068814277649, + "reward_change_mean": -0.5507428087294102, + "reward_change_min": -1.1123467236757278, + "reward_change_std": 0.4507921673357487, + "reward_std": 0.9657870009541512, + "rewards/cosine_scaled_reward": 0.1016661785542965, + "rewards/format_reward": 0.6875000149011612, + "step": 336 + }, + { + "advantage_max": 1.8957796841859818, + "advantage_mean": -3.725290520506519e-09, + "advantage_min": -0.7205033674836159, + "advantage_std": 0.9998714700341225, + "completion_length": 1377.7083740234375, + "epoch": 0.3851428571428571, + "grad_norm": 0.25850510597229004, + "kl": 0.011775970458984375, + "lambda_div_used": 0.6, + "learning_rate": 3.612465628992203e-07, + "loss": 0.0005, + "reward": 0.3554620500653982, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.3554620500653982, + "reward_after_std": 0.9742900244891644, + "reward_before_mean": 0.8959630839526653, + "reward_before_std": 0.8873303392902017, + "reward_change_max": 0.0, + "reward_change_mean": -0.540501032024622, + "reward_change_min": -1.0388685837388039, + "reward_change_std": 0.36955892480909824, + "reward_std": 0.9742900505661964, + "rewards/cosine_scaled_reward": -0.03118512872606516, + "rewards/format_reward": 0.9583333432674408, + "step": 337 + }, + { + "advantage_max": 1.8836781829595566, + "advantage_mean": 1.1175870007207322e-08, + "advantage_min": -0.8648780807852745, + "advantage_std": 0.9998218789696693, + "completion_length": 1293.7917098999023, + "epoch": 0.3862857142857143, + "grad_norm": 0.2512975037097931, + "kl": 0.009267807006835938, + "lambda_div_used": 0.6, + "learning_rate": 3.5839931879571725e-07, + "loss": 0.0004, + "reward": 0.318469176068902, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.318469176068902, + "reward_after_std": 0.7098839841783047, + "reward_before_mean": 0.8908936120569706, + "reward_before_std": 0.5762915294617414, + "reward_change_max": 0.0, + "reward_change_mean": -0.5724244341254234, + "reward_change_min": -0.8914236910641193, + "reward_change_std": 0.32981424406170845, + "reward_std": 0.709883987903595, + "rewards/cosine_scaled_reward": 0.007946796715259552, + "rewards/format_reward": 0.875, + "step": 338 + }, + { + "advantage_max": 1.8676584959030151, + "advantage_mean": 1.4280279680978225e-08, + "advantage_min": -0.8956063166260719, + "advantage_std": 0.9998053833842278, + "completion_length": 1835.2083435058594, + "epoch": 0.38742857142857146, + "grad_norm": 0.3322133719921112, + "kl": 0.0276947021484375, + "lambda_div_used": 0.6, + "learning_rate": 3.555614130391079e-07, + "loss": 0.0011, + "reward": -0.054544417187571526, + "reward_advantage_correlation": 1.0, + "reward_after_mean": -0.054544417187571526, + "reward_after_std": 0.6542263887822628, + "reward_before_mean": 0.32189842336811125, + "reward_before_std": 0.6224677935242653, + "reward_change_max": 0.0010414794087409973, + "reward_change_mean": -0.3764428086578846, + "reward_change_min": -0.6626890636980534, + "reward_change_std": 0.26315341144800186, + "reward_std": 0.654226390644908, + "rewards/cosine_scaled_reward": -0.16196747706271708, + "rewards/format_reward": 0.6458333488553762, + "step": 339 + }, + { + "advantage_max": 1.9051592350006104, + "advantage_mean": -3.166496753692627e-08, + "advantage_min": -0.8037348762154579, + "advantage_std": 0.9998365417122841, + "completion_length": 1288.6042098999023, + "epoch": 0.38857142857142857, + "grad_norm": 0.25094640254974365, + "kl": 0.010942459106445312, + "lambda_div_used": 0.6, + "learning_rate": 3.5273298394491515e-07, + "loss": 0.0004, + "reward": 0.28444598644273356, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.28444598644273356, + "reward_after_std": 0.6651497408747673, + "reward_before_mean": 0.8474188968539238, + "reward_before_std": 0.528843104839325, + "reward_change_max": 0.0, + "reward_change_mean": -0.5629729311913252, + "reward_change_min": -0.9151150286197662, + "reward_change_std": 0.3289155066013336, + "reward_std": 0.6651497483253479, + "rewards/cosine_scaled_reward": -0.013790564611554146, + "rewards/format_reward": 0.8750000055879354, + "step": 340 + }, + { + "advantage_max": 1.8734830617904663, + "advantage_mean": -4.34617203337595e-09, + "advantage_min": -0.8691454865038395, + "advantage_std": 0.9998512864112854, + "completion_length": 1238.9166946411133, + "epoch": 0.38971428571428574, + "grad_norm": 0.31012919545173645, + "kl": 0.014537811279296875, + "lambda_div_used": 0.6, + "learning_rate": 3.4991416936678276e-07, + "loss": 0.0006, + "reward": 0.577342574018985, + "reward_advantage_correlation": 0.9999999999999996, + "reward_after_mean": 0.577342574018985, + "reward_after_std": 0.8173014968633652, + "reward_before_mean": 1.2706860266625881, + "reward_before_std": 0.6470533646643162, + "reward_change_max": 0.0009817034006118774, + "reward_change_mean": -0.6933434382081032, + "reward_change_min": -1.1341840326786041, + "reward_change_std": 0.4181734323501587, + "reward_std": 0.8173014968633652, + "rewards/cosine_scaled_reward": 0.19784300029277802, + "rewards/format_reward": 0.8750000074505806, + "step": 341 + }, + { + "advantage_max": 1.8878445029258728, + "advantage_mean": -1.800557042352935e-08, + "advantage_min": -0.7828846871852875, + "advantage_std": 0.9998863115906715, + "completion_length": 1433.562515258789, + "epoch": 0.39085714285714285, + "grad_norm": 0.4130045771598816, + "kl": 0.020595550537109375, + "lambda_div_used": 0.6, + "learning_rate": 3.471051066897562e-07, + "loss": 0.0008, + "reward": 0.36572215892374516, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.36572215892374516, + "reward_after_std": 1.022568255662918, + "reward_before_mean": 0.8937026299536228, + "reward_before_std": 0.9488763399422169, + "reward_change_max": 0.0, + "reward_change_mean": -0.5279804691672325, + "reward_change_min": -1.0131430327892303, + "reward_change_std": 0.36056538484990597, + "reward_std": 1.0225682780146599, + "rewards/cosine_scaled_reward": -0.0010653771460056305, + "rewards/format_reward": 0.8958333432674408, + "step": 342 + }, + { + "advantage_max": 1.852153331041336, + "advantage_mean": 6.208817904251873e-10, + "advantage_min": -0.8911474421620369, + "advantage_std": 0.9999059438705444, + "completion_length": 1568.7917098999023, + "epoch": 0.392, + "grad_norm": 0.3222767114639282, + "kl": 0.019664764404296875, + "lambda_div_used": 0.6, + "learning_rate": 3.4430593282358777e-07, + "loss": 0.0008, + "reward": 0.5172849660739303, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.5172849660739303, + "reward_after_std": 1.1370365917682648, + "reward_before_mean": 1.1116691306233406, + "reward_before_std": 1.0664770379662514, + "reward_change_max": 0.0, + "reward_change_mean": -0.5943841449916363, + "reward_change_min": -1.137386292219162, + "reward_change_std": 0.41275884211063385, + "reward_std": 1.1370366215705872, + "rewards/cosine_scaled_reward": 0.09750120915123262, + "rewards/format_reward": 0.9166666716337204, + "step": 343 + }, + { + "advantage_max": 1.8314592689275742, + "advantage_mean": -8.22668272393301e-09, + "advantage_min": -1.0410834476351738, + "advantage_std": 0.9998433589935303, + "completion_length": 1337.0417022705078, + "epoch": 0.3931428571428571, + "grad_norm": 0.33922290802001953, + "kl": 0.013172149658203125, + "lambda_div_used": 0.6, + "learning_rate": 3.4151678419606233e-07, + "loss": 0.0005, + "reward": 0.5709733965341002, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.5709733965341002, + "reward_after_std": 0.7575945109128952, + "reward_before_mean": 1.2816130220890045, + "reward_before_std": 0.6747047891840339, + "reward_change_max": 0.0, + "reward_change_mean": -0.7106395661830902, + "reward_change_min": -1.081616371870041, + "reward_change_std": 0.425860196352005, + "reward_std": 0.7575945146381855, + "rewards/cosine_scaled_reward": 0.203306476585567, + "rewards/format_reward": 0.8750000111758709, + "step": 344 + }, + { + "advantage_max": 1.7418056577444077, + "advantage_mean": -1.800557097864086e-08, + "advantage_min": -0.9639134332537651, + "advantage_std": 0.9998338147997856, + "completion_length": 1639.1041946411133, + "epoch": 0.3942857142857143, + "grad_norm": 0.4215705692768097, + "kl": 0.027713775634765625, + "lambda_div_used": 0.6, + "learning_rate": 3.387377967463493e-07, + "loss": 0.0011, + "reward": 0.3227203474380076, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.3227203474380076, + "reward_after_std": 0.686117697507143, + "reward_before_mean": 0.9131767749786377, + "reward_before_std": 0.6375276725739241, + "reward_change_max": 0.00036709755659103394, + "reward_change_mean": -0.5904564298689365, + "reward_change_min": -0.9719649441540241, + "reward_change_std": 0.3853672593832016, + "reward_std": 0.6861177161335945, + "rewards/cosine_scaled_reward": 0.05033837631344795, + "rewards/format_reward": 0.8125000111758709, + "step": 345 + }, + { + "advantage_max": 1.8243045955896378, + "advantage_mean": -2.8560559584001055e-08, + "advantage_min": -1.0744804069399834, + "advantage_std": 0.9998216480016708, + "completion_length": 1337.2083587646484, + "epoch": 0.3954285714285714, + "grad_norm": 0.289594829082489, + "kl": 0.013706207275390625, + "lambda_div_used": 0.6, + "learning_rate": 3.359691059183761e-07, + "loss": 0.0005, + "reward": 0.27365553192794323, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.27365553192794323, + "reward_after_std": 0.6229892708361149, + "reward_before_mean": 0.8438735082745552, + "reward_before_std": 0.560555960983038, + "reward_change_max": 0.0, + "reward_change_mean": -0.570217976346612, + "reward_change_min": -0.8716960623860359, + "reward_change_std": 0.3414479810744524, + "reward_std": 0.6229892894625664, + "rewards/cosine_scaled_reward": -0.015563266351819038, + "rewards/format_reward": 0.8750000111758709, + "step": 346 + }, + { + "advantage_max": 1.8952476382255554, + "advantage_mean": -9.934107647602275e-09, + "advantage_min": -0.8498244062066078, + "advantage_std": 0.9998175203800201, + "completion_length": 1423.2291946411133, + "epoch": 0.3965714285714286, + "grad_norm": 0.2076168954372406, + "kl": 0.013187408447265625, + "lambda_div_used": 0.6, + "learning_rate": 3.3321084665422803e-07, + "loss": 0.0005, + "reward": 0.11660511128138751, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.11660511128138751, + "reward_after_std": 0.6356964744627476, + "reward_before_mean": 0.5917580462992191, + "reward_before_std": 0.5249120108783245, + "reward_change_max": 0.0, + "reward_change_mean": -0.475152924656868, + "reward_change_min": -0.7603438273072243, + "reward_change_std": 0.27699695713818073, + "reward_std": 0.6356964930891991, + "rewards/cosine_scaled_reward": -0.19370432803407311, + "rewards/format_reward": 0.9791666716337204, + "step": 347 + }, + { + "advantage_max": 1.8410307168960571, + "advantage_mean": -2.0178656634506353e-08, + "advantage_min": -0.9013228639960289, + "advantage_std": 0.9998410418629646, + "completion_length": 1313.1458740234375, + "epoch": 0.3977142857142857, + "grad_norm": 0.3774625360965729, + "kl": 0.018695831298828125, + "lambda_div_used": 0.6, + "learning_rate": 3.3046315338757026e-07, + "loss": 0.0007, + "reward": 0.379140455275774, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.379140455275774, + "reward_after_std": 0.7235062941908836, + "reward_before_mean": 0.9868384040892124, + "reward_before_std": 0.628569420427084, + "reward_change_max": 0.0, + "reward_change_mean": -0.6076979450881481, + "reward_change_min": -1.0189422145485878, + "reward_change_std": 0.375955443829298, + "reward_std": 0.7235063090920448, + "rewards/cosine_scaled_reward": 0.045502522960305214, + "rewards/format_reward": 0.8958333432674408, + "step": 348 + }, + { + "advantage_max": 1.8501978665590286, + "advantage_mean": -9.002785184009099e-09, + "advantage_min": -0.9660811722278595, + "advantage_std": 0.9998482018709183, + "completion_length": 1379.395866394043, + "epoch": 0.39885714285714285, + "grad_norm": 0.35212260484695435, + "kl": 0.018650054931640625, + "lambda_div_used": 0.6, + "learning_rate": 3.2772616003709616e-07, + "loss": 0.0007, + "reward": 0.33767664176411927, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.33767664176411927, + "reward_after_std": 0.8225687071681023, + "reward_before_mean": 0.9022767737042159, + "reward_before_std": 0.7706983331590891, + "reward_change_max": 0.0, + "reward_change_mean": -0.5646001249551773, + "reward_change_min": -0.9951711148023605, + "reward_change_std": 0.3776344805955887, + "reward_std": 0.822568740695715, + "rewards/cosine_scaled_reward": -0.017611628398299217, + "rewards/format_reward": 0.9375000074505806, + "step": 349 + }, + { + "advantage_max": 1.929465413093567, + "advantage_mean": 1.1175871117430347e-08, + "advantage_min": -0.7032685950398445, + "advantage_std": 0.9998827949166298, + "completion_length": 911.0000305175781, + "epoch": 0.4, + "grad_norm": 0.377990186214447, + "kl": 0.007293701171875, + "lambda_div_used": 0.6, + "learning_rate": 3.250000000000001e-07, + "loss": 0.0003, + "reward": 0.399724748916924, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.399724748916924, + "reward_after_std": 1.016660876572132, + "reward_before_mean": 0.9504643231630325, + "reward_before_std": 0.8899853974580765, + "reward_change_max": 0.0, + "reward_change_mean": -0.5507395602762699, + "reward_change_min": -0.9786839932203293, + "reward_change_std": 0.34146598540246487, + "reward_std": 1.0166608802974224, + "rewards/cosine_scaled_reward": -0.0247678579762578, + "rewards/format_reward": 1.0, + "step": 350 + }, + { + "advantage_max": 1.9297864735126495, + "advantage_mean": -1.4901161526914564e-08, + "advantage_min": -0.7317093014717102, + "advantage_std": 0.9998589679598808, + "completion_length": 1339.6667175292969, + "epoch": 0.40114285714285713, + "grad_norm": 0.30346372723579407, + "kl": 0.01879119873046875, + "lambda_div_used": 0.6, + "learning_rate": 3.222848061454764e-07, + "loss": 0.0008, + "reward": 0.3753087054938078, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.3753087054938078, + "reward_after_std": 0.8502420969307423, + "reward_before_mean": 0.945469731464982, + "reward_before_std": 0.710259310901165, + "reward_change_max": 0.0, + "reward_change_mean": -0.5701610464602709, + "reward_change_min": -0.9608272425830364, + "reward_change_std": 0.3396002743393183, + "reward_std": 0.8502420969307423, + "rewards/cosine_scaled_reward": 0.014401533640921116, + "rewards/format_reward": 0.9166666679084301, + "step": 351 + }, + { + "advantage_max": 1.8952407985925674, + "advantage_mean": -5.277494968813912e-09, + "advantage_min": -0.7425141632556915, + "advantage_std": 0.9998616054654121, + "completion_length": 1668.0208587646484, + "epoch": 0.4022857142857143, + "grad_norm": 0.36306384205818176, + "kl": 0.027172088623046875, + "lambda_div_used": 0.6, + "learning_rate": 3.195807108082429e-07, + "loss": 0.0011, + "reward": 0.17020612582564354, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.17020612582564354, + "reward_after_std": 0.9018196687102318, + "reward_before_mean": 0.6187307387590408, + "reward_before_std": 0.8488708660006523, + "reward_change_max": 0.0, + "reward_change_mean": -0.44852462224662304, + "reward_change_min": -0.8682103119790554, + "reward_change_std": 0.32232335302978754, + "reward_std": 0.9018196910619736, + "rewards/cosine_scaled_reward": -0.06563465157523751, + "rewards/format_reward": 0.7500000018626451, + "step": 352 + }, + { + "advantage_max": 1.8312696814537048, + "advantage_mean": -6.829699583654758e-09, + "advantage_min": -0.9065942466259003, + "advantage_std": 0.999829463660717, + "completion_length": 1157.708366394043, + "epoch": 0.4034285714285714, + "grad_norm": 0.3327624797821045, + "kl": 0.010297775268554688, + "lambda_div_used": 0.6, + "learning_rate": 3.168878457820915e-07, + "loss": 0.0004, + "reward": 0.33716506394557655, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.33716506394557655, + "reward_after_std": 0.7595427818596363, + "reward_before_mean": 0.9169227406382561, + "reward_before_std": 0.7094110660254955, + "reward_change_max": 0.0, + "reward_change_mean": -0.5797576904296875, + "reward_change_min": -1.044652320444584, + "reward_change_std": 0.3778637405484915, + "reward_std": 0.7595428004860878, + "rewards/cosine_scaled_reward": -0.031121966429054737, + "rewards/format_reward": 0.9791666716337204, + "step": 353 + }, + { + "advantage_max": 1.874330386519432, + "advantage_mean": -1.1796752963366686e-08, + "advantage_min": -0.8423102647066116, + "advantage_std": 0.9998380094766617, + "completion_length": 1096.2292098999023, + "epoch": 0.4045714285714286, + "grad_norm": 0.23596711456775665, + "kl": 0.01212310791015625, + "lambda_div_used": 0.6, + "learning_rate": 3.142063423134644e-07, + "loss": 0.0005, + "reward": 0.4981326200067997, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.4981326200067997, + "reward_after_std": 0.7721955999732018, + "reward_before_mean": 1.1596158780157566, + "reward_before_std": 0.6328756362199783, + "reward_change_max": 0.0, + "reward_change_mean": -0.6614832170307636, + "reward_change_min": -1.0063703507184982, + "reward_change_std": 0.38712820969522, + "reward_std": 0.772195640951395, + "rewards/cosine_scaled_reward": 0.0902245668694377, + "rewards/format_reward": 0.9791666716337204, + "step": 354 + }, + { + "advantage_max": 1.8732577413320541, + "advantage_mean": -1.4901161637936866e-08, + "advantage_min": -0.8534680753946304, + "advantage_std": 0.9998545199632645, + "completion_length": 1047.4166946411133, + "epoch": 0.4057142857142857, + "grad_norm": 0.29967862367630005, + "kl": 0.013187408447265625, + "lambda_div_used": 0.6, + "learning_rate": 3.115363310950578e-07, + "loss": 0.0005, + "reward": 0.5733014561701566, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.5733014561701566, + "reward_after_std": 0.8431755602359772, + "reward_before_mean": 1.2605398669838905, + "reward_before_std": 0.706612853333354, + "reward_change_max": 0.0, + "reward_change_mean": -0.6872383803129196, + "reward_change_min": -1.055651679635048, + "reward_change_std": 0.41943654976785183, + "reward_std": 0.8431755751371384, + "rewards/cosine_scaled_reward": 0.15110323758563027, + "rewards/format_reward": 0.9583333432674408, + "step": 355 + }, + { + "advantage_max": 1.8735899478197098, + "advantage_mean": -6.014791997799307e-09, + "advantage_min": -0.8832316547632217, + "advantage_std": 0.9998319298028946, + "completion_length": 1665.6667175292969, + "epoch": 0.40685714285714286, + "grad_norm": 0.24277737736701965, + "kl": 0.02996063232421875, + "lambda_div_used": 0.6, + "learning_rate": 3.0887794225945143e-07, + "loss": 0.0012, + "reward": 0.21907207665208261, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.21907207665208261, + "reward_after_std": 0.7408511564135551, + "reward_before_mean": 0.7323421612381935, + "reward_before_std": 0.6679595708847046, + "reward_change_max": 0.002227187156677246, + "reward_change_mean": -0.5132700940594077, + "reward_change_min": -0.8826829344034195, + "reward_change_std": 0.3406843263655901, + "reward_std": 0.7408512085676193, + "rewards/cosine_scaled_reward": -0.050495600793510675, + "rewards/format_reward": 0.8333333414047956, + "step": 356 + }, + { + "advantage_max": 1.8176670521497726, + "advantage_mean": -6.208817904251873e-10, + "advantage_min": -0.9782910048961639, + "advantage_std": 0.9998316466808319, + "completion_length": 1771.520881652832, + "epoch": 0.408, + "grad_norm": 0.3803209960460663, + "kl": 0.026142120361328125, + "lambda_div_used": 0.6, + "learning_rate": 3.062313053727671e-07, + "loss": 0.001, + "reward": 0.07317158195655793, + "reward_advantage_correlation": 0.9999999999999996, + "reward_after_mean": 0.07317158195655793, + "reward_after_std": 0.6787878163158894, + "reward_before_mean": 0.5202360115945339, + "reward_before_std": 0.6421620734035969, + "reward_change_max": 0.0007302388548851013, + "reward_change_mean": -0.44706446304917336, + "reward_change_min": -0.7746949233114719, + "reward_change_std": 0.30314655415713787, + "reward_std": 0.6787878349423409, + "rewards/cosine_scaled_reward": -0.17738200351595879, + "rewards/format_reward": 0.8750000149011612, + "step": 357 + }, + { + "advantage_max": 1.9141297489404678, + "advantage_mean": -1.8005570368018198e-08, + "advantage_min": -0.758477583527565, + "advantage_std": 0.9998798295855522, + "completion_length": 1370.3333587646484, + "epoch": 0.40914285714285714, + "grad_norm": 0.23564986884593964, + "kl": 0.0139007568359375, + "lambda_div_used": 0.6, + "learning_rate": 3.0359654942835247e-07, + "loss": 0.0006, + "reward": 0.49675935972481966, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.49675935972481966, + "reward_after_std": 0.9583783783018589, + "reward_before_mean": 1.1124700075015426, + "reward_before_std": 0.8288324661552906, + "reward_change_max": 0.0, + "reward_change_mean": -0.6157106459140778, + "reward_change_min": -1.02113651111722, + "reward_change_std": 0.37534380331635475, + "reward_std": 0.9583783894777298, + "rewards/cosine_scaled_reward": 0.09790166141465306, + "rewards/format_reward": 0.9166666679084301, + "step": 358 + }, + { + "advantage_max": 1.8960884362459183, + "advantage_mean": -1.4590720853746575e-08, + "advantage_min": -0.8398799225687981, + "advantage_std": 0.9998002797365189, + "completion_length": 902.458366394043, + "epoch": 0.4102857142857143, + "grad_norm": 0.3054303526878357, + "kl": 0.00920867919921875, + "lambda_div_used": 0.6, + "learning_rate": 3.0097380284049523e-07, + "loss": 0.0004, + "reward": 0.3241674543824047, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.3241674543824047, + "reward_after_std": 0.6032501570880413, + "reward_before_mean": 0.9223264642059803, + "reward_before_std": 0.4702158570289612, + "reward_change_max": 0.0, + "reward_change_mean": -0.598159022629261, + "reward_change_min": -0.9134200140833855, + "reward_change_std": 0.33777002803981304, + "reward_std": 0.6032501794397831, + "rewards/cosine_scaled_reward": -0.028420104179531336, + "rewards/format_reward": 0.9791666716337204, + "step": 359 + }, + { + "advantage_max": 1.8649418652057648, + "advantage_mean": -2.250696218286663e-08, + "advantage_min": -0.8667888045310974, + "advantage_std": 0.9998376667499542, + "completion_length": 1136.8333435058594, + "epoch": 0.4114285714285714, + "grad_norm": 0.27571192383766174, + "kl": 0.016468048095703125, + "lambda_div_used": 0.6, + "learning_rate": 2.9836319343816397e-07, + "loss": 0.0007, + "reward": 0.5144574351143092, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.5144574351143092, + "reward_after_std": 0.7366812415421009, + "reward_before_mean": 1.1926300078630447, + "reward_before_std": 0.5972456242889166, + "reward_change_max": 0.0, + "reward_change_mean": -0.6781725883483887, + "reward_change_min": -1.0510307103395462, + "reward_change_std": 0.39387187361717224, + "reward_std": 0.7366812489926815, + "rewards/cosine_scaled_reward": 0.0963149992749095, + "rewards/format_reward": 1.0, + "step": 360 + }, + { + "advantage_max": 1.8617792576551437, + "advantage_mean": -2.1109979042588378e-08, + "advantage_min": -0.8186349608004093, + "advantage_std": 0.9998375251889229, + "completion_length": 1194.4791793823242, + "epoch": 0.4125714285714286, + "grad_norm": 0.3388521075248718, + "kl": 0.01670074462890625, + "lambda_div_used": 0.6, + "learning_rate": 2.9576484845877793e-07, + "loss": 0.0007, + "reward": 0.19046252546831965, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.19046252546831965, + "reward_after_std": 0.756943553686142, + "reward_before_mean": 0.6817769166082144, + "reward_before_std": 0.6761543806642294, + "reward_change_max": 0.0, + "reward_change_mean": -0.4913143888115883, + "reward_change_min": -0.8694675639271736, + "reward_change_std": 0.31585903838276863, + "reward_std": 0.7569435648620129, + "rewards/cosine_scaled_reward": -0.12786155845969915, + "rewards/format_reward": 0.9375000074505806, + "step": 361 + }, + { + "advantage_max": 1.874962106347084, + "advantage_mean": -1.1874363270436561e-08, + "advantage_min": -0.9260348118841648, + "advantage_std": 0.9998194351792336, + "completion_length": 956.8542060852051, + "epoch": 0.4137142857142857, + "grad_norm": 0.3070959448814392, + "kl": 0.012874603271484375, + "lambda_div_used": 0.6, + "learning_rate": 2.931788945420058e-07, + "loss": 0.0005, + "reward": 0.501685687340796, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.501685687340796, + "reward_after_std": 0.6610399819910526, + "reward_before_mean": 1.1869359854608774, + "reward_before_std": 0.49332827515900135, + "reward_change_max": 0.0, + "reward_change_mean": -0.6852503418922424, + "reward_change_min": -0.9486516639590263, + "reward_change_std": 0.3753964975476265, + "reward_std": 0.6610400229692459, + "rewards/cosine_scaled_reward": 0.10388467647135258, + "rewards/format_reward": 0.9791666716337204, + "step": 362 + }, + { + "advantage_max": 1.888390600681305, + "advantage_mean": -4.3461723664428575e-09, + "advantage_min": -0.825883325189352, + "advantage_std": 0.9998545944690704, + "completion_length": 1159.9166946411133, + "epoch": 0.41485714285714287, + "grad_norm": 0.3400725722312927, + "kl": 0.0212554931640625, + "lambda_div_used": 0.6, + "learning_rate": 2.9060545772359305e-07, + "loss": 0.0008, + "reward": 0.6914736162871122, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.6914736162871122, + "reward_after_std": 0.7680247835814953, + "reward_before_mean": 1.4520082499366254, + "reward_before_std": 0.5585946920327842, + "reward_change_max": 0.0, + "reward_change_mean": -0.7605346217751503, + "reward_change_min": -1.129803366959095, + "reward_change_std": 0.43319543078541756, + "reward_std": 0.7680247910320759, + "rewards/cosine_scaled_reward": 0.27808745484799147, + "rewards/format_reward": 0.8958333432674408, + "step": 363 + }, + { + "advantage_max": 1.8743097186088562, + "advantage_mean": -1.3659397946064189e-08, + "advantage_min": -0.964237779378891, + "advantage_std": 0.999806247651577, + "completion_length": 1216.3125305175781, + "epoch": 0.416, + "grad_norm": 0.30530038475990295, + "kl": 0.01313018798828125, + "lambda_div_used": 0.6, + "learning_rate": 2.8804466342921987e-07, + "loss": 0.0005, + "reward": 0.10216320748440921, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.10216320748440921, + "reward_after_std": 0.5930333361029625, + "reward_before_mean": 0.5751078221946955, + "reward_before_std": 0.4904471728950739, + "reward_change_max": 0.0, + "reward_change_mean": -0.47294463589787483, + "reward_change_min": -0.7347556799650192, + "reward_change_std": 0.27974462509155273, + "reward_std": 0.5930333621799946, + "rewards/cosine_scaled_reward": -0.19161276146769524, + "rewards/format_reward": 0.9583333432674408, + "step": 364 + }, + { + "advantage_max": 1.765339434146881, + "advantage_mean": -3.3306690738754696e-16, + "advantage_min": -1.010600470006466, + "advantage_std": 0.9998663514852524, + "completion_length": 2147.541748046875, + "epoch": 0.41714285714285715, + "grad_norm": 0.8073046803474426, + "kl": 0.047985076904296875, + "lambda_div_used": 0.6, + "learning_rate": 2.854966364683872e-07, + "loss": 0.0019, + "reward": 0.03060930408537388, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.03060930408537388, + "reward_after_std": 0.854483887553215, + "reward_before_mean": 0.42119893501512706, + "reward_before_std": 0.90941421687603, + "reward_change_max": 0.0013112276792526245, + "reward_change_mean": -0.39058958552777767, + "reward_change_min": -0.8304200060665607, + "reward_change_std": 0.3401966169476509, + "reward_std": 0.8544838950037956, + "rewards/cosine_scaled_reward": -0.07065055519342422, + "rewards/format_reward": 0.5625000111758709, + "step": 365 + }, + { + "advantage_max": 1.8821345120668411, + "advantage_mean": -2.3671115845225188e-08, + "advantage_min": -0.8438076861202717, + "advantage_std": 0.9998635575175285, + "completion_length": 1135.8541870117188, + "epoch": 0.41828571428571426, + "grad_norm": 0.27431872487068176, + "kl": 0.01039886474609375, + "lambda_div_used": 0.6, + "learning_rate": 2.829615010283344e-07, + "loss": 0.0004, + "reward": 0.5668425522744656, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.5668425522744656, + "reward_after_std": 0.858330711722374, + "reward_before_mean": 1.2461014911532402, + "reward_before_std": 0.7163833295926452, + "reward_change_max": 0.0, + "reward_change_mean": -0.6792589277029037, + "reward_change_min": -1.054423488676548, + "reward_change_std": 0.3998459428548813, + "reward_std": 0.858330748975277, + "rewards/cosine_scaled_reward": 0.1334673846140504, + "rewards/format_reward": 0.9791666716337204, + "step": 366 + }, + { + "advantage_max": 1.827427864074707, + "advantage_mean": -2.2351743123039114e-08, + "advantage_min": -0.9146878495812416, + "advantage_std": 0.9998784735798836, + "completion_length": 1681.7084197998047, + "epoch": 0.41942857142857143, + "grad_norm": 0.31206369400024414, + "kl": 0.02565765380859375, + "lambda_div_used": 0.6, + "learning_rate": 2.8043938066798645e-07, + "loss": 0.001, + "reward": 0.35777226043865085, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.35777226043865085, + "reward_after_std": 0.9370049685239792, + "reward_before_mean": 0.9072601944208145, + "reward_before_std": 0.892977561801672, + "reward_change_max": 0.0, + "reward_change_mean": -0.5494879521429539, + "reward_change_min": -0.9770361036062241, + "reward_change_std": 0.38045726902782917, + "reward_std": 0.9370049983263016, + "rewards/cosine_scaled_reward": 0.04738008719868958, + "rewards/format_reward": 0.8125000186264515, + "step": 367 + }, + { + "advantage_max": 1.8542225509881973, + "advantage_mean": -9.934107758624577e-09, + "advantage_min": -0.9214349538087845, + "advantage_std": 0.9998540431261063, + "completion_length": 2178.3334045410156, + "epoch": 0.4205714285714286, + "grad_norm": 0.5161811113357544, + "kl": 0.04541015625, + "lambda_div_used": 0.6, + "learning_rate": 2.7793039831193133e-07, + "loss": 0.0018, + "reward": 0.2744460329413414, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.2744460329413414, + "reward_after_std": 0.8940634317696095, + "reward_before_mean": 0.7882697209715843, + "reward_before_std": 0.8432548437267542, + "reward_change_max": 0.0, + "reward_change_mean": -0.5138236656785011, + "reward_change_min": -0.9380548447370529, + "reward_change_std": 0.35526866372674704, + "reward_std": 0.8940634839236736, + "rewards/cosine_scaled_reward": 0.019134832313284278, + "rewards/format_reward": 0.7500000111758709, + "step": 368 + }, + { + "advantage_max": 1.8268895745277405, + "advantage_mean": -5.277494830036034e-09, + "advantage_min": -0.9020405560731888, + "advantage_std": 0.9998411163687706, + "completion_length": 1538.3750305175781, + "epoch": 0.4217142857142857, + "grad_norm": 0.5276854038238525, + "kl": 0.029750823974609375, + "lambda_div_used": 0.6, + "learning_rate": 2.7543467624442956e-07, + "loss": 0.0012, + "reward": 0.23423784598708153, + "reward_advantage_correlation": 0.9999999999999997, + "reward_after_mean": 0.23423784598708153, + "reward_after_std": 0.740181528031826, + "reward_before_mean": 0.7611953113228083, + "reward_before_std": 0.7027287185192108, + "reward_change_max": 0.0, + "reward_change_mean": -0.5269574634730816, + "reward_change_min": -0.9228874072432518, + "reward_change_std": 0.35378825664520264, + "reward_std": 0.740181565284729, + "rewards/cosine_scaled_reward": -0.036069024819880724, + "rewards/format_reward": 0.8333333507180214, + "step": 369 + }, + { + "advantage_max": 1.8950076550245285, + "advantage_mean": 6.208812908248262e-10, + "advantage_min": -0.7749098390340805, + "advantage_std": 0.9998090341687202, + "completion_length": 1539.0208892822266, + "epoch": 0.4228571428571429, + "grad_norm": 0.4318285584449768, + "kl": 0.027801513671875, + "lambda_div_used": 0.6, + "learning_rate": 2.729523361034538e-07, + "loss": 0.0011, + "reward": 0.267172476509586, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.267172476509586, + "reward_after_std": 0.5785079337656498, + "reward_before_mean": 0.835631494410336, + "reward_before_std": 0.42543186247348785, + "reward_change_max": 0.0, + "reward_change_mean": -0.5684590209275484, + "reward_change_min": -0.8694842867553234, + "reward_change_std": 0.32806423865258694, + "reward_std": 0.5785079672932625, + "rewards/cosine_scaled_reward": -0.0301009276881814, + "rewards/format_reward": 0.8958333432674408, + "step": 370 + }, + { + "advantage_max": 1.8320594280958176, + "advantage_mean": -3.616636157222075e-08, + "advantage_min": -0.8768892697989941, + "advantage_std": 0.9998387768864632, + "completion_length": 883.2500343322754, + "epoch": 0.424, + "grad_norm": 0.3734484314918518, + "kl": 0.02027130126953125, + "lambda_div_used": 0.6, + "learning_rate": 2.7048349887476037e-07, + "loss": 0.0008, + "reward": 0.432393487659283, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.432393487659283, + "reward_after_std": 0.6572475507855415, + "reward_before_mean": 1.0829788483679295, + "reward_before_std": 0.5263035781681538, + "reward_change_max": 0.0, + "reward_change_mean": -0.6505853645503521, + "reward_change_min": -0.9898973554372787, + "reward_change_std": 0.3860883489251137, + "reward_std": 0.6572475582361221, + "rewards/cosine_scaled_reward": 0.07273940369486809, + "rewards/format_reward": 0.9375000074505806, + "step": 371 + }, + { + "advantage_max": 1.8835410475730896, + "advantage_mean": -7.761022241536963e-09, + "advantage_min": -0.9432180970907211, + "advantage_std": 0.9998204559087753, + "completion_length": 1505.895851135254, + "epoch": 0.42514285714285716, + "grad_norm": 0.3226718008518219, + "kl": 0.02196502685546875, + "lambda_div_used": 0.6, + "learning_rate": 2.6802828488599294e-07, + "loss": 0.0009, + "reward": 0.45580523181706667, + "reward_advantage_correlation": 0.9999999999999996, + "reward_after_mean": 0.45580523181706667, + "reward_after_std": 0.629872802644968, + "reward_before_mean": 1.1213063728064299, + "reward_before_std": 0.4603481814265251, + "reward_change_max": 0.0, + "reward_change_mean": -0.6655011437833309, + "reward_change_min": -0.9581310376524925, + "reward_change_std": 0.37093730084598064, + "reward_std": 0.6298728287220001, + "rewards/cosine_scaled_reward": 0.09190318267792463, + "rewards/format_reward": 0.9375000149011612, + "step": 372 + }, + { + "advantage_max": 1.844840943813324, + "advantage_mean": 3.104408785592483e-09, + "advantage_min": -0.8586374893784523, + "advantage_std": 0.9997843205928802, + "completion_length": 970.4791946411133, + "epoch": 0.42628571428571427, + "grad_norm": 0.3689638078212738, + "kl": 0.0161895751953125, + "lambda_div_used": 0.6, + "learning_rate": 2.655868138008171e-07, + "loss": 0.0006, + "reward": 0.15545231167925522, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.15545231167925522, + "reward_after_std": 0.5446327701210976, + "reward_before_mean": 0.6714023929089308, + "reward_before_std": 0.4520675241947174, + "reward_change_max": 0.0, + "reward_change_mean": -0.5159501116722822, + "reward_change_min": -0.7823366560041904, + "reward_change_std": 0.30494451709091663, + "reward_std": 0.5446327850222588, + "rewards/cosine_scaled_reward": -0.143465468659997, + "rewards/format_reward": 0.9583333358168602, + "step": 373 + }, + { + "advantage_max": 1.8741025775671005, + "advantage_mean": -1.6142925329809543e-08, + "advantage_min": -0.9768885672092438, + "advantage_std": 0.9998114258050919, + "completion_length": 1107.9375228881836, + "epoch": 0.42742857142857144, + "grad_norm": 0.42821618914604187, + "kl": 0.01703643798828125, + "lambda_div_used": 0.6, + "learning_rate": 2.631592046130896e-07, + "loss": 0.0007, + "reward": 0.3539789589121938, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.3539789589121938, + "reward_after_std": 0.5764170140028, + "reward_before_mean": 0.972320668399334, + "reward_before_std": 0.41725558042526245, + "reward_change_max": 0.0, + "reward_change_mean": -0.6183416955173016, + "reward_change_min": -0.8753276914358139, + "reward_change_std": 0.3396179787814617, + "reward_std": 0.5764170251786709, + "rewards/cosine_scaled_reward": 0.0069936420768499374, + "rewards/format_reward": 0.9583333432674408, + "step": 374 + }, + { + "advantage_max": 1.8370700776576996, + "advantage_mean": -3.1044087300813317e-09, + "advantage_min": -0.9200460538268089, + "advantage_std": 0.9998407140374184, + "completion_length": 1740.5416793823242, + "epoch": 0.42857142857142855, + "grad_norm": 0.4497852027416229, + "kl": 0.0552825927734375, + "lambda_div_used": 0.6, + "learning_rate": 2.6074557564105724e-07, + "loss": 0.0022, + "reward": 0.3787726857699454, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.3787726857699454, + "reward_after_std": 0.7555015608668327, + "reward_before_mean": 0.978042071685195, + "reward_before_std": 0.6583366394042969, + "reward_change_max": 0.001976780593395233, + "reward_change_mean": -0.5992693491280079, + "reward_change_min": -0.9710818901658058, + "reward_change_std": 0.3688979558646679, + "reward_std": 0.7555015906691551, + "rewards/cosine_scaled_reward": 0.11402101069688797, + "rewards/format_reward": 0.7500000055879354, + "step": 375 + }, + { + "advantage_max": 1.866427093744278, + "advantage_mean": 0.0, + "advantage_min": -0.9301695749163628, + "advantage_std": 0.9998011514544487, + "completion_length": 1490.479232788086, + "epoch": 0.4297142857142857, + "grad_norm": 0.36968451738357544, + "kl": 0.03911590576171875, + "lambda_div_used": 0.6, + "learning_rate": 2.583460445215911e-07, + "loss": 0.0016, + "reward": 0.2085689200903289, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.2085689200903289, + "reward_after_std": 0.6105855852365494, + "reward_before_mean": 0.7465545944869518, + "reward_before_std": 0.5230113845318556, + "reward_change_max": 0.0, + "reward_change_mean": -0.5379856768995523, + "reward_change_min": -0.8638027310371399, + "reward_change_std": 0.3324153460562229, + "reward_std": 0.6105856001377106, + "rewards/cosine_scaled_reward": -0.07463938370347023, + "rewards/format_reward": 0.8958333395421505, + "step": 376 + }, + { + "advantage_max": 1.8543337881565094, + "advantage_mean": 2.110997954218874e-08, + "advantage_min": -0.9540010169148445, + "advantage_std": 0.9997950345277786, + "completion_length": 1885.8334045410156, + "epoch": 0.4308571428571429, + "grad_norm": 0.45212268829345703, + "kl": 0.0592498779296875, + "lambda_div_used": 0.6, + "learning_rate": 2.5596072820445254e-07, + "loss": 0.0024, + "reward": -0.0285780755802989, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.0285780755802989, + "reward_after_std": 0.687009047716856, + "reward_before_mean": 0.3567437110468745, + "reward_before_std": 0.6372678130865097, + "reward_change_max": 0.0009665042161941528, + "reward_change_mean": -0.3853217884898186, + "reward_change_min": -0.6375828981399536, + "reward_change_std": 0.26585924066603184, + "reward_std": 0.6870090700685978, + "rewards/cosine_scaled_reward": -0.18621148075908422, + "rewards/format_reward": 0.7291666828095913, + "step": 377 + }, + { + "advantage_max": 1.7750205844640732, + "advantage_mean": -2.856055936195645e-08, + "advantage_min": -1.0438815727829933, + "advantage_std": 0.9998446851968765, + "completion_length": 1369.6667003631592, + "epoch": 0.432, + "grad_norm": 0.44347718358039856, + "kl": 0.026195526123046875, + "lambda_div_used": 0.6, + "learning_rate": 2.5358974294659373e-07, + "loss": 0.001, + "reward": 0.45271228021010756, + "reward_advantage_correlation": 0.9999999999999997, + "reward_after_mean": 0.45271228021010756, + "reward_after_std": 0.7842323668301105, + "reward_before_mean": 1.0912157818675041, + "reward_before_std": 0.7611095923930407, + "reward_change_max": 0.0, + "reward_change_mean": -0.6385035067796707, + "reward_change_min": -1.0665529370307922, + "reward_change_std": 0.4174453355371952, + "reward_std": 0.7842323780059814, + "rewards/cosine_scaled_reward": 0.06644121464341879, + "rewards/format_reward": 0.9583333432674408, + "step": 378 + }, + { + "advantage_max": 1.807452067732811, + "advantage_mean": -1.1175871450497255e-08, + "advantage_min": -0.9612009599804878, + "advantage_std": 0.9998452290892601, + "completion_length": 1867.4584197998047, + "epoch": 0.43314285714285716, + "grad_norm": 0.5786564946174622, + "kl": 0.04692649841308594, + "lambda_div_used": 0.6, + "learning_rate": 2.512332043064913e-07, + "loss": 0.0019, + "reward": 0.22903996147215366, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.22903996147215366, + "reward_after_std": 0.7921751923859119, + "reward_before_mean": 0.7403265759348869, + "reward_before_std": 0.7686562575399876, + "reward_change_max": 0.001318424940109253, + "reward_change_mean": -0.5112866014242172, + "reward_change_min": -0.8982795923948288, + "reward_change_std": 0.36374893598258495, + "reward_std": 0.7921752035617828, + "rewards/cosine_scaled_reward": -0.036086732521653175, + "rewards/format_reward": 0.8125000149011612, + "step": 379 + }, + { + "advantage_max": 1.8599117398262024, + "advantage_mean": 4.3461718668424965e-09, + "advantage_min": -0.9834437742829323, + "advantage_std": 0.9998585060238838, + "completion_length": 1566.6041946411133, + "epoch": 0.4342857142857143, + "grad_norm": 0.598343551158905, + "kl": 0.038066864013671875, + "lambda_div_used": 0.6, + "learning_rate": 2.488912271385139e-07, + "loss": 0.0015, + "reward": 0.28783006872981787, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.28783006872981787, + "reward_after_std": 0.8142180219292641, + "reward_before_mean": 0.8213298991322517, + "reward_before_std": 0.7334345877170563, + "reward_change_max": 0.005948290228843689, + "reward_change_mean": -0.533499825745821, + "reward_change_min": -0.8682552352547646, + "reward_change_std": 0.3408501222729683, + "reward_std": 0.8142180517315865, + "rewards/cosine_scaled_reward": -0.01641839649528265, + "rewards/format_reward": 0.854166679084301, + "step": 380 + }, + { + "advantage_max": 1.9078808277845383, + "advantage_mean": -1.4280279792000528e-08, + "advantage_min": -0.8441812470555305, + "advantage_std": 0.999834418296814, + "completion_length": 1700.0000190734863, + "epoch": 0.43542857142857144, + "grad_norm": 0.6566035151481628, + "kl": 0.05217742919921875, + "lambda_div_used": 0.6, + "learning_rate": 2.465639255873246e-07, + "loss": 0.0021, + "reward": 0.044323298148810863, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.044323298148810863, + "reward_after_std": 0.7120475806295872, + "reward_before_mean": 0.455623185262084, + "reward_before_std": 0.6162074208259583, + "reward_change_max": 0.0, + "reward_change_mean": -0.41129989735782146, + "reward_change_min": -0.6884924322366714, + "reward_change_std": 0.25980154797434807, + "reward_std": 0.7120476141571999, + "rewards/cosine_scaled_reward": -0.15760508552193642, + "rewards/format_reward": 0.7708333469927311, + "step": 381 + }, + { + "advantage_max": 1.91932113468647, + "advantage_mean": -6.208817349140361e-09, + "advantage_min": -0.8044084087014198, + "advantage_std": 0.9998442009091377, + "completion_length": 1039.5833778381348, + "epoch": 0.43657142857142855, + "grad_norm": 0.36317184567451477, + "kl": 0.012350082397460938, + "lambda_div_used": 0.6, + "learning_rate": 2.4425141308231765e-07, + "loss": 0.0005, + "reward": 0.20284398877993226, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.20284398877993226, + "reward_after_std": 0.7599193342030048, + "reward_before_mean": 0.6968522891402245, + "reward_before_std": 0.6381727494299412, + "reward_change_max": 0.0, + "reward_change_mean": -0.49400831013917923, + "reward_change_min": -0.8446781784296036, + "reward_change_std": 0.29265458323061466, + "reward_std": 0.7599193751811981, + "rewards/cosine_scaled_reward": -0.13074052496813238, + "rewards/format_reward": 0.9583333358168602, + "step": 382 + }, + { + "advantage_max": 1.8412632197141647, + "advantage_mean": -4.035731332452386e-09, + "advantage_min": -0.9618307873606682, + "advantage_std": 0.9998642280697823, + "completion_length": 1811.7709045410156, + "epoch": 0.4377142857142857, + "grad_norm": 0.5148884057998657, + "kl": 0.07571983337402344, + "lambda_div_used": 0.6, + "learning_rate": 2.4195380233209006e-07, + "loss": 0.003, + "reward": 0.3637051163241267, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.3637051163241267, + "reward_after_std": 0.8799832984805107, + "reward_before_mean": 0.9258610289543867, + "reward_before_std": 0.8425909131765366, + "reward_change_max": 0.0, + "reward_change_mean": -0.5621559210121632, + "reward_change_min": -0.9556906446814537, + "reward_change_std": 0.38166039250791073, + "reward_std": 0.8799833245575428, + "rewards/cosine_scaled_reward": 0.05668050143867731, + "rewards/format_reward": 0.8125000111758709, + "step": 383 + }, + { + "advantage_max": 1.89041306078434, + "advantage_mean": -2.6077033199456423e-08, + "advantage_min": -0.83429766446352, + "advantage_std": 0.9998781979084015, + "completion_length": 1192.3125381469727, + "epoch": 0.43885714285714283, + "grad_norm": 0.3472490608692169, + "kl": 0.0273590087890625, + "lambda_div_used": 0.6, + "learning_rate": 2.3967120531894857e-07, + "loss": 0.0011, + "reward": 0.7900097626261413, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.7900097626261413, + "reward_after_std": 0.9402679018676281, + "reward_before_mean": 1.5731139779090881, + "reward_before_std": 0.7669761152938008, + "reward_change_max": 0.00027222931385040283, + "reward_change_mean": -0.7831042222678661, + "reward_change_min": -1.2078073993325233, + "reward_change_std": 0.46871116384863853, + "reward_std": 0.9402679279446602, + "rewards/cosine_scaled_reward": 0.307390327565372, + "rewards/format_reward": 0.9583333358168602, + "step": 384 + }, + { + "advantage_max": 1.8527950644493103, + "advantage_mean": -1.6142925107764938e-08, + "advantage_min": -0.8780858963727951, + "advantage_std": 0.9998686984181404, + "completion_length": 1546.958381652832, + "epoch": 0.44, + "grad_norm": 0.42252200841903687, + "kl": 0.033329010009765625, + "lambda_div_used": 0.6, + "learning_rate": 2.374037332934512e-07, + "loss": 0.0013, + "reward": 0.3363357661291957, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.3363357661291957, + "reward_after_std": 0.9530099332332611, + "reward_before_mean": 0.8679481521248817, + "reward_before_std": 0.8937208689749241, + "reward_change_max": 0.0, + "reward_change_mean": -0.5316124074161053, + "reward_change_min": -0.9706587940454483, + "reward_change_std": 0.37425510212779045, + "reward_std": 0.9530099704861641, + "rewards/cosine_scaled_reward": 0.006890743970870972, + "rewards/format_reward": 0.854166679084301, + "step": 385 + }, + { + "advantage_max": 1.8089945912361145, + "advantage_mean": -2.980232260973992e-08, + "advantage_min": -0.9716885611414909, + "advantage_std": 0.9998600035905838, + "completion_length": 1505.354232788086, + "epoch": 0.44114285714285717, + "grad_norm": 0.46533486247062683, + "kl": 0.04397773742675781, + "lambda_div_used": 0.6, + "learning_rate": 2.3515149676898552e-07, + "loss": 0.0018, + "reward": 0.563620753120631, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.563620753120631, + "reward_after_std": 0.8097805194556713, + "reward_before_mean": 1.2603773300070316, + "reward_before_std": 0.7409437410533428, + "reward_change_max": 0.0007016360759735107, + "reward_change_mean": -0.6967565380036831, + "reward_change_min": -1.082048561424017, + "reward_change_std": 0.4333069808781147, + "reward_std": 0.8097805455327034, + "rewards/cosine_scaled_reward": 0.18227194994688034, + "rewards/format_reward": 0.8958333395421505, + "step": 386 + }, + { + "advantage_max": 1.8622557073831558, + "advantage_mean": -2.4835267176115394e-09, + "advantage_min": -0.8337085545063019, + "advantage_std": 0.9998394101858139, + "completion_length": 1601.8125610351562, + "epoch": 0.4422857142857143, + "grad_norm": 0.5599687099456787, + "kl": 0.0380706787109375, + "lambda_div_used": 0.6, + "learning_rate": 2.3291460551638237e-07, + "loss": 0.0015, + "reward": 0.24382829433307052, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.24382829433307052, + "reward_after_std": 0.7119114100933075, + "reward_before_mean": 0.7761804088950157, + "reward_before_std": 0.6253592558205128, + "reward_change_max": 0.0, + "reward_change_mean": -0.5323521457612514, + "reward_change_min": -0.9017984233796597, + "reward_change_std": 0.34474920108914375, + "reward_std": 0.7119114361703396, + "rewards/cosine_scaled_reward": -0.028576454147696495, + "rewards/format_reward": 0.8333333395421505, + "step": 387 + }, + { + "advantage_max": 1.8835696578025818, + "advantage_mean": -4.470348546892211e-08, + "advantage_min": -0.8850356787443161, + "advantage_std": 0.9998294040560722, + "completion_length": 1042.2291946411133, + "epoch": 0.44342857142857145, + "grad_norm": 0.33819589018821716, + "kl": 0.0192108154296875, + "lambda_div_used": 0.6, + "learning_rate": 2.306931685585657e-07, + "loss": 0.0008, + "reward": 0.40724813751876354, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.40724813751876354, + "reward_after_std": 0.7526779770851135, + "reward_before_mean": 1.0167050827294588, + "reward_before_std": 0.6314396969974041, + "reward_change_max": 0.0, + "reward_change_mean": -0.6094569526612759, + "reward_change_min": -0.9567462503910065, + "reward_change_std": 0.35661482252180576, + "reward_std": 0.7526780031621456, + "rewards/cosine_scaled_reward": 0.018769189715385437, + "rewards/format_reward": 0.9791666716337204, + "step": 388 + }, + { + "advantage_max": 1.9048434495925903, + "advantage_mean": -2.297262352568552e-08, + "advantage_min": -0.835426326841116, + "advantage_std": 0.9998196735978127, + "completion_length": 1316.6250228881836, + "epoch": 0.44457142857142856, + "grad_norm": 0.38188523054122925, + "kl": 0.02446746826171875, + "lambda_div_used": 0.6, + "learning_rate": 2.2848729416523859e-07, + "loss": 0.001, + "reward": 0.32476662658154964, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.32476662658154964, + "reward_after_std": 0.6799643561244011, + "reward_before_mean": 0.9025517366826534, + "reward_before_std": 0.5456170169636607, + "reward_change_max": 0.0, + "reward_change_mean": -0.5777850933372974, + "reward_change_min": -0.8595723137259483, + "reward_change_std": 0.3257438950240612, + "reward_std": 0.6799643859267235, + "rewards/cosine_scaled_reward": -0.04872416495345533, + "rewards/format_reward": 1.0, + "step": 389 + }, + { + "advantage_max": 1.8400477319955826, + "advantage_mean": -6.829698917520943e-09, + "advantage_min": -0.854744978249073, + "advantage_std": 0.9998657703399658, + "completion_length": 1577.2500457763672, + "epoch": 0.44571428571428573, + "grad_norm": 0.6859716176986694, + "kl": 0.0335693359375, + "lambda_div_used": 0.6, + "learning_rate": 2.2629708984760706e-07, + "loss": 0.0013, + "reward": 0.1753460403997451, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.1753460403997451, + "reward_after_std": 0.871589045971632, + "reward_before_mean": 0.6376499533653259, + "reward_before_std": 0.8287604749202728, + "reward_change_max": 0.0008363723754882812, + "reward_change_mean": -0.46230391412973404, + "reward_change_min": -0.935205452144146, + "reward_change_std": 0.3333991579711437, + "reward_std": 0.8715890794992447, + "rewards/cosine_scaled_reward": -0.07700837170705199, + "rewards/format_reward": 0.791666679084301, + "step": 390 + }, + { + "advantage_max": 1.912625327706337, + "advantage_mean": -2.173086099954702e-09, + "advantage_min": -0.7925033271312714, + "advantage_std": 0.9998733997344971, + "completion_length": 1489.00004196167, + "epoch": 0.44685714285714284, + "grad_norm": 0.44128429889678955, + "kl": 0.06465911865234375, + "lambda_div_used": 0.6, + "learning_rate": 2.2412266235313973e-07, + "loss": 0.0026, + "reward": 0.348428251221776, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.348428251221776, + "reward_after_std": 0.9207301996648312, + "reward_before_mean": 0.8853506334125996, + "reward_before_std": 0.7893150560557842, + "reward_change_max": 0.00031003355979919434, + "reward_change_mean": -0.536922387778759, + "reward_change_min": -0.9096999578177929, + "reward_change_std": 0.33434988744556904, + "reward_std": 0.9207301996648312, + "rewards/cosine_scaled_reward": 0.03642530972138047, + "rewards/format_reward": 0.8125000111758709, + "step": 391 + }, + { + "advantage_max": 1.841011866927147, + "advantage_mean": -4.967053990334591e-09, + "advantage_min": -0.9479911550879478, + "advantage_std": 0.9998376369476318, + "completion_length": 1231.2500381469727, + "epoch": 0.448, + "grad_norm": 0.41627851128578186, + "kl": 0.026950836181640625, + "lambda_div_used": 0.6, + "learning_rate": 2.2196411766036487e-07, + "loss": 0.0011, + "reward": 0.238603868172504, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.238603868172504, + "reward_after_std": 0.7660604529082775, + "reward_before_mean": 0.7568782866001129, + "reward_before_std": 0.7045793607831001, + "reward_change_max": 0.0005902126431465149, + "reward_change_mean": -0.5182744301855564, + "reward_change_min": -0.850563645362854, + "reward_change_std": 0.33581987768411636, + "reward_std": 0.7660604864358902, + "rewards/cosine_scaled_reward": -0.07989420369267464, + "rewards/format_reward": 0.916666679084301, + "step": 392 + }, + { + "advantage_max": 1.850405141711235, + "advantage_mean": -4.967053990334591e-09, + "advantage_min": -0.813638836145401, + "advantage_std": 0.9998711571097374, + "completion_length": 1491.2292175292969, + "epoch": 0.4491428571428571, + "grad_norm": 0.46879175305366516, + "kl": 0.04475212097167969, + "lambda_div_used": 0.6, + "learning_rate": 2.1982156097370557e-07, + "loss": 0.0018, + "reward": 0.22369152214378119, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.22369152214378119, + "reward_after_std": 0.9152826145291328, + "reward_before_mean": 0.6987034790217876, + "reward_before_std": 0.8574752546846867, + "reward_change_max": 0.0006786957383155823, + "reward_change_mean": -0.47501196525990963, + "reward_change_min": -0.8072455041110516, + "reward_change_std": 0.3139333399012685, + "reward_std": 0.9152826257050037, + "rewards/cosine_scaled_reward": -0.06731493026018143, + "rewards/format_reward": 0.833333333954215, + "step": 393 + }, + { + "advantage_max": 1.8343770503997803, + "advantage_mean": 9.313225746154785e-10, + "advantage_min": -1.0088716968894005, + "advantage_std": 0.9998288080096245, + "completion_length": 1652.5625534057617, + "epoch": 0.4502857142857143, + "grad_norm": 0.7100973725318909, + "kl": 0.06395339965820312, + "lambda_div_used": 0.6, + "learning_rate": 2.1769509671835223e-07, + "loss": 0.0026, + "reward": -0.015374501468613744, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.015374501468613744, + "reward_after_std": 0.6524021811783314, + "reward_before_mean": 0.38132119935471565, + "reward_before_std": 0.6267807520925999, + "reward_change_max": 0.005344957113265991, + "reward_change_mean": -0.39669569209218025, + "reward_change_min": -0.7064265944063663, + "reward_change_std": 0.28260448575019836, + "reward_std": 0.6524022221565247, + "rewards/cosine_scaled_reward": -0.18433942459523678, + "rewards/format_reward": 0.7500000186264515, + "step": 394 + }, + { + "advantage_max": 1.9344320595264435, + "advantage_mean": -1.1796753018877837e-08, + "advantage_min": -0.7464034222066402, + "advantage_std": 0.9998317658901215, + "completion_length": 1422.31254196167, + "epoch": 0.4514285714285714, + "grad_norm": 0.5548021197319031, + "kl": 0.04305839538574219, + "lambda_div_used": 0.6, + "learning_rate": 2.1558482853517253e-07, + "loss": 0.0017, + "reward": 0.12881008815020323, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.12881008815020323, + "reward_after_std": 0.7306486591696739, + "reward_before_mean": 0.5857934057712555, + "reward_before_std": 0.6257657716050744, + "reward_change_max": 0.0, + "reward_change_mean": -0.45698332041502, + "reward_change_min": -0.7511992193758488, + "reward_change_std": 0.30648272577673197, + "reward_std": 0.7306486964225769, + "rewards/cosine_scaled_reward": -0.07168664503842592, + "rewards/format_reward": 0.7291666772216558, + "step": 395 + }, + { + "advantage_max": 1.8720027953386307, + "advantage_mean": -1.5522043372850902e-08, + "advantage_min": -0.897967129945755, + "advantage_std": 0.9998262673616409, + "completion_length": 1115.2500305175781, + "epoch": 0.45257142857142857, + "grad_norm": 0.4723523259162903, + "kl": 0.023822784423828125, + "lambda_div_used": 0.6, + "learning_rate": 2.134908592756607e-07, + "loss": 0.001, + "reward": 0.3313878992339596, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.3313878992339596, + "reward_after_std": 0.6471819877624512, + "reward_before_mean": 0.9213565066456795, + "reward_before_std": 0.5130888111889362, + "reward_change_max": 3.838539123535156e-05, + "reward_change_mean": -0.5899685919284821, + "reward_change_min": -0.8697409331798553, + "reward_change_std": 0.3338143788278103, + "reward_std": 0.6471820063889027, + "rewards/cosine_scaled_reward": 0.03359490446746349, + "rewards/format_reward": 0.8541666716337204, + "step": 396 + }, + { + "advantage_max": 1.883902370929718, + "advantage_mean": -3.632158152022669e-08, + "advantage_min": -0.9016060680150986, + "advantage_std": 0.9997219517827034, + "completion_length": 1352.0625228881836, + "epoch": 0.45371428571428574, + "grad_norm": 0.37755143642425537, + "kl": 0.04022216796875, + "lambda_div_used": 0.6, + "learning_rate": 2.1141329099692406e-07, + "loss": 0.0016, + "reward": -0.04198963730596006, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.04198963730596006, + "reward_after_std": 0.5525118494406343, + "reward_before_mean": 0.3588534388691187, + "reward_before_std": 0.45337067916989326, + "reward_change_max": 0.0, + "reward_change_mean": -0.40084310434758663, + "reward_change_min": -0.6106414273381233, + "reward_change_std": 0.22512250347062945, + "reward_std": 0.5525118727236986, + "rewards/cosine_scaled_reward": -0.19557328848168254, + "rewards/format_reward": 0.7500000055879354, + "step": 397 + }, + { + "advantage_max": 1.7940724939107895, + "advantage_mean": 1.8781671828893565e-08, + "advantage_min": -0.9255733340978622, + "advantage_std": 0.9998294785618782, + "completion_length": 1550.7500457763672, + "epoch": 0.45485714285714285, + "grad_norm": 0.37187108397483826, + "kl": 0.07955551147460938, + "lambda_div_used": 0.6, + "learning_rate": 2.0935222495670968e-07, + "loss": 0.0032, + "reward": 0.01694987085647881, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.01694987085647881, + "reward_after_std": 0.7360811270773411, + "reward_before_mean": 0.4198214989155531, + "reward_before_std": 0.7361685782670975, + "reward_change_max": 0.0, + "reward_change_mean": -0.4028716376051307, + "reward_change_min": -0.7846706658601761, + "reward_change_std": 0.30554668232798576, + "reward_std": 0.7360811606049538, + "rewards/cosine_scaled_reward": -0.12342259637080133, + "rewards/format_reward": 0.6666666734963655, + "step": 398 + }, + { + "advantage_max": 1.8436774611473083, + "advantage_mean": -1.0244548376281415e-08, + "advantage_min": -0.8891482055187225, + "advantage_std": 0.9998530372977257, + "completion_length": 1050.0000228881836, + "epoch": 0.456, + "grad_norm": 0.4016158878803253, + "kl": 0.01065826416015625, + "lambda_div_used": 0.6, + "learning_rate": 2.0730776160846853e-07, + "loss": 0.0004, + "reward": 0.5273470878601074, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.5273470878601074, + "reward_after_std": 0.960618756711483, + "reward_before_mean": 1.1682646535336971, + "reward_before_std": 0.8969470211304724, + "reward_change_max": 0.0, + "reward_change_mean": -0.6409175284206867, + "reward_change_min": -1.093371707946062, + "reward_change_std": 0.41930639930069447, + "reward_std": 0.9606187716126442, + "rewards/cosine_scaled_reward": 0.08413228066638112, + "rewards/format_reward": 1.0, + "step": 399 + }, + { + "advantage_max": 1.8732015937566757, + "advantage_mean": -2.3593505593666464e-08, + "advantage_min": -0.9000465795397758, + "advantage_std": 0.9998682737350464, + "completion_length": 1030.270881652832, + "epoch": 0.45714285714285713, + "grad_norm": 0.6417791843414307, + "kl": 0.0258026123046875, + "lambda_div_used": 0.6, + "learning_rate": 2.0528000059645995e-07, + "loss": 0.001, + "reward": 0.6478320201858878, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.6478320201858878, + "reward_after_std": 0.9242336377501488, + "reward_before_mean": 1.3621239997446537, + "reward_before_std": 0.8079969696700573, + "reward_change_max": 0.0, + "reward_change_mean": -0.7142920345067978, + "reward_change_min": -1.1032118797302246, + "reward_change_std": 0.4413549527525902, + "reward_std": 0.9242336414754391, + "rewards/cosine_scaled_reward": 0.20189532358199358, + "rewards/format_reward": 0.9583333358168602, + "step": 400 + }, + { + "advantage_max": 1.8797823637723923, + "advantage_mean": -2.359350581571107e-08, + "advantage_min": -0.8693061619997025, + "advantage_std": 0.9998289421200752, + "completion_length": 1648.9792022705078, + "epoch": 0.4582857142857143, + "grad_norm": 0.8710258603096008, + "kl": 0.07901382446289062, + "lambda_div_used": 0.6, + "learning_rate": 2.032690407508949e-07, + "loss": 0.0032, + "reward": 0.18920016940683126, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.18920016940683126, + "reward_after_std": 0.6232438683509827, + "reward_before_mean": 0.7045781388878822, + "reward_before_std": 0.500193178653717, + "reward_change_max": 0.001879081130027771, + "reward_change_mean": -0.515377989038825, + "reward_change_min": -0.8118748292326927, + "reward_change_std": 0.30811250768601894, + "reward_std": 0.6232438869774342, + "rewards/cosine_scaled_reward": -0.043544284999370575, + "rewards/format_reward": 0.7916666753590107, + "step": 401 + }, + { + "advantage_max": 1.8284994959831238, + "advantage_mean": -4.4703484247676784e-08, + "advantage_min": -0.9610132426023483, + "advantage_std": 0.9998360648751259, + "completion_length": 1549.2500534057617, + "epoch": 0.4594285714285714, + "grad_norm": 0.7397347092628479, + "kl": 0.09239578247070312, + "lambda_div_used": 0.6, + "learning_rate": 2.0127498008311922e-07, + "loss": 0.0037, + "reward": 0.2838644115254283, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.2838644115254283, + "reward_after_std": 0.6427106708288193, + "reward_before_mean": 0.8501286339014769, + "reward_before_std": 0.561686310917139, + "reward_change_max": 0.0, + "reward_change_mean": -0.5662642568349838, + "reward_change_min": -0.9427123293280602, + "reward_change_std": 0.3427322842180729, + "reward_std": 0.6427106931805611, + "rewards/cosine_scaled_reward": 0.039647649973630905, + "rewards/format_reward": 0.7708333544433117, + "step": 402 + }, + { + "advantage_max": 1.9294809103012085, + "advantage_mean": 6.208817349140361e-09, + "advantage_min": -0.7606799304485321, + "advantage_std": 0.9998130202293396, + "completion_length": 1098.4167022705078, + "epoch": 0.4605714285714286, + "grad_norm": 0.7467202544212341, + "kl": 0.0480804443359375, + "lambda_div_used": 0.6, + "learning_rate": 1.9929791578083655e-07, + "loss": 0.0019, + "reward": 0.3443166771903634, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.3443166771903634, + "reward_after_std": 0.5857836827635765, + "reward_before_mean": 0.9529151804745197, + "reward_before_std": 0.40462041553109884, + "reward_change_max": 0.0, + "reward_change_mean": -0.6085985079407692, + "reward_change_min": -0.8830667324364185, + "reward_change_std": 0.3298831805586815, + "reward_std": 0.5857837088406086, + "rewards/cosine_scaled_reward": 0.05979091301560402, + "rewards/format_reward": 0.8333333414047956, + "step": 403 + }, + { + "advantage_max": 1.8348060548305511, + "advantage_mean": -2.0799537647775423e-08, + "advantage_min": -0.9691257327795029, + "advantage_std": 0.999817743897438, + "completion_length": 1306.0416946411133, + "epoch": 0.4617142857142857, + "grad_norm": 0.43737542629241943, + "kl": 0.037811279296875, + "lambda_div_used": 0.6, + "learning_rate": 1.9733794420337213e-07, + "loss": 0.0015, + "reward": 0.34468303504399955, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.34468303504399955, + "reward_after_std": 0.5768958441913128, + "reward_before_mean": 0.9605327546596527, + "reward_before_std": 0.44061626866459846, + "reward_change_max": 0.0020405128598213196, + "reward_change_mean": -0.6158497631549835, + "reward_change_min": -0.9091715142130852, + "reward_change_std": 0.3500679489225149, + "reward_std": 0.576895859092474, + "rewards/cosine_scaled_reward": 0.0010997112840414047, + "rewards/format_reward": 0.9583333432674408, + "step": 404 + }, + { + "advantage_max": 1.8543145060539246, + "advantage_mean": -3.1664968203060084e-08, + "advantage_min": -0.898287508636713, + "advantage_std": 0.9998418316245079, + "completion_length": 1262.7500343322754, + "epoch": 0.46285714285714286, + "grad_norm": 0.570946991443634, + "kl": 0.04802703857421875, + "lambda_div_used": 0.6, + "learning_rate": 1.9539516087697517e-07, + "loss": 0.0019, + "reward": 0.6255162106826901, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.6255162106826901, + "reward_after_std": 0.7487674653530121, + "reward_before_mean": 1.3644494786858559, + "reward_before_std": 0.5858640410006046, + "reward_change_max": 0.0, + "reward_change_mean": -0.7389333806931973, + "reward_change_min": -1.0935213565826416, + "reward_change_std": 0.43595005199313164, + "reward_std": 0.7487674877047539, + "rewards/cosine_scaled_reward": 0.22389142867177725, + "rewards/format_reward": 0.916666679084301, + "step": 405 + }, + { + "advantage_max": 1.8656399846076965, + "advantage_mean": -9.934107980669182e-09, + "advantage_min": -0.9580504596233368, + "advantage_std": 0.9998698681592941, + "completion_length": 1351.3542098999023, + "epoch": 0.464, + "grad_norm": 0.30655261874198914, + "kl": 0.04695701599121094, + "lambda_div_used": 0.6, + "learning_rate": 1.934696604901642e-07, + "loss": 0.0019, + "reward": 0.5224170899018645, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.5224170899018645, + "reward_after_std": 0.9001653417944908, + "reward_before_mean": 1.1674301028251648, + "reward_before_std": 0.7968062944710255, + "reward_change_max": 0.0, + "reward_change_mean": -0.6450129598379135, + "reward_change_min": -1.0377557501196861, + "reward_change_std": 0.3952809479087591, + "reward_std": 0.9001653641462326, + "rewards/cosine_scaled_reward": 0.10454833297990263, + "rewards/format_reward": 0.9583333358168602, + "step": 406 + }, + { + "advantage_max": 1.8426674157381058, + "advantage_mean": 2.3283066585833012e-09, + "advantage_min": -0.8966666460037231, + "advantage_std": 0.9998308792710304, + "completion_length": 1636.6250305175781, + "epoch": 0.46514285714285714, + "grad_norm": 0.524170994758606, + "kl": 0.07071685791015625, + "lambda_div_used": 0.6, + "learning_rate": 1.915615368891117e-07, + "loss": 0.0028, + "reward": 0.2841284740716219, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.2841284740716219, + "reward_after_std": 0.7291165739297867, + "reward_before_mean": 0.8379490524530411, + "reward_before_std": 0.6585894171148539, + "reward_change_max": 0.0015131905674934387, + "reward_change_mean": -0.5538205932825804, + "reward_change_min": -0.9359328970313072, + "reward_change_std": 0.3618352375924587, + "reward_std": 0.7291165962815285, + "rewards/cosine_scaled_reward": -0.008108798414468765, + "rewards/format_reward": 0.8541666753590107, + "step": 407 + }, + { + "advantage_max": 1.8449404537677765, + "advantage_mean": -2.1109978542988017e-08, + "advantage_min": -0.8859404623508453, + "advantage_std": 0.9998802319169044, + "completion_length": 1421.2083854675293, + "epoch": 0.4662857142857143, + "grad_norm": 0.5737088322639465, + "kl": 0.049816131591796875, + "lambda_div_used": 0.6, + "learning_rate": 1.8967088307307e-07, + "loss": 0.002, + "reward": 0.5460685240104795, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.5460685240104795, + "reward_after_std": 0.9580986090004444, + "reward_before_mean": 1.1958689044695348, + "reward_before_std": 0.8734465874731541, + "reward_change_max": 0.0, + "reward_change_mean": -0.6498003713786602, + "reward_change_min": -1.1185031943023205, + "reward_change_std": 0.413064856082201, + "reward_std": 0.9580986239016056, + "rewards/cosine_scaled_reward": 0.1708511160686612, + "rewards/format_reward": 0.8541666697710752, + "step": 408 + }, + { + "advantage_max": 1.8438959568738937, + "advantage_mean": -7.450580818968433e-09, + "advantage_min": -1.0017603039741516, + "advantage_std": 0.9998411983251572, + "completion_length": 1914.6875457763672, + "epoch": 0.4674285714285714, + "grad_norm": 0.48147112131118774, + "kl": 0.08876800537109375, + "lambda_div_used": 0.6, + "learning_rate": 1.8779779118983867e-07, + "loss": 0.0036, + "reward": 0.13925567595288157, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.13925567595288157, + "reward_after_std": 0.7202729880809784, + "reward_before_mean": 0.6109248884022236, + "reward_before_std": 0.6696692258119583, + "reward_change_max": 0.0009178891777992249, + "reward_change_mean": -0.4716692119836807, + "reward_change_min": -0.7645023390650749, + "reward_change_std": 0.306436350569129, + "reward_std": 0.720272995531559, + "rewards/cosine_scaled_reward": -0.09037090092897415, + "rewards/format_reward": 0.7916666828095913, + "step": 409 + }, + { + "advantage_max": 1.7967534363269806, + "advantage_mean": -4.656615093523442e-10, + "advantage_min": -0.9627714604139328, + "advantage_std": 0.9998459294438362, + "completion_length": 1528.6667137145996, + "epoch": 0.4685714285714286, + "grad_norm": 0.7331544756889343, + "kl": 0.0760650634765625, + "lambda_div_used": 0.6, + "learning_rate": 1.8594235253127372e-07, + "loss": 0.003, + "reward": 0.42353655165061355, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.42353655165061355, + "reward_after_std": 0.7990189306437969, + "reward_before_mean": 1.042414367198944, + "reward_before_std": 0.7414693534374237, + "reward_change_max": 0.00046183913946151733, + "reward_change_mean": -0.6188778132200241, + "reward_change_min": -1.0362925231456757, + "reward_change_std": 0.405984902754426, + "reward_std": 0.7990189418196678, + "rewards/cosine_scaled_reward": 0.05245716869831085, + "rewards/format_reward": 0.9375000149011612, + "step": 410 + }, + { + "advantage_max": 1.8948615789413452, + "advantage_mean": 1.2417634254191512e-08, + "advantage_min": -0.7667038217186928, + "advantage_std": 0.999861590564251, + "completion_length": 1889.2708854675293, + "epoch": 0.4697142857142857, + "grad_norm": 0.6439375877380371, + "kl": 0.09022903442382812, + "lambda_div_used": 0.6, + "learning_rate": 1.8410465752883758e-07, + "loss": 0.0036, + "reward": 0.07135952671524137, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.07135952671524137, + "reward_after_std": 0.8766655959188938, + "reward_before_mean": 0.47151265665888786, + "reward_before_std": 0.8228470720350742, + "reward_change_max": 0.0, + "reward_change_mean": -0.40015316288918257, + "reward_change_min": -0.7908356711268425, + "reward_change_std": 0.29624347295612097, + "reward_std": 0.8766656406223774, + "rewards/cosine_scaled_reward": -0.12882700935006142, + "rewards/format_reward": 0.7291666679084301, + "step": 411 + }, + { + "advantage_max": 1.9090563207864761, + "advantage_mean": -2.6697914767837005e-08, + "advantage_min": -0.7822853326797485, + "advantage_std": 0.9998724535107613, + "completion_length": 1202.1875381469727, + "epoch": 0.47085714285714286, + "grad_norm": 0.670369565486908, + "kl": 0.04871368408203125, + "lambda_div_used": 0.6, + "learning_rate": 1.822847957491922e-07, + "loss": 0.0019, + "reward": 0.22458704718155786, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.22458704718155786, + "reward_after_std": 0.908806387335062, + "reward_before_mean": 0.6955942697823048, + "reward_before_std": 0.8164355047047138, + "reward_change_max": 0.0008294880390167236, + "reward_change_mean": -0.471007265150547, + "reward_change_min": -0.9000264182686806, + "reward_change_std": 0.31314732879400253, + "reward_std": 0.9088063985109329, + "rewards/cosine_scaled_reward": -0.11053620371967554, + "rewards/format_reward": 0.916666679084301, + "step": 412 + }, + { + "advantage_max": 1.8098042160272598, + "advantage_mean": 1.2417633588057697e-09, + "advantage_min": -0.9544949308037758, + "advantage_std": 0.9998386353254318, + "completion_length": 1417.1042137145996, + "epoch": 0.472, + "grad_norm": 0.3730092942714691, + "kl": 0.0497283935546875, + "lambda_div_used": 0.6, + "learning_rate": 1.804828558898332e-07, + "loss": 0.002, + "reward": 0.2984967448282987, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.2984967448282987, + "reward_after_std": 0.7107267417013645, + "reward_before_mean": 0.8657870907336473, + "reward_before_std": 0.6707973293960094, + "reward_change_max": 0.0, + "reward_change_mean": -0.5672903321683407, + "reward_change_min": -0.9352284893393517, + "reward_change_std": 0.36285562813282013, + "reward_std": 0.7107267566025257, + "rewards/cosine_scaled_reward": -0.004606468603014946, + "rewards/format_reward": 0.8750000037252903, + "step": 413 + }, + { + "advantage_max": 1.858881652355194, + "advantage_mean": 5.587935225648266e-09, + "advantage_min": -0.8481370061635971, + "advantage_std": 0.9998633489012718, + "completion_length": 1995.2500457763672, + "epoch": 0.47314285714285714, + "grad_norm": 0.3799186944961548, + "kl": 0.0695343017578125, + "lambda_div_used": 0.6, + "learning_rate": 1.7869892577476722e-07, + "loss": 0.0028, + "reward": 0.12761934008449316, + "reward_advantage_correlation": 0.9999999999999997, + "reward_after_mean": 0.12761934008449316, + "reward_after_std": 0.8530314080417156, + "reward_before_mean": 0.5639428496360779, + "reward_before_std": 0.7966863811016083, + "reward_change_max": 0.0, + "reward_change_mean": -0.43632352352142334, + "reward_change_min": -0.7810782827436924, + "reward_change_std": 0.29841686226427555, + "reward_std": 0.853031437844038, + "rewards/cosine_scaled_reward": -0.10344524041283876, + "rewards/format_reward": 0.7708333414047956, + "step": 414 + }, + { + "advantage_max": 1.806399554014206, + "advantage_mean": -1.6142924996742636e-08, + "advantage_min": -1.0417326241731644, + "advantage_std": 0.9998660162091255, + "completion_length": 1612.1667022705078, + "epoch": 0.4742857142857143, + "grad_norm": 1.021669864654541, + "kl": 0.0842742919921875, + "lambda_div_used": 0.6, + "learning_rate": 1.7693309235023127e-07, + "loss": 0.0034, + "reward": 0.1636495697312057, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.1636495697312057, + "reward_after_std": 0.8222077488899231, + "reward_before_mean": 0.6326027903705835, + "reward_before_std": 0.801915667951107, + "reward_change_max": 0.0015484318137168884, + "reward_change_mean": -0.4689532145857811, + "reward_change_min": -0.8480212837457657, + "reward_change_std": 0.3447183482348919, + "reward_std": 0.8222077712416649, + "rewards/cosine_scaled_reward": -0.0795319527387619, + "rewards/format_reward": 0.7916666865348816, + "step": 415 + }, + { + "advantage_max": 1.946450263261795, + "advantage_mean": 4.967053879312289e-09, + "advantage_min": -0.6989743858575821, + "advantage_std": 0.999872125685215, + "completion_length": 1185.583381652832, + "epoch": 0.4754285714285714, + "grad_norm": 0.5290063619613647, + "kl": 0.032833099365234375, + "lambda_div_used": 0.6, + "learning_rate": 1.7518544168045524e-07, + "loss": 0.0013, + "reward": 0.4655949706211686, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.4655949706211686, + "reward_after_std": 0.915255781263113, + "reward_before_mean": 1.0712868720293045, + "reward_before_std": 0.7395226247608662, + "reward_change_max": 0.0007962062954902649, + "reward_change_mean": -0.6056919172406197, + "reward_change_min": -1.005474604666233, + "reward_change_std": 0.36948048509657383, + "reward_std": 0.9152557961642742, + "rewards/cosine_scaled_reward": 0.0668934234417975, + "rewards/format_reward": 0.9375000074505806, + "step": 416 + }, + { + "advantage_max": 1.818949431180954, + "advantage_mean": -1.3659397946064189e-08, + "advantage_min": -1.0677797496318817, + "advantage_std": 0.9998498633503914, + "completion_length": 1563.0000228881836, + "epoch": 0.4765714285714286, + "grad_norm": 0.9102173447608948, + "kl": 0.0742950439453125, + "lambda_div_used": 0.6, + "learning_rate": 1.7345605894346726e-07, + "loss": 0.003, + "reward": 0.1907540822867304, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.1907540822867304, + "reward_after_std": 0.7381607219576836, + "reward_before_mean": 0.6873529492877424, + "reward_before_std": 0.675433199852705, + "reward_change_max": 0.0027615725994110107, + "reward_change_mean": -0.49659889191389084, + "reward_change_min": -0.8365379236638546, + "reward_change_std": 0.32951972633600235, + "reward_std": 0.7381607331335545, + "rewards/cosine_scaled_reward": -0.052156862802803516, + "rewards/format_reward": 0.7916666828095913, + "step": 417 + }, + { + "advantage_max": 1.9242968559265137, + "advantage_mean": -5.091230215192866e-08, + "advantage_min": -0.7926112860441208, + "advantage_std": 0.9998694732785225, + "completion_length": 1250.062557220459, + "epoch": 0.4777142857142857, + "grad_norm": 0.781521201133728, + "kl": 0.0989990234375, + "lambda_div_used": 0.6, + "learning_rate": 1.7174502842694212e-07, + "loss": 0.004, + "reward": 0.7606905307620764, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.7606905307620764, + "reward_after_std": 0.8790313825011253, + "reward_before_mean": 1.541863800957799, + "reward_before_std": 0.6521397422766313, + "reward_change_max": 0.0, + "reward_change_mean": -0.7811733037233353, + "reward_change_min": -1.084649033844471, + "reward_change_std": 0.4253282528370619, + "reward_std": 0.8790314197540283, + "rewards/cosine_scaled_reward": 0.30218187253922224, + "rewards/format_reward": 0.9375000074505806, + "step": 418 + }, + { + "advantage_max": 1.8819816559553146, + "advantage_mean": -3.725290076417309e-09, + "advantage_min": -0.7932135835289955, + "advantage_std": 0.9998789504170418, + "completion_length": 1468.895881652832, + "epoch": 0.47885714285714287, + "grad_norm": 0.7229182720184326, + "kl": 0.0759429931640625, + "lambda_div_used": 0.6, + "learning_rate": 1.7005243352409333e-07, + "loss": 0.003, + "reward": 0.35275646578520536, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.35275646578520536, + "reward_after_std": 0.9550173804163933, + "reward_before_mean": 0.8900185525417328, + "reward_before_std": 0.8626913130283356, + "reward_change_max": 0.0007177069783210754, + "reward_change_mean": -0.5372620560228825, + "reward_change_min": -0.9783233143389225, + "reward_change_std": 0.3662260863929987, + "reward_std": 0.9550173878669739, + "rewards/cosine_scaled_reward": 0.0595925732050091, + "rewards/format_reward": 0.7708333414047956, + "step": 419 + }, + { + "advantage_max": 1.8907837122678757, + "advantage_mean": -1.5832484323574647e-08, + "advantage_min": -0.8471547365188599, + "advantage_std": 0.9998333230614662, + "completion_length": 858.0625228881836, + "epoch": 0.48, + "grad_norm": 0.427143931388855, + "kl": 0.016613006591796875, + "lambda_div_used": 0.6, + "learning_rate": 1.6837835672960831e-07, + "loss": 0.0007, + "reward": 0.3101103331428021, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.3101103331428021, + "reward_after_std": 0.6953030377626419, + "reward_before_mean": 0.8771585412323475, + "reward_before_std": 0.5664926655590534, + "reward_change_max": 0.0, + "reward_change_mean": -0.5670481845736504, + "reward_change_min": -0.9017394334077835, + "reward_change_std": 0.32143189013004303, + "reward_std": 0.6953030750155449, + "rewards/cosine_scaled_reward": -0.05100408475846052, + "rewards/format_reward": 0.9791666716337204, + "step": 420 + }, + { + "advantage_max": 1.926459640264511, + "advantage_mean": 2.220446049250313e-16, + "advantage_min": -0.8092224150896072, + "advantage_std": 0.9998452663421631, + "completion_length": 1315.7292137145996, + "epoch": 0.48114285714285715, + "grad_norm": 0.4144383668899536, + "kl": 0.064453125, + "lambda_div_used": 0.6, + "learning_rate": 1.6672287963562852e-07, + "loss": 0.0026, + "reward": 0.2203623978421092, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.2203623978421092, + "reward_after_std": 0.8387158066034317, + "reward_before_mean": 0.7068217769265175, + "reward_before_std": 0.7188771069049835, + "reward_change_max": 0.0, + "reward_change_mean": -0.4864593520760536, + "reward_change_min": -0.8118696585297585, + "reward_change_std": 0.30111537501215935, + "reward_std": 0.8387158252298832, + "rewards/cosine_scaled_reward": -0.10492247063666582, + "rewards/format_reward": 0.9166666716337204, + "step": 421 + }, + { + "advantage_max": 1.8332590609788895, + "advantage_mean": 1.2417635808503746e-09, + "advantage_min": -0.9279729872941971, + "advantage_std": 0.999835379421711, + "completion_length": 1939.7500610351562, + "epoch": 0.48228571428571426, + "grad_norm": 0.7433092594146729, + "kl": 0.13888168334960938, + "lambda_div_used": 0.6, + "learning_rate": 1.6508608292777203e-07, + "loss": 0.0056, + "reward": 0.1761605131905526, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.1761605131905526, + "reward_after_std": 0.7036119475960732, + "reward_before_mean": 0.6759131057187915, + "reward_before_std": 0.640310924500227, + "reward_change_max": 0.0007759109139442444, + "reward_change_mean": -0.4997525941580534, + "reward_change_min": -0.8166663274168968, + "reward_change_std": 0.31987714022397995, + "reward_std": 0.7036119699478149, + "rewards/cosine_scaled_reward": -0.04746011132374406, + "rewards/format_reward": 0.7708333488553762, + "step": 422 + }, + { + "advantage_max": 1.9029418379068375, + "advantage_mean": 2.7939678182153926e-08, + "advantage_min": -0.7431788854300976, + "advantage_std": 0.9998583421111107, + "completion_length": 1889.770866394043, + "epoch": 0.48342857142857143, + "grad_norm": 0.4884932339191437, + "kl": 0.11093902587890625, + "lambda_div_used": 0.6, + "learning_rate": 1.6346804638120098e-07, + "loss": 0.0044, + "reward": 0.06853078509448096, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.06853078509448096, + "reward_after_std": 0.8814452588558197, + "reward_before_mean": 0.4631076133809984, + "reward_before_std": 0.8424768298864365, + "reward_change_max": 0.000535815954208374, + "reward_change_mean": -0.39457682403735816, + "reward_change_min": -0.7278733029961586, + "reward_change_std": 0.280312453629449, + "reward_std": 0.88144526258111, + "rewards/cosine_scaled_reward": -0.13302953727543354, + "rewards/format_reward": 0.7291666734963655, + "step": 423 + }, + { + "advantage_max": 1.8147494047880173, + "advantage_mean": 0.0, + "advantage_min": -0.9146339148283005, + "advantage_std": 0.9998487681150436, + "completion_length": 1910.6458740234375, + "epoch": 0.4845714285714286, + "grad_norm": 0.826490581035614, + "kl": 0.11475372314453125, + "lambda_div_used": 0.6, + "learning_rate": 1.6186884885673413e-07, + "loss": 0.0046, + "reward": 0.09315519593656063, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.09315519593656063, + "reward_after_std": 0.744984857738018, + "reward_before_mean": 0.5344462972134352, + "reward_before_std": 0.7094641253352165, + "reward_change_max": 0.0005524009466171265, + "reward_change_mean": -0.44129109010100365, + "reward_change_min": -0.8307461738586426, + "reward_change_std": 0.3057961128652096, + "reward_std": 0.744984857738018, + "rewards/cosine_scaled_reward": -0.08694352209568024, + "rewards/format_reward": 0.7083333432674408, + "step": 424 + }, + { + "advantage_max": 1.8650247901678085, + "advantage_mean": -4.2840839542535036e-08, + "advantage_min": -0.910727221518755, + "advantage_std": 0.9998797848820686, + "completion_length": 1219.7708702087402, + "epoch": 0.4857142857142857, + "grad_norm": 0.44416239857673645, + "kl": 0.05071258544921875, + "lambda_div_used": 0.6, + "learning_rate": 1.6028856829700258e-07, + "loss": 0.002, + "reward": 0.8080948491115123, + "reward_advantage_correlation": 0.9999999999999997, + "reward_after_mean": 0.8080948491115123, + "reward_after_std": 0.9254983216524124, + "reward_before_mean": 1.611850606277585, + "reward_before_std": 0.7530386643484235, + "reward_change_max": 7.234513759613037e-06, + "reward_change_mean": -0.8037557974457741, + "reward_change_min": -1.2168358340859413, + "reward_change_std": 0.4892400950193405, + "reward_std": 0.925498329102993, + "rewards/cosine_scaled_reward": 0.3475919794291258, + "rewards/format_reward": 0.9166666716337204, + "step": 425 + }, + { + "advantage_max": 1.7878179401159286, + "advantage_mean": 2.7318796169684134e-08, + "advantage_min": -1.0395925492048264, + "advantage_std": 0.9997829124331474, + "completion_length": 1821.645866394043, + "epoch": 0.4868571428571429, + "grad_norm": 0.6422761678695679, + "kl": 0.17049407958984375, + "lambda_div_used": 0.6, + "learning_rate": 1.5872728172265146e-07, + "loss": 0.0068, + "reward": 0.07293279469013214, + "reward_advantage_correlation": 0.9999999999999996, + "reward_after_mean": 0.07293279469013214, + "reward_after_std": 0.5741061456501484, + "reward_before_mean": 0.5401615208247676, + "reward_before_std": 0.5329846888780594, + "reward_change_max": 0.009062647819519043, + "reward_change_mean": -0.46722870878875256, + "reward_change_min": -0.7904520779848099, + "reward_change_std": 0.30266605969518423, + "reward_std": 0.5741061493754387, + "rewards/cosine_scaled_reward": -0.07366926036775112, + "rewards/format_reward": 0.6875000074505806, + "step": 426 + }, + { + "advantage_max": 1.8344089090824127, + "advantage_mean": 1.5987704937714398e-08, + "advantage_min": -0.8490063548088074, + "advantage_std": 0.9998742416501045, + "completion_length": 1730.166732788086, + "epoch": 0.488, + "grad_norm": 0.8043573498725891, + "kl": 0.07287216186523438, + "lambda_div_used": 0.6, + "learning_rate": 1.5718506522858572e-07, + "loss": 0.0029, + "reward": 0.2885350910946727, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.2885350910946727, + "reward_after_std": 0.9019887298345566, + "reward_before_mean": 0.8062578393146396, + "reward_before_std": 0.8632199503481388, + "reward_change_max": 0.0, + "reward_change_mean": -0.5177227295935154, + "reward_change_min": -0.9653673470020294, + "reward_change_std": 0.3626882489770651, + "reward_std": 0.9019887447357178, + "rewards/cosine_scaled_reward": 0.017712251748889685, + "rewards/format_reward": 0.7708333358168602, + "step": 427 + }, + { + "advantage_max": 1.918458417057991, + "advantage_mean": 1.2262414250674425e-08, + "advantage_min": -0.7912456393241882, + "advantage_std": 0.9998344704508781, + "completion_length": 1410.6042175292969, + "epoch": 0.48914285714285716, + "grad_norm": 0.46534183621406555, + "kl": 0.03601837158203125, + "lambda_div_used": 0.6, + "learning_rate": 1.5566199398026147e-07, + "loss": 0.0014, + "reward": 0.03753029089421034, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.03753029089421034, + "reward_after_std": 0.734732910990715, + "reward_before_mean": 0.4424444120377302, + "reward_before_std": 0.629564180970192, + "reward_change_max": 0.0, + "reward_change_mean": -0.40491411834955215, + "reward_change_min": -0.6768765859305859, + "reward_change_std": 0.24197638221085072, + "reward_std": 0.7347329407930374, + "rewards/cosine_scaled_reward": -0.21627781761344522, + "rewards/format_reward": 0.8750000111758709, + "step": 428 + }, + { + "advantage_max": 1.879625290632248, + "advantage_mean": -1.6142924996742636e-08, + "advantage_min": -0.8501939885318279, + "advantage_std": 0.9998295605182648, + "completion_length": 955.3958740234375, + "epoch": 0.49028571428571427, + "grad_norm": 0.5915119647979736, + "kl": 0.043731689453125, + "lambda_div_used": 0.6, + "learning_rate": 1.5415814221002265e-07, + "loss": 0.0018, + "reward": 0.22457869758363813, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.22457869758363813, + "reward_after_std": 0.667672373354435, + "reward_before_mean": 0.7487860321998596, + "reward_before_std": 0.5584245286881924, + "reward_change_max": 0.0, + "reward_change_mean": -0.5242073722183704, + "reward_change_min": -0.8736131340265274, + "reward_change_std": 0.30754177272319794, + "reward_std": 0.6676723919808865, + "rewards/cosine_scaled_reward": -0.10477365460246801, + "rewards/format_reward": 0.9583333358168602, + "step": 429 + }, + { + "advantage_max": 1.8657117784023285, + "advantage_mean": -1.5832484767663857e-08, + "advantage_min": -0.9031914100050926, + "advantage_std": 0.9998542368412018, + "completion_length": 1187.3958587646484, + "epoch": 0.49142857142857144, + "grad_norm": 0.532692551612854, + "kl": 0.046665191650390625, + "lambda_div_used": 0.6, + "learning_rate": 1.5267358321348285e-07, + "loss": 0.0019, + "reward": 0.37671698722988367, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.37671698722988367, + "reward_after_std": 0.7594957612454891, + "reward_before_mean": 0.9692848455160856, + "reward_before_std": 0.6301495917141438, + "reward_change_max": 9.178370237350464e-05, + "reward_change_mean": -0.5925678685307503, + "reward_change_min": -0.9060627967119217, + "reward_change_std": 0.3460345584899187, + "reward_std": 0.7594957798719406, + "rewards/cosine_scaled_reward": 0.07839240477187559, + "rewards/format_reward": 0.8125000037252903, + "step": 430 + }, + { + "advantage_max": 1.8353245258331299, + "advantage_mean": -9.934107592091124e-09, + "advantage_min": -0.9341690689325333, + "advantage_std": 0.9998679608106613, + "completion_length": 1518.1875457763672, + "epoch": 0.49257142857142855, + "grad_norm": 1.6024953126907349, + "kl": 0.146270751953125, + "lambda_div_used": 0.6, + "learning_rate": 1.5120838934595337e-07, + "loss": 0.0058, + "reward": 0.1624729260802269, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.1624729260802269, + "reward_after_std": 0.8999154008924961, + "reward_before_mean": 0.6052434705197811, + "reward_before_std": 0.8698120005428791, + "reward_change_max": 0.0008520856499671936, + "reward_change_mean": -0.4427705593407154, + "reward_change_min": -0.7804036028683186, + "reward_change_std": 0.3131858557462692, + "reward_std": 0.8999154083430767, + "rewards/cosine_scaled_reward": -0.08279493264853954, + "rewards/format_reward": 0.7708333432674408, + "step": 431 + }, + { + "advantage_max": 1.9103060811758041, + "advantage_mean": -6.208817238118058e-09, + "advantage_min": -0.835049219429493, + "advantage_std": 0.9998124614357948, + "completion_length": 1616.4792022705078, + "epoch": 0.4937142857142857, + "grad_norm": 0.44959592819213867, + "kl": 0.06036376953125, + "lambda_div_used": 0.6, + "learning_rate": 1.4976263201891613e-07, + "loss": 0.0024, + "reward": 0.005719345761463046, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.005719345761463046, + "reward_after_std": 0.6313600055873394, + "reward_before_mean": 0.4188762828707695, + "reward_before_std": 0.5393286049365997, + "reward_change_max": 0.0, + "reward_change_mean": -0.41315691312775016, + "reward_change_min": -0.6860866919159889, + "reward_change_std": 0.2504292316734791, + "reward_std": 0.6313600204885006, + "rewards/cosine_scaled_reward": -0.19681187812238932, + "rewards/format_reward": 0.8125000149011612, + "step": 432 + }, + { + "advantage_max": 1.892504170536995, + "advantage_mean": 2.980232327587373e-08, + "advantage_min": -0.8732018694281578, + "advantage_std": 0.9998399764299393, + "completion_length": 1564.5000457763672, + "epoch": 0.4948571428571429, + "grad_norm": 0.66018146276474, + "kl": 0.07354354858398438, + "lambda_div_used": 0.6, + "learning_rate": 1.483363816965435e-07, + "loss": 0.0029, + "reward": 0.26046125683933496, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.26046125683933496, + "reward_after_std": 0.711750440299511, + "reward_before_mean": 0.7982939779758453, + "reward_before_std": 0.6088505052030087, + "reward_change_max": 0.0005765184760093689, + "reward_change_mean": -0.5378326866775751, + "reward_change_min": -0.8633565232157707, + "reward_change_std": 0.3263698350638151, + "reward_std": 0.7117504626512527, + "rewards/cosine_scaled_reward": 0.024146972224116325, + "rewards/format_reward": 0.7500000111758709, + "step": 433 + }, + { + "advantage_max": 1.9090900868177414, + "advantage_mean": 0.0, + "advantage_min": -0.7441076934337616, + "advantage_std": 0.9998107478022575, + "completion_length": 1740.4167098999023, + "epoch": 0.496, + "grad_norm": 0.7447723150253296, + "kl": 0.13619613647460938, + "lambda_div_used": 0.6, + "learning_rate": 1.469297078922642e-07, + "loss": 0.0054, + "reward": -0.00020221294835209846, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.00020221294835209846, + "reward_after_std": 0.6341037414968014, + "reward_before_mean": 0.40146185271441936, + "reward_before_std": 0.5198134481906891, + "reward_change_max": 0.0, + "reward_change_mean": -0.4016640540212393, + "reward_change_min": -0.6340990662574768, + "reward_change_std": 0.2353166714310646, + "reward_std": 0.6341037563979626, + "rewards/cosine_scaled_reward": -0.20551909133791924, + "rewards/format_reward": 0.8125000037252903, + "step": 434 + }, + { + "advantage_max": 1.9161070734262466, + "advantage_mean": -1.707424759911369e-08, + "advantage_min": -0.8011736907064915, + "advantage_std": 0.9998152628540993, + "completion_length": 1104.5833778381348, + "epoch": 0.49714285714285716, + "grad_norm": 0.6831105947494507, + "kl": 0.09851837158203125, + "lambda_div_used": 0.6, + "learning_rate": 1.4554267916537495e-07, + "loss": 0.0039, + "reward": 0.1984256466384977, + "reward_advantage_correlation": 0.9999999999999997, + "reward_after_mean": 0.1984256466384977, + "reward_after_std": 0.6855190061032772, + "reward_before_mean": 0.701103962957859, + "reward_before_std": 0.5424638222903013, + "reward_change_max": 0.0, + "reward_change_mean": -0.5026783309876919, + "reward_change_min": -0.7911634966731071, + "reward_change_std": 0.28394212760031223, + "reward_std": 0.6855190135538578, + "rewards/cosine_scaled_reward": -0.11819804133847356, + "rewards/format_reward": 0.9375000149011612, + "step": 435 + }, + { + "advantage_max": 1.8080298602581024, + "advantage_mean": -1.9247333504779363e-08, + "advantage_min": -1.0045333839952946, + "advantage_std": 0.9998283982276917, + "completion_length": 1415.3958740234375, + "epoch": 0.4982857142857143, + "grad_norm": 0.5908387303352356, + "kl": 0.10746002197265625, + "lambda_div_used": 0.6, + "learning_rate": 1.4417536311769885e-07, + "loss": 0.0043, + "reward": 0.40853736363351345, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.40853736363351345, + "reward_after_std": 0.6946579478681087, + "reward_before_mean": 1.03979467228055, + "reward_before_std": 0.596217917278409, + "reward_change_max": 0.0012014508247375488, + "reward_change_mean": -0.6312573403120041, + "reward_change_min": -0.9797411002218723, + "reward_change_std": 0.388340774923563, + "reward_std": 0.6946579590439796, + "rewards/cosine_scaled_reward": 0.12406398542225361, + "rewards/format_reward": 0.7916666716337204, + "step": 436 + }, + { + "advantage_max": 1.765413984656334, + "advantage_mean": -1.4280280069556284e-08, + "advantage_min": -1.140317179262638, + "advantage_std": 0.9998202100396156, + "completion_length": 1342.8125228881836, + "epoch": 0.49942857142857144, + "grad_norm": 0.8170318603515625, + "kl": 0.040370941162109375, + "lambda_div_used": 0.6, + "learning_rate": 1.4282782639029128e-07, + "loss": 0.0016, + "reward": 0.2664998557884246, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.2664998557884246, + "reward_after_std": 0.6159840114414692, + "reward_before_mean": 0.8322788365185261, + "reward_before_std": 0.576800886541605, + "reward_change_max": 0.0, + "reward_change_mean": -0.5657790005207062, + "reward_change_min": -0.9002040028572083, + "reward_change_std": 0.34919700771570206, + "reward_std": 0.6159840300679207, + "rewards/cosine_scaled_reward": -0.06302724592387676, + "rewards/format_reward": 0.9583333432674408, + "step": 437 + }, + { + "advantage_max": 1.8181038945913315, + "advantage_mean": -8.692344288796505e-09, + "advantage_min": -0.9430951476097107, + "advantage_std": 0.9998539239168167, + "completion_length": 1953.8750762939453, + "epoch": 0.5005714285714286, + "grad_norm": 1.0785415172576904, + "kl": 0.2301177978515625, + "lambda_div_used": 0.6, + "learning_rate": 1.4150013466019114e-07, + "loss": 0.0092, + "reward": 0.12155490834265947, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.12155490834265947, + "reward_after_std": 0.8553712926805019, + "reward_before_mean": 0.5592961621587165, + "reward_before_std": 0.8434982839971781, + "reward_change_max": 0.0007446259260177612, + "reward_change_mean": -0.4377412758767605, + "reward_change_min": -0.891018021851778, + "reward_change_std": 0.33926537446677685, + "reward_std": 0.8553713001310825, + "rewards/cosine_scaled_reward": -0.09535192046314478, + "rewards/format_reward": 0.7500000186264515, + "step": 438 + }, + { + "advantage_max": 1.7687881886959076, + "advantage_mean": -6.208817904251873e-10, + "advantage_min": -1.0331488847732544, + "advantage_std": 0.9998689517378807, + "completion_length": 1621.4375762939453, + "epoch": 0.5017142857142857, + "grad_norm": 1.3315192461013794, + "kl": 0.14151763916015625, + "lambda_div_used": 0.6, + "learning_rate": 1.4019235263722034e-07, + "loss": 0.0057, + "reward": 0.04851225670427084, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.04851225670427084, + "reward_after_std": 0.8845059871673584, + "reward_before_mean": 0.4421884883195162, + "reward_before_std": 0.9212424159049988, + "reward_change_max": 0.0003846213221549988, + "reward_change_mean": -0.3936762325465679, + "reward_change_min": -0.8846784085035324, + "reward_change_std": 0.34067676588892937, + "reward_std": 0.8845060132443905, + "rewards/cosine_scaled_reward": -0.11223910190165043, + "rewards/format_reward": 0.6666666828095913, + "step": 439 + }, + { + "advantage_max": 1.8706139773130417, + "advantage_mean": 1.1796753074388988e-08, + "advantage_min": -0.8981772735714912, + "advantage_std": 0.9998108372092247, + "completion_length": 1569.7292251586914, + "epoch": 0.5028571428571429, + "grad_norm": 0.855440080165863, + "kl": 0.1434173583984375, + "lambda_div_used": 0.6, + "learning_rate": 1.3890454406082956e-07, + "loss": 0.0057, + "reward": -0.05658835871145129, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.05658835871145129, + "reward_after_std": 0.5816938430070877, + "reward_before_mean": 0.33301460556685925, + "reward_before_std": 0.5139974765479565, + "reward_change_max": 0.0, + "reward_change_mean": -0.3896029610186815, + "reward_change_min": -0.681924182921648, + "reward_change_std": 0.24834264814853668, + "reward_std": 0.5816938765347004, + "rewards/cosine_scaled_reward": -0.1980760432779789, + "rewards/format_reward": 0.729166679084301, + "step": 440 + }, + { + "advantage_max": 1.8911541998386383, + "advantage_mean": 1.4280279625467074e-08, + "advantage_min": -0.8213992044329643, + "advantage_std": 0.9998567774891853, + "completion_length": 1801.3750267028809, + "epoch": 0.504, + "grad_norm": 0.8155668377876282, + "kl": 0.165771484375, + "lambda_div_used": 0.6, + "learning_rate": 1.3763677169699217e-07, + "loss": 0.0066, + "reward": 0.07445507869124413, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.07445507869124413, + "reward_after_std": 0.7889985404908657, + "reward_before_mean": 0.4961062502115965, + "reward_before_std": 0.743747316300869, + "reward_change_max": 0.0009514167904853821, + "reward_change_mean": -0.42165115289390087, + "reward_change_min": -0.721052560955286, + "reward_change_std": 0.29154213331639767, + "reward_std": 0.7889985628426075, + "rewards/cosine_scaled_reward": -0.09569689631462097, + "rewards/format_reward": 0.6875000093132257, + "step": 441 + }, + { + "advantage_max": 1.9226910918951035, + "advantage_mean": -2.8560559028889543e-08, + "advantage_min": -0.745913602411747, + "advantage_std": 0.9998499751091003, + "completion_length": 1091.2500228881836, + "epoch": 0.5051428571428571, + "grad_norm": 0.7159312963485718, + "kl": 0.05721282958984375, + "lambda_div_used": 0.6, + "learning_rate": 1.3638909733514452e-07, + "loss": 0.0023, + "reward": 0.41800220077857375, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.41800220077857375, + "reward_after_std": 0.7833468951284885, + "reward_before_mean": 1.0278632938861847, + "reward_before_std": 0.6174677358940244, + "reward_change_max": 0.0, + "reward_change_mean": -0.6098610982298851, + "reward_change_min": -0.9643383026123047, + "reward_change_std": 0.35795337706804276, + "reward_std": 0.7833469174802303, + "rewards/cosine_scaled_reward": 0.08684830274432898, + "rewards/format_reward": 0.854166679084301, + "step": 442 + }, + { + "advantage_max": 1.8147583454847336, + "advantage_mean": 6.208817349140361e-09, + "advantage_min": -1.0209823548793793, + "advantage_std": 0.9998364895582199, + "completion_length": 1822.1459045410156, + "epoch": 0.5062857142857143, + "grad_norm": 0.8386198282241821, + "kl": 0.11566162109375, + "lambda_div_used": 0.6, + "learning_rate": 1.351615817851748e-07, + "loss": 0.0046, + "reward": 0.030754336155951023, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.030754336155951023, + "reward_after_std": 0.7065671682357788, + "reward_before_mean": 0.4468862364301458, + "reward_before_std": 0.6671516671776772, + "reward_change_max": 0.0, + "reward_change_mean": -0.4161319136619568, + "reward_change_min": -0.7626941949129105, + "reward_change_std": 0.2861163951456547, + "reward_std": 0.7065671719610691, + "rewards/cosine_scaled_reward": -0.13072354905307293, + "rewards/format_reward": 0.7083333507180214, + "step": 443 + }, + { + "advantage_max": 1.9129046946763992, + "advantage_mean": -9.313226190243995e-09, + "advantage_min": -0.8238680362701416, + "advantage_std": 0.9998748078942299, + "completion_length": 1545.3750267028809, + "epoch": 0.5074285714285715, + "grad_norm": 1.1218396425247192, + "kl": 0.19435882568359375, + "lambda_div_used": 0.6, + "learning_rate": 1.3395428487445914e-07, + "loss": 0.0078, + "reward": 0.03131588757969439, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.03131588757969439, + "reward_after_std": 0.9942633770406246, + "reward_before_mean": 0.37214701221091673, + "reward_before_std": 0.9374651424586773, + "reward_change_max": 0.00407920777797699, + "reward_change_mean": -0.3408311549574137, + "reward_change_min": -0.6419351659715176, + "reward_change_std": 0.25083165243268013, + "reward_std": 0.9942633770406246, + "rewards/cosine_scaled_reward": -0.1576764981728047, + "rewards/format_reward": 0.6875000111758709, + "step": 444 + }, + { + "advantage_max": 1.858717828989029, + "advantage_mean": -1.5522043095295146e-08, + "advantage_min": -0.842341735959053, + "advantage_std": 0.9998549446463585, + "completion_length": 1924.5000762939453, + "epoch": 0.5085714285714286, + "grad_norm": 0.7524690628051758, + "kl": 0.251220703125, + "lambda_div_used": 0.6, + "learning_rate": 1.3276726544494571e-07, + "loss": 0.01, + "reward": 0.0804061135277152, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.0804061135277152, + "reward_after_std": 0.7893403545022011, + "reward_before_mean": 0.4991146745160222, + "reward_before_std": 0.7196450419723988, + "reward_change_max": 0.0, + "reward_change_mean": -0.41870855912566185, + "reward_change_min": -0.7721864506602287, + "reward_change_std": 0.27942358888685703, + "reward_std": 0.7893403694033623, + "rewards/cosine_scaled_reward": -0.1462760092690587, + "rewards/format_reward": 0.791666679084301, + "step": 445 + }, + { + "advantage_max": 1.9125205725431442, + "advantage_mean": -1.521160272743849e-08, + "advantage_min": -0.7717568129301071, + "advantage_std": 0.9998368695378304, + "completion_length": 1579.791748046875, + "epoch": 0.5097142857142857, + "grad_norm": 0.6899632215499878, + "kl": 0.10594940185546875, + "lambda_div_used": 0.6, + "learning_rate": 1.316005813502869e-07, + "loss": 0.0042, + "reward": 0.3102727495133877, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.3102727495133877, + "reward_after_std": 0.8228458315134048, + "reward_before_mean": 0.8454371113330126, + "reward_before_std": 0.6917162649333477, + "reward_change_max": 0.0012743175029754639, + "reward_change_mean": -0.5351644065231085, + "reward_change_min": -0.9024721682071686, + "reward_change_std": 0.3305331449955702, + "reward_std": 0.8228458426892757, + "rewards/cosine_scaled_reward": 0.0685519129037857, + "rewards/format_reward": 0.7083333414047956, + "step": 446 + }, + { + "advantage_max": 1.8417370170354843, + "advantage_mean": -6.829698695476338e-09, + "advantage_min": -0.935723327100277, + "advantage_std": 0.9998374506831169, + "completion_length": 1487.7917022705078, + "epoch": 0.5108571428571429, + "grad_norm": 0.8426020741462708, + "kl": 0.1772003173828125, + "lambda_div_used": 0.6, + "learning_rate": 1.3045428945301953e-07, + "loss": 0.0071, + "reward": 0.23191683134064078, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.23191683134064078, + "reward_after_std": 0.7033054195344448, + "reward_before_mean": 0.7608326952904463, + "reward_before_std": 0.6380577906966209, + "reward_change_max": 0.0, + "reward_change_mean": -0.5289158523082733, + "reward_change_min": -0.8764414638280869, + "reward_change_std": 0.3322218470275402, + "reward_std": 0.7033054418861866, + "rewards/cosine_scaled_reward": -0.05708366571343504, + "rewards/format_reward": 0.8750000149011612, + "step": 447 + }, + { + "advantage_max": 1.902267187833786, + "advantage_mean": 6.208816516473092e-10, + "advantage_min": -0.8541913852095604, + "advantage_std": 0.9997800961136818, + "completion_length": 1220.833366394043, + "epoch": 0.512, + "grad_norm": 0.9086900949478149, + "kl": 0.12390899658203125, + "lambda_div_used": 0.6, + "learning_rate": 1.2932844562179352e-07, + "loss": 0.0049, + "reward": 0.2925412461627275, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.2925412461627275, + "reward_after_std": 0.5348220467567444, + "reward_before_mean": 0.8818653598427773, + "reward_before_std": 0.3927401080727577, + "reward_change_max": 0.0, + "reward_change_mean": -0.5893240831792355, + "reward_change_min": -0.8397349342703819, + "reward_change_std": 0.32815040461719036, + "reward_std": 0.5348220616579056, + "rewards/cosine_scaled_reward": -0.017400696873664856, + "rewards/format_reward": 0.916666679084301, + "step": 448 + }, + { + "advantage_max": 1.905688613653183, + "advantage_mean": -2.23517424569053e-08, + "advantage_min": -0.841227937489748, + "advantage_std": 0.9997895136475563, + "completion_length": 1415.9167137145996, + "epoch": 0.5131428571428571, + "grad_norm": 0.8157268166542053, + "kl": 0.20606422424316406, + "lambda_div_used": 0.6, + "learning_rate": 1.2822310472864885e-07, + "loss": 0.0082, + "reward": 0.01664125733077526, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.01664125733077526, + "reward_after_std": 0.5663458779454231, + "reward_before_mean": 0.4466824699193239, + "reward_before_std": 0.44439556263387203, + "reward_change_max": 0.000407978892326355, + "reward_change_mean": -0.4300412107259035, + "reward_change_min": -0.6489026509225368, + "reward_change_std": 0.24280665069818497, + "reward_std": 0.5663458835333586, + "rewards/cosine_scaled_reward": -0.16207545064389706, + "rewards/format_reward": 0.7708333432674408, + "step": 449 + }, + { + "advantage_max": 1.8794930279254913, + "advantage_mean": 1.117587122845265e-08, + "advantage_min": -0.8497578613460064, + "advantage_std": 0.9998070001602173, + "completion_length": 1445.083366394043, + "epoch": 0.5142857142857142, + "grad_norm": 0.7874720692634583, + "kl": 0.15003204345703125, + "lambda_div_used": 0.6, + "learning_rate": 1.2713832064634125e-07, + "loss": 0.006, + "reward": 0.02464776113629341, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.02464776113629341, + "reward_after_std": 0.5693525820970535, + "reward_before_mean": 0.45805526059120893, + "reward_before_std": 0.4447983056306839, + "reward_change_max": 0.0023548901081085205, + "reward_change_mean": -0.43340749107301235, + "reward_change_min": -0.6489949226379395, + "reward_change_std": 0.2607662323862314, + "reward_std": 0.5693525932729244, + "rewards/cosine_scaled_reward": -0.12513904832303524, + "rewards/format_reward": 0.7083333395421505, + "step": 450 + }, + { + "advantage_max": 1.872328832745552, + "advantage_mean": -1.6763806898190126e-08, + "advantage_min": -0.8258812800049782, + "advantage_std": 0.9998480677604675, + "completion_length": 1233.708366394043, + "epoch": 0.5154285714285715, + "grad_norm": 0.7528899908065796, + "kl": 0.09528350830078125, + "lambda_div_used": 0.6, + "learning_rate": 1.260741462457165e-07, + "loss": 0.0038, + "reward": 0.3256368708098307, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.3256368708098307, + "reward_after_std": 0.7790644764900208, + "reward_before_mean": 0.8877322040498257, + "reward_before_std": 0.6680051982402802, + "reward_change_max": 0.0, + "reward_change_mean": -0.562095332890749, + "reward_change_min": -0.9134536460042, + "reward_change_std": 0.35085158981382847, + "reward_std": 0.7790645137429237, + "rewards/cosine_scaled_reward": -0.00405057892203331, + "rewards/format_reward": 0.8958333432674408, + "step": 451 + }, + { + "advantage_max": 1.9041462689638138, + "advantage_mean": 1.0865430555284661e-08, + "advantage_min": -0.9039032459259033, + "advantage_std": 0.9998711198568344, + "completion_length": 1906.7292098999023, + "epoch": 0.5165714285714286, + "grad_norm": 1.088655710220337, + "kl": 0.24600601196289062, + "lambda_div_used": 0.6, + "learning_rate": 1.2503063339313356e-07, + "loss": 0.0098, + "reward": 0.29726181272417307, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.29726181272417307, + "reward_after_std": 0.8609438054263592, + "reward_before_mean": 0.8219169015064836, + "reward_before_std": 0.7409865632653236, + "reward_change_max": 0.0, + "reward_change_mean": -0.5246550729498267, + "reward_change_min": -0.8709829784929752, + "reward_change_std": 0.33557169884443283, + "reward_std": 0.8609438389539719, + "rewards/cosine_scaled_reward": 0.07762509072199464, + "rewards/format_reward": 0.6666666716337204, + "step": 452 + }, + { + "advantage_max": 1.86186121404171, + "advantage_mean": 7.1401394796666295e-09, + "advantage_min": -0.8760163262486458, + "advantage_std": 0.9998539313673973, + "completion_length": 1587.083396911621, + "epoch": 0.5177142857142857, + "grad_norm": 0.6533041596412659, + "kl": 0.12093353271484375, + "lambda_div_used": 0.6, + "learning_rate": 1.2400783294793668e-07, + "loss": 0.0048, + "reward": 0.36189063219353557, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.36189063219353557, + "reward_after_std": 0.7707811817526817, + "reward_before_mean": 0.9481308050453663, + "reward_before_std": 0.6685521900653839, + "reward_change_max": 0.0, + "reward_change_mean": -0.58624017983675, + "reward_change_min": -0.9146820791065693, + "reward_change_std": 0.3575539728626609, + "reward_std": 0.7707811929285526, + "rewards/cosine_scaled_reward": 0.036565386690199375, + "rewards/format_reward": 0.8750000055879354, + "step": 453 + }, + { + "advantage_max": 1.799769252538681, + "advantage_mean": 0.0, + "advantage_min": -0.9815754368901253, + "advantage_std": 0.9998062923550606, + "completion_length": 1350.0416946411133, + "epoch": 0.5188571428571429, + "grad_norm": 0.8042285442352295, + "kl": 0.10607337951660156, + "lambda_div_used": 0.6, + "learning_rate": 1.2300579475997657e-07, + "loss": 0.0042, + "reward": 0.11526673112530261, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.11526673112530261, + "reward_after_std": 0.6661871522665024, + "reward_before_mean": 0.5921627879142761, + "reward_before_std": 0.6296087387017906, + "reward_change_max": 0.002816028892993927, + "reward_change_mean": -0.4768960699439049, + "reward_change_min": -0.8067386299371719, + "reward_change_std": 0.33680446818470955, + "reward_std": 0.6661871746182442, + "rewards/cosine_scaled_reward": -0.11016861326061189, + "rewards/format_reward": 0.8125000111758709, + "step": 454 + }, + { + "advantage_max": 1.9014418870210648, + "advantage_mean": 7.450580929990736e-09, + "advantage_min": -0.8852047994732857, + "advantage_std": 0.9998085275292397, + "completion_length": 1282.6250610351562, + "epoch": 0.52, + "grad_norm": 0.4326711893081665, + "kl": 0.11649322509765625, + "lambda_div_used": 0.6, + "learning_rate": 1.220245676671809e-07, + "loss": 0.0047, + "reward": 0.08766961051151156, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.08766961051151156, + "reward_after_std": 0.5684734508395195, + "reward_before_mean": 0.5595664214342833, + "reward_before_std": 0.46307386830449104, + "reward_change_max": 0.0, + "reward_change_mean": -0.47189680114388466, + "reward_change_min": -0.7259544879198074, + "reward_change_std": 0.2745931725949049, + "reward_std": 0.5684734769165516, + "rewards/cosine_scaled_reward": -0.17855014093220234, + "rewards/format_reward": 0.9166666679084301, + "step": 455 + }, + { + "advantage_max": 1.8154648691415787, + "advantage_mean": 2.1730860444435507e-09, + "advantage_min": -0.9932011067867279, + "advantage_std": 0.9998342245817184, + "completion_length": 1712.9584159851074, + "epoch": 0.5211428571428571, + "grad_norm": 1.1537747383117676, + "kl": 0.2523918151855469, + "lambda_div_used": 0.6, + "learning_rate": 1.2106419949317388e-07, + "loss": 0.0101, + "reward": 0.06782927364110947, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.06782927364110947, + "reward_after_std": 0.7310365661978722, + "reward_before_mean": 0.4991466961801052, + "reward_before_std": 0.7056907415390015, + "reward_change_max": 0.0028716325759887695, + "reward_change_mean": -0.43131742626428604, + "reward_change_min": -0.7868289947509766, + "reward_change_std": 0.3076404817402363, + "reward_std": 0.731036588549614, + "rewards/cosine_scaled_reward": -0.10459333215840161, + "rewards/format_reward": 0.7083333507180214, + "step": 456 + }, + { + "advantage_max": 1.8463391065597534, + "advantage_mean": 2.980232327587373e-08, + "advantage_min": -0.9132648408412933, + "advantage_std": 0.9998122155666351, + "completion_length": 1766.3750305175781, + "epoch": 0.5222857142857142, + "grad_norm": 0.8214160203933716, + "kl": 0.2835235595703125, + "lambda_div_used": 0.6, + "learning_rate": 1.2012473704494537e-07, + "loss": 0.0113, + "reward": 0.053376014227978885, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.053376014227978885, + "reward_after_std": 0.6058420166373253, + "reward_before_mean": 0.49396978225558996, + "reward_before_std": 0.5151708796620369, + "reward_change_max": 0.0011105164885520935, + "reward_change_mean": -0.4405937194824219, + "reward_change_min": -0.7060803063213825, + "reward_change_std": 0.268763717263937, + "reward_std": 0.6058420278131962, + "rewards/cosine_scaled_reward": -0.09676513634622097, + "rewards/format_reward": 0.6875000149011612, + "step": 457 + }, + { + "advantage_max": 1.858269363641739, + "advantage_mean": 4.190951752303107e-09, + "advantage_min": -0.9696895703673363, + "advantage_std": 0.9998143240809441, + "completion_length": 1340.2292098999023, + "epoch": 0.5234285714285715, + "grad_norm": 0.8421010971069336, + "kl": 0.12511825561523438, + "lambda_div_used": 0.6, + "learning_rate": 1.1920622611056974e-07, + "loss": 0.005, + "reward": 0.040771787986159325, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.040771787986159325, + "reward_after_std": 0.6010152883827686, + "reward_before_mean": 0.48337520100176334, + "reward_before_std": 0.5329945832490921, + "reward_change_max": 0.0, + "reward_change_mean": -0.442603413015604, + "reward_change_min": -0.7114094980061054, + "reward_change_std": 0.2769723404198885, + "reward_std": 0.6010153144598007, + "rewards/cosine_scaled_reward": -0.14372907672077417, + "rewards/format_reward": 0.770833345130086, + "step": 458 + }, + { + "advantage_max": 1.8294371962547302, + "advantage_mean": 1.8626454822978644e-09, + "advantage_min": -1.01992367208004, + "advantage_std": 0.9998571202158928, + "completion_length": 1208.854206085205, + "epoch": 0.5245714285714286, + "grad_norm": 0.8173410892486572, + "kl": 0.0986328125, + "lambda_div_used": 0.6, + "learning_rate": 1.1830871145697412e-07, + "loss": 0.0039, + "reward": 0.2616848908364773, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.2616848908364773, + "reward_after_std": 0.8070877604186535, + "reward_before_mean": 0.7860768726095557, + "reward_before_std": 0.7491856962442398, + "reward_change_max": 0.00043958425521850586, + "reward_change_mean": -0.5243919845670462, + "reward_change_min": -0.9147582352161407, + "reward_change_std": 0.3446339722722769, + "reward_std": 0.8070877604186535, + "rewards/cosine_scaled_reward": -0.03404490277171135, + "rewards/format_reward": 0.854166679084301, + "step": 459 + }, + { + "advantage_max": 1.7802351862192154, + "advantage_mean": 1.117587172805301e-08, + "advantage_min": -0.9591168016195297, + "advantage_std": 0.9998372122645378, + "completion_length": 1933.541748046875, + "epoch": 0.5257142857142857, + "grad_norm": 1.4948370456695557, + "kl": 0.232696533203125, + "lambda_div_used": 0.6, + "learning_rate": 1.1743223682775649e-07, + "loss": 0.0093, + "reward": 0.17830261262133718, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.17830261262133718, + "reward_after_std": 0.766702301800251, + "reward_before_mean": 0.669447798281908, + "reward_before_std": 0.7442214302718639, + "reward_change_max": 0.0005079954862594604, + "reward_change_mean": -0.49114519357681274, + "reward_change_min": -0.9560124427080154, + "reward_change_std": 0.3472681976854801, + "reward_std": 0.7667023204267025, + "rewards/cosine_scaled_reward": -0.050692775286734104, + "rewards/format_reward": 0.7708333488553762, + "step": 460 + }, + { + "advantage_max": 1.8203076124191284, + "advantage_mean": 3.725290520506519e-09, + "advantage_min": -0.9270096272230148, + "advantage_std": 0.9998652040958405, + "completion_length": 1526.2917098999023, + "epoch": 0.5268571428571428, + "grad_norm": 0.8616439700126648, + "kl": 0.14084625244140625, + "lambda_div_used": 0.6, + "learning_rate": 1.1657684494105386e-07, + "loss": 0.0056, + "reward": 0.27595112178096315, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.27595112178096315, + "reward_after_std": 0.9150453321635723, + "reward_before_mean": 0.7884750002995133, + "reward_before_std": 0.9043990522623062, + "reward_change_max": 0.0, + "reward_change_mean": -0.5125238858163357, + "reward_change_min": -0.9507169425487518, + "reward_change_std": 0.3724978230893612, + "reward_std": 0.9150453470647335, + "rewards/cosine_scaled_reward": 0.04007083596661687, + "rewards/format_reward": 0.7083333432674408, + "step": 461 + }, + { + "advantage_max": 1.8993992805480957, + "advantage_mean": 6.208817682207268e-09, + "advantage_min": -0.8489513024687767, + "advantage_std": 0.9998081251978874, + "completion_length": 1436.9375228881836, + "epoch": 0.528, + "grad_norm": 1.4473426342010498, + "kl": 0.20296478271484375, + "lambda_div_used": 0.6, + "learning_rate": 1.1574257748745986e-07, + "loss": 0.0081, + "reward": -0.10620388202369213, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.10620388202369213, + "reward_after_std": 0.6253377795219421, + "reward_before_mean": 0.24274032982066274, + "reward_before_std": 0.5555364396423101, + "reward_change_max": 8.26418399810791e-05, + "reward_change_mean": -0.34894421696662903, + "reward_change_min": -0.612749133259058, + "reward_change_std": 0.24038436450064182, + "reward_std": 0.625337790697813, + "rewards/cosine_scaled_reward": -0.21196317533031106, + "rewards/format_reward": 0.6666666716337204, + "step": 462 + }, + { + "advantage_max": 1.7989549040794373, + "advantage_mean": -6.6744786364481e-09, + "advantage_min": -0.9759308695793152, + "advantage_std": 0.9998603165149689, + "completion_length": 1856.2292022705078, + "epoch": 0.5291428571428571, + "grad_norm": 1.4682304859161377, + "kl": 0.24060440063476562, + "lambda_div_used": 0.6, + "learning_rate": 1.1492947512799328e-07, + "loss": 0.0096, + "reward": 0.2849512416869402, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.2849512416869402, + "reward_after_std": 0.801155474036932, + "reward_before_mean": 0.8258302640169859, + "reward_before_std": 0.7735779173672199, + "reward_change_max": 0.0022415444254875183, + "reward_change_mean": -0.5408790297806263, + "reward_change_min": -0.9627332501113415, + "reward_change_std": 0.37879977002739906, + "reward_std": 0.8011555224657059, + "rewards/cosine_scaled_reward": 0.10041513899341226, + "rewards/format_reward": 0.6250000093132257, + "step": 463 + }, + { + "advantage_max": 1.870115026831627, + "advantage_mean": -3.756334376880943e-08, + "advantage_min": -0.872229591012001, + "advantage_std": 0.9998425468802452, + "completion_length": 1322.395866394043, + "epoch": 0.5302857142857142, + "grad_norm": 0.9846634268760681, + "kl": 0.19464111328125, + "lambda_div_used": 0.6, + "learning_rate": 1.1413757749211602e-07, + "loss": 0.0078, + "reward": 0.3826366728171706, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.3826366728171706, + "reward_after_std": 0.6611596159636974, + "reward_before_mean": 1.0033353762701154, + "reward_before_std": 0.5336503200232983, + "reward_change_max": 0.0006307139992713928, + "reward_change_mean": -0.6206987667828798, + "reward_change_min": -0.9831653386354446, + "reward_change_std": 0.36680967546999454, + "reward_std": 0.6611596457660198, + "rewards/cosine_scaled_reward": 0.1162510234862566, + "rewards/format_reward": 0.7708333432674408, + "step": 464 + }, + { + "advantage_max": 1.9198713898658752, + "advantage_mean": -3.104408563547878e-09, + "advantage_min": -0.7656259946525097, + "advantage_std": 0.9998870715498924, + "completion_length": 1667.8542213439941, + "epoch": 0.5314285714285715, + "grad_norm": 1.1186479330062866, + "kl": 0.3111457824707031, + "lambda_div_used": 0.6, + "learning_rate": 1.1336692317580158e-07, + "loss": 0.0124, + "reward": 0.4131432604044676, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.4131432604044676, + "reward_after_std": 1.006252758204937, + "reward_before_mean": 0.9700895072892308, + "reward_before_std": 0.8827327527105808, + "reward_change_max": 0.0, + "reward_change_mean": -0.5569462459534407, + "reward_change_min": -0.9653652012348175, + "reward_change_std": 0.3402498383074999, + "reward_std": 1.0062527880072594, + "rewards/cosine_scaled_reward": 0.037128068739548326, + "rewards/format_reward": 0.8958333395421505, + "step": 465 + }, + { + "advantage_max": 1.8933946043252945, + "advantage_mean": 9.93410786964688e-09, + "advantage_min": -0.8079804182052612, + "advantage_std": 0.9998544976115227, + "completion_length": 1693.1875534057617, + "epoch": 0.5325714285714286, + "grad_norm": 1.2470462322235107, + "kl": 0.2385406494140625, + "lambda_div_used": 0.6, + "learning_rate": 1.1261754973965422e-07, + "loss": 0.0095, + "reward": 0.16381147410720587, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.16381147410720587, + "reward_after_std": 0.8152791000902653, + "reward_before_mean": 0.621258151717484, + "reward_before_std": 0.7254021167755127, + "reward_change_max": 0.000746704638004303, + "reward_change_mean": -0.4574466794729233, + "reward_change_min": -0.8183608688414097, + "reward_change_std": 0.31307646073400974, + "reward_std": 0.8152791261672974, + "rewards/cosine_scaled_reward": -0.001870934385806322, + "rewards/format_reward": 0.6250000093132257, + "step": 466 + }, + { + "advantage_max": 1.8866389095783234, + "advantage_mean": 7.450581429591097e-09, + "advantage_min": -0.825259268283844, + "advantage_std": 0.9998439028859138, + "completion_length": 1645.6875267028809, + "epoch": 0.5337142857142857, + "grad_norm": 1.0692403316497803, + "kl": 0.2185211181640625, + "lambda_div_used": 0.6, + "learning_rate": 1.1188949370707787e-07, + "loss": 0.0088, + "reward": 0.14308831095695496, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.14308831095695496, + "reward_after_std": 0.8592479415237904, + "reward_before_mean": 0.5868221651762724, + "reward_before_std": 0.7930984199047089, + "reward_change_max": 0.0006082803010940552, + "reward_change_mean": -0.4437338560819626, + "reward_change_min": -0.8282331451773643, + "reward_change_std": 0.3037046445533633, + "reward_std": 0.8592479638755322, + "rewards/cosine_scaled_reward": -0.09200559067539871, + "rewards/format_reward": 0.7708333507180214, + "step": 467 + }, + { + "advantage_max": 1.841501995921135, + "advantage_mean": 7.450580818968433e-09, + "advantage_min": -0.9024374559521675, + "advantage_std": 0.9998460412025452, + "completion_length": 1776.7500610351562, + "epoch": 0.5348571428571428, + "grad_norm": 1.1234499216079712, + "kl": 0.2584228515625, + "lambda_div_used": 0.6, + "learning_rate": 1.1118279056249653e-07, + "loss": 0.0103, + "reward": 0.05111578106880188, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.05111578106880188, + "reward_after_std": 0.7923570424318314, + "reward_before_mean": 0.45699799386784434, + "reward_before_std": 0.7673504799604416, + "reward_change_max": 0.0007030144333839417, + "reward_change_mean": -0.40588223095983267, + "reward_change_min": -0.8021054789423943, + "reward_change_std": 0.2961500799283385, + "reward_std": 0.792357049882412, + "rewards/cosine_scaled_reward": -0.1048343344591558, + "rewards/format_reward": 0.6666666753590107, + "step": 468 + }, + { + "advantage_max": 1.872466504573822, + "advantage_mean": -1.5522042651205936e-08, + "advantage_min": -0.9310917481780052, + "advantage_std": 0.9998057186603546, + "completion_length": 1483.645866394043, + "epoch": 0.536, + "grad_norm": 0.9675341844558716, + "kl": 0.2593994140625, + "lambda_div_used": 0.6, + "learning_rate": 1.1049747474962444e-07, + "loss": 0.0104, + "reward": 0.04530154122039676, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.04530154122039676, + "reward_after_std": 0.546064380556345, + "reward_before_mean": 0.49883297085762024, + "reward_before_std": 0.4377680625766516, + "reward_change_max": 0.0004517883062362671, + "reward_change_mean": -0.4535314626991749, + "reward_change_min": -0.6864179968833923, + "reward_change_std": 0.2757315170019865, + "reward_std": 0.5460643954575062, + "rewards/cosine_scaled_reward": -0.11516685970127583, + "rewards/format_reward": 0.7291666772216558, + "step": 469 + }, + { + "advantage_max": 1.7256519347429276, + "advantage_mean": 3.073364596151151e-08, + "advantage_min": -0.8964690193533897, + "advantage_std": 0.9998262897133827, + "completion_length": 2194.7708740234375, + "epoch": 0.5371428571428571, + "grad_norm": 2.29399037361145, + "kl": 0.50030517578125, + "lambda_div_used": 0.6, + "learning_rate": 1.0983357966978745e-07, + "loss": 0.02, + "reward": -0.03671526629477739, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": -0.03671526629477739, + "reward_after_std": 0.7010341510176659, + "reward_before_mean": 0.3439774438738823, + "reward_before_std": 0.7186840288341045, + "reward_change_max": 0.0023821890354156494, + "reward_change_mean": -0.3806927101686597, + "reward_change_min": -0.7756905145943165, + "reward_change_std": 0.30107271298766136, + "reward_std": 0.7010341733694077, + "rewards/cosine_scaled_reward": -0.13009461481124163, + "rewards/format_reward": 0.6041666679084301, + "step": 470 + }, + { + "advantage_max": 1.7769792973995209, + "advantage_mean": 9.31322685637781e-10, + "advantage_min": -1.0818905234336853, + "advantage_std": 0.9998090341687202, + "completion_length": 2019.2500457763672, + "epoch": 0.5382857142857143, + "grad_norm": 1.6096271276474, + "kl": 0.419647216796875, + "lambda_div_used": 0.6, + "learning_rate": 1.0919113768029517e-07, + "loss": 0.0168, + "reward": 0.12106739962473512, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.12106739962473512, + "reward_after_std": 0.6331440471112728, + "reward_before_mean": 0.6041710805147886, + "reward_before_std": 0.61738595739007, + "reward_change_max": 0.0002641230821609497, + "reward_change_mean": -0.4831036403775215, + "reward_change_min": -0.8042481653392315, + "reward_change_std": 0.3272095564752817, + "reward_std": 0.6331440769135952, + "rewards/cosine_scaled_reward": -0.020831143483519554, + "rewards/format_reward": 0.6458333432674408, + "step": 471 + }, + { + "advantage_max": 1.9175452291965485, + "advantage_mean": 4.9670543234014986e-09, + "advantage_min": -0.7889066264033318, + "advantage_std": 0.999824769794941, + "completion_length": 1967.4583892822266, + "epoch": 0.5394285714285715, + "grad_norm": 0.9014610648155212, + "kl": 0.339630126953125, + "lambda_div_used": 0.6, + "learning_rate": 1.0857018009286381e-07, + "loss": 0.0136, + "reward": -0.0833109375089407, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.0833109375089407, + "reward_after_std": 0.6962088979780674, + "reward_before_mean": 0.2651079539209604, + "reward_before_std": 0.6203143000602722, + "reward_change_max": 0.0, + "reward_change_mean": -0.34841887280344963, + "reward_change_min": -0.6095849871635437, + "reward_change_std": 0.23636100441217422, + "reward_std": 0.6962089166045189, + "rewards/cosine_scaled_reward": -0.2007793728262186, + "rewards/format_reward": 0.6666666753590107, + "step": 472 + }, + { + "advantage_max": 1.7719455808401108, + "advantage_mean": -9.934107647602275e-09, + "advantage_min": -1.0141915827989578, + "advantage_std": 0.9998443275690079, + "completion_length": 1939.916732788086, + "epoch": 0.5405714285714286, + "grad_norm": 1.1787399053573608, + "kl": 0.34352874755859375, + "lambda_div_used": 0.6, + "learning_rate": 1.0797073717209013e-07, + "loss": 0.0137, + "reward": 0.030010550282895565, + "reward_advantage_correlation": 0.9999999999999997, + "reward_after_mean": 0.030010550282895565, + "reward_after_std": 0.7244163006544113, + "reward_before_mean": 0.4455965477973223, + "reward_before_std": 0.7282268181443214, + "reward_change_max": 0.0021120458841323853, + "reward_change_mean": -0.41558599285781384, + "reward_change_min": -0.7554572410881519, + "reward_change_std": 0.31058728136122227, + "reward_std": 0.7244163267314434, + "rewards/cosine_scaled_reward": -0.14178507030010223, + "rewards/format_reward": 0.7291666753590107, + "step": 473 + }, + { + "advantage_max": 1.7680951356887817, + "advantage_mean": -3.6011140736036396e-08, + "advantage_min": -1.0582480803132057, + "advantage_std": 0.999866709113121, + "completion_length": 1825.8750381469727, + "epoch": 0.5417142857142857, + "grad_norm": 1.1968833208084106, + "kl": 0.3557090759277344, + "lambda_div_used": 0.6, + "learning_rate": 1.0739283813397639e-07, + "loss": 0.0142, + "reward": 0.6759944395162165, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.6759944395162165, + "reward_after_std": 0.8907247744500637, + "reward_before_mean": 1.4209766387939453, + "reward_before_std": 0.8035724554210901, + "reward_change_max": 0.0008310303092002869, + "reward_change_mean": -0.7449822090566158, + "reward_change_min": -1.1608989238739014, + "reward_change_std": 0.4833258595317602, + "reward_std": 0.8907247819006443, + "rewards/cosine_scaled_reward": 0.31465496867895126, + "rewards/format_reward": 0.7916666828095913, + "step": 474 + }, + { + "advantage_max": 1.8225472569465637, + "advantage_mean": -5.5879357807597785e-09, + "advantage_min": -0.8803718611598015, + "advantage_std": 0.9998807609081268, + "completion_length": 1464.9375381469727, + "epoch": 0.5428571428571428, + "grad_norm": 1.0400774478912354, + "kl": 0.16465377807617188, + "lambda_div_used": 0.6, + "learning_rate": 1.068365111445064e-07, + "loss": 0.0066, + "reward": 0.40743301063776016, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.40743301063776016, + "reward_after_std": 0.963828545063734, + "reward_before_mean": 0.9846168980002403, + "reward_before_std": 0.9466955624520779, + "reward_change_max": 0.0, + "reward_change_mean": -0.5771838165819645, + "reward_change_min": -1.096062332391739, + "reward_change_std": 0.4055010573938489, + "reward_std": 0.963828556239605, + "rewards/cosine_scaled_reward": 0.05480840336531401, + "rewards/format_reward": 0.8750000055879354, + "step": 475 + }, + { + "advantage_max": 1.8622777611017227, + "advantage_mean": 2.4835269396561444e-09, + "advantage_min": -0.9705834239721298, + "advantage_std": 0.9998641908168793, + "completion_length": 1534.2708740234375, + "epoch": 0.544, + "grad_norm": 0.9736862778663635, + "kl": 0.16690826416015625, + "lambda_div_used": 0.6, + "learning_rate": 1.063017833182728e-07, + "loss": 0.0067, + "reward": 0.2875065142288804, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.2875065142288804, + "reward_after_std": 0.8677309937775135, + "reward_before_mean": 0.8108938159421086, + "reward_before_std": 0.8140701726078987, + "reward_change_max": 0.0, + "reward_change_mean": -0.5233872570097446, + "reward_change_min": -0.9255200996994972, + "reward_change_std": 0.3573854472488165, + "reward_std": 0.8677310347557068, + "rewards/cosine_scaled_reward": -0.021636446937918663, + "rewards/format_reward": 0.8541666865348816, + "step": 476 + }, + { + "advantage_max": 1.8614348471164703, + "advantage_mean": -1.614292433060882e-08, + "advantage_min": -0.8287764713168144, + "advantage_std": 0.9998591169714928, + "completion_length": 1370.2500343322754, + "epoch": 0.5451428571428572, + "grad_norm": 0.8771942853927612, + "kl": 0.14801025390625, + "lambda_div_used": 0.6, + "learning_rate": 1.0578868071715544e-07, + "loss": 0.0059, + "reward": 0.35155012272298336, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.35155012272298336, + "reward_after_std": 0.872353557497263, + "reward_before_mean": 0.9064036831259727, + "reward_before_std": 0.7498830500990152, + "reward_change_max": 0.0010341629385948181, + "reward_change_mean": -0.5548535455018282, + "reward_change_min": -0.925615169107914, + "reward_change_std": 0.3508171420544386, + "reward_std": 0.872353583574295, + "rewards/cosine_scaled_reward": 0.05736849526874721, + "rewards/format_reward": 0.791666679084301, + "step": 477 + }, + { + "advantage_max": 1.912682831287384, + "advantage_mean": -4.967053324200776e-09, + "advantage_min": -0.8365165963768959, + "advantage_std": 0.9998528063297272, + "completion_length": 1837.791732788086, + "epoch": 0.5462857142857143, + "grad_norm": 1.1443203687667847, + "kl": 0.3565864562988281, + "lambda_div_used": 0.6, + "learning_rate": 1.0529722834905125e-07, + "loss": 0.0142, + "reward": 0.192441092338413, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.192441092338413, + "reward_after_std": 0.827689416706562, + "reward_before_mean": 0.6641485160216689, + "reward_before_std": 0.7144726626574993, + "reward_change_max": 0.0, + "reward_change_mean": -0.4717074781656265, + "reward_change_min": -0.7887590229511261, + "reward_change_std": 0.3077526353299618, + "reward_std": 0.8276894614100456, + "rewards/cosine_scaled_reward": -0.0116757289506495, + "rewards/format_reward": 0.687500013038516, + "step": 478 + }, + { + "advantage_max": 1.7926261574029922, + "advantage_mean": 2.1730862664881556e-09, + "advantage_min": -0.9882497265934944, + "advantage_std": 0.9998070746660233, + "completion_length": 2006.4792098999023, + "epoch": 0.5474285714285714, + "grad_norm": 1.2607219219207764, + "kl": 0.40653228759765625, + "lambda_div_used": 0.6, + "learning_rate": 1.0482745016665526e-07, + "loss": 0.0162, + "reward": 0.013332958100363612, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.013332958100363612, + "reward_after_std": 0.6395022124052048, + "reward_before_mean": 0.43424001708626747, + "reward_before_std": 0.6039809063076973, + "reward_change_max": 0.0, + "reward_change_mean": -0.42090705782175064, + "reward_change_min": -0.743909303098917, + "reward_change_std": 0.2912545781582594, + "reward_std": 0.6395022161304951, + "rewards/cosine_scaled_reward": -0.08496332913637161, + "rewards/format_reward": 0.6041666679084301, + "step": 479 + }, + { + "advantage_max": 1.8559697270393372, + "advantage_mean": 2.173086099954702e-09, + "advantage_min": -0.834226630628109, + "advantage_std": 0.999858595430851, + "completion_length": 1779.0833740234375, + "epoch": 0.5485714285714286, + "grad_norm": 1.2886638641357422, + "kl": 0.357391357421875, + "lambda_div_used": 0.6, + "learning_rate": 1.0437936906629334e-07, + "loss": 0.0143, + "reward": 0.138822834007442, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.138822834007442, + "reward_after_std": 0.9744543358683586, + "reward_before_mean": 0.557562917470932, + "reward_before_std": 0.9577418360859156, + "reward_change_max": 0.0003891661763191223, + "reward_change_mean": -0.41874008253216743, + "reward_change_min": -0.8996061198413372, + "reward_change_std": 0.34219310991466045, + "reward_std": 0.9744543917477131, + "rewards/cosine_scaled_reward": -0.0858018803410232, + "rewards/format_reward": 0.7291666772216558, + "step": 480 + }, + { + "advantage_max": 1.8721778094768524, + "advantage_mean": -2.4835269396561444e-09, + "advantage_min": -0.9300287738442421, + "advantage_std": 0.9998296350240707, + "completion_length": 1719.6459045410156, + "epoch": 0.5497142857142857, + "grad_norm": 0.6540005803108215, + "kl": 0.24334716796875, + "lambda_div_used": 0.6, + "learning_rate": 1.0395300688680625e-07, + "loss": 0.0097, + "reward": 0.038552841171622276, + "reward_advantage_correlation": 0.9999999999999996, + "reward_after_mean": 0.038552841171622276, + "reward_after_std": 0.6775308847427368, + "reward_before_mean": 0.4625336639583111, + "reward_before_std": 0.6034595742821693, + "reward_change_max": 0.0, + "reward_change_mean": -0.4239808265119791, + "reward_change_min": -0.7124708220362663, + "reward_change_std": 0.2699653413146734, + "reward_std": 0.6775308921933174, + "rewards/cosine_scaled_reward": -0.12289983592927456, + "rewards/format_reward": 0.7083333507180214, + "step": 481 + }, + { + "advantage_max": 1.7981848269701004, + "advantage_mean": 1.2417634698280722e-09, + "advantage_min": -0.9321806207299232, + "advantage_std": 0.9998157620429993, + "completion_length": 1528.458366394043, + "epoch": 0.5508571428571428, + "grad_norm": 1.0169956684112549, + "kl": 0.2531890869140625, + "lambda_div_used": 0.6, + "learning_rate": 1.0354838440848501e-07, + "loss": 0.0101, + "reward": 0.15139730274677277, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.15139730274677277, + "reward_after_std": 0.7655397467315197, + "reward_before_mean": 0.6284934259019792, + "reward_before_std": 0.7671113889664412, + "reward_change_max": 0.0007704496383666992, + "reward_change_mean": -0.477096114307642, + "reward_change_min": -0.9491989016532898, + "reward_change_std": 0.3575117578729987, + "reward_std": 0.7655397653579712, + "rewards/cosine_scaled_reward": -0.05033663008362055, + "rewards/format_reward": 0.7291666865348816, + "step": 482 + }, + { + "advantage_max": 1.9271509051322937, + "advantage_mean": -1.9247333948868572e-08, + "advantage_min": -0.8351431041955948, + "advantage_std": 0.9998330399394035, + "completion_length": 1956.7917022705078, + "epoch": 0.552, + "grad_norm": 0.846644401550293, + "kl": 0.28060150146484375, + "lambda_div_used": 0.6, + "learning_rate": 1.0316552135205837e-07, + "loss": 0.0112, + "reward": 0.07792708650231361, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.07792708650231361, + "reward_after_std": 0.8151919208467007, + "reward_before_mean": 0.48914189636707306, + "reward_before_std": 0.7106311805546284, + "reward_change_max": 0.0, + "reward_change_mean": -0.41121480986475945, + "reward_change_min": -0.7100636810064316, + "reward_change_std": 0.2673608586192131, + "reward_std": 0.8151919543743134, + "rewards/cosine_scaled_reward": -0.1200123907183297, + "rewards/format_reward": 0.7291666753590107, + "step": 483 + }, + { + "advantage_max": 1.803183764219284, + "advantage_mean": -1.8936892692833496e-08, + "advantage_min": -0.9652373790740967, + "advantage_std": 0.9998325034976006, + "completion_length": 1344.0417022705078, + "epoch": 0.5531428571428572, + "grad_norm": 1.1684057712554932, + "kl": 0.20900726318359375, + "lambda_div_used": 0.6, + "learning_rate": 1.0280443637773163e-07, + "loss": 0.0084, + "reward": 0.35994289815425873, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.35994289815425873, + "reward_after_std": 0.8187261484563351, + "reward_before_mean": 0.9432360231876373, + "reward_before_std": 0.7991822604089975, + "reward_change_max": 0.0, + "reward_change_mean": -0.5832931138575077, + "reward_change_min": -0.9661167785525322, + "reward_change_std": 0.39387095533311367, + "reward_std": 0.8187261670827866, + "rewards/cosine_scaled_reward": 0.04453466390259564, + "rewards/format_reward": 0.854166679084301, + "step": 484 + }, + { + "advantage_max": 1.8188410550355911, + "advantage_mean": -2.0489097307674342e-08, + "advantage_min": -0.9117425456643105, + "advantage_std": 0.999856524169445, + "completion_length": 1547.0834121704102, + "epoch": 0.5542857142857143, + "grad_norm": 1.74229097366333, + "kl": 0.2944793701171875, + "lambda_div_used": 0.6, + "learning_rate": 1.0246514708427701e-07, + "loss": 0.0118, + "reward": 0.23888829350471497, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.23888829350471497, + "reward_after_std": 0.7746790088713169, + "reward_before_mean": 0.7610771879553795, + "reward_before_std": 0.733045406639576, + "reward_change_max": 0.000652313232421875, + "reward_change_mean": -0.5221888944506645, + "reward_change_min": -0.8894664272665977, + "reward_change_std": 0.3509647063910961, + "reward_std": 0.7746790125966072, + "rewards/cosine_scaled_reward": -0.025711423717439175, + "rewards/format_reward": 0.8125000149011612, + "step": 485 + }, + { + "advantage_max": 1.8529712557792664, + "advantage_mean": -8.692343955729598e-09, + "advantage_min": -0.9172825664281845, + "advantage_std": 0.9998394474387169, + "completion_length": 901.3125305175781, + "epoch": 0.5554285714285714, + "grad_norm": 1.0345560312271118, + "kl": 0.11465072631835938, + "lambda_div_used": 0.6, + "learning_rate": 1.0214767000817596e-07, + "loss": 0.0046, + "reward": 0.291486918926239, + "reward_advantage_correlation": 0.9999999999999997, + "reward_after_mean": 0.291486918926239, + "reward_after_std": 0.7153438255190849, + "reward_before_mean": 0.8480473421514034, + "reward_before_std": 0.5964875835925341, + "reward_change_max": 0.0, + "reward_change_mean": -0.5565604045987129, + "reward_change_min": -0.9302693456411362, + "reward_change_std": 0.3404785916209221, + "reward_std": 0.7153438292443752, + "rewards/cosine_scaled_reward": -0.02389301359653473, + "rewards/format_reward": 0.8958333432674408, + "step": 486 + }, + { + "advantage_max": 1.8933127522468567, + "advantage_mean": 1.6763806898190126e-08, + "advantage_min": -0.8865720219910145, + "advantage_std": 0.9998225793242455, + "completion_length": 1088.0625286102295, + "epoch": 0.5565714285714286, + "grad_norm": 1.1974716186523438, + "kl": 0.109893798828125, + "lambda_div_used": 0.6, + "learning_rate": 1.0185202062281336e-07, + "loss": 0.0044, + "reward": 0.4667796200737939, + "reward_advantage_correlation": 0.9999999999999997, + "reward_after_mean": 0.4667796200737939, + "reward_after_std": 0.7072942927479744, + "reward_before_mean": 1.1232407530769706, + "reward_before_std": 0.5628406452015042, + "reward_change_max": 0.0, + "reward_change_mean": -0.6564611494541168, + "reward_change_min": -1.0229570604860783, + "reward_change_std": 0.3858571834862232, + "reward_std": 0.7072943113744259, + "rewards/cosine_scaled_reward": 0.11370370723307133, + "rewards/format_reward": 0.8958333432674408, + "step": 487 + }, + { + "advantage_max": 1.824654459953308, + "advantage_mean": 1.862645193639878e-08, + "advantage_min": -0.9079797342419624, + "advantage_std": 0.9998256489634514, + "completion_length": 1488.770866394043, + "epoch": 0.5577142857142857, + "grad_norm": 1.3172677755355835, + "kl": 0.33730316162109375, + "lambda_div_used": 0.6, + "learning_rate": 1.0157821333772304e-07, + "loss": 0.0135, + "reward": 0.13362685590982437, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.13362685590982437, + "reward_after_std": 0.6905847005546093, + "reward_before_mean": 0.6119700018316507, + "reward_before_std": 0.6388088949024677, + "reward_change_max": 0.0, + "reward_change_mean": -0.47834310680627823, + "reward_change_min": -0.8052879460155964, + "reward_change_std": 0.30606131348758936, + "reward_std": 0.6905847080051899, + "rewards/cosine_scaled_reward": -0.07943168503697962, + "rewards/format_reward": 0.7708333395421505, + "step": 488 + }, + { + "advantage_max": 1.9077175855636597, + "advantage_mean": 1.490116174895917e-08, + "advantage_min": -0.8542089462280273, + "advantage_std": 0.999797873198986, + "completion_length": 1636.25004196167, + "epoch": 0.5588571428571428, + "grad_norm": 1.2856013774871826, + "kl": 0.2859954833984375, + "lambda_div_used": 0.6, + "learning_rate": 1.013262614978859e-07, + "loss": 0.0114, + "reward": -0.1691692327876808, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": -0.1691692327876808, + "reward_after_std": 0.5174032747745514, + "reward_before_mean": 0.1689762193709612, + "reward_before_std": 0.4460158832371235, + "reward_change_max": 0.0016556233167648315, + "reward_change_mean": -0.3381454488262534, + "reward_change_min": -0.5612492710351944, + "reward_change_std": 0.20866003772243857, + "reward_std": 0.517403282225132, + "rewards/cosine_scaled_reward": -0.31134524568915367, + "rewards/format_reward": 0.7916666846722364, + "step": 489 + }, + { + "advantage_max": 1.9346612095832825, + "advantage_mean": 9.313225746154785e-09, + "advantage_min": -0.7759344056248665, + "advantage_std": 0.9998493641614914, + "completion_length": 1638.666690826416, + "epoch": 0.56, + "grad_norm": 0.999034583568573, + "kl": 0.37992095947265625, + "lambda_div_used": 0.6, + "learning_rate": 1.0109617738307911e-07, + "loss": 0.0152, + "reward": 0.2699229356367141, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.2699229356367141, + "reward_after_std": 0.8593488521873951, + "reward_before_mean": 0.7801721151918173, + "reward_before_std": 0.7295130789279938, + "reward_change_max": 0.0, + "reward_change_mean": -0.5102491900324821, + "reward_change_min": -0.8690995946526527, + "reward_change_std": 0.322820819914341, + "reward_std": 0.8593488857150078, + "rewards/cosine_scaled_reward": -0.01616394752636552, + "rewards/format_reward": 0.8125000111758709, + "step": 490 + }, + { + "advantage_max": 1.8448296338319778, + "advantage_mean": 2.1109978987077227e-08, + "advantage_min": -0.8977131173014641, + "advantage_std": 0.9998864755034447, + "completion_length": 1702.6250686645508, + "epoch": 0.5611428571428572, + "grad_norm": 1.1440184116363525, + "kl": 0.2804298400878906, + "lambda_div_used": 0.6, + "learning_rate": 1.0088797220727779e-07, + "loss": 0.0112, + "reward": 0.40567315742373466, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.40567315742373466, + "reward_after_std": 0.9561508670449257, + "reward_before_mean": 0.9794415173455491, + "reward_before_std": 0.890728622674942, + "reward_change_max": 0.00013072043657302856, + "reward_change_mean": -0.5737683735787868, + "reward_change_min": -1.022655088454485, + "reward_change_std": 0.392275283113122, + "reward_std": 0.9561508968472481, + "rewards/cosine_scaled_reward": 0.10430408432148397, + "rewards/format_reward": 0.7708333432674408, + "step": 491 + }, + { + "advantage_max": 1.8528670370578766, + "advantage_mean": -8.265487849712372e-09, + "advantage_min": -0.9619855433702469, + "advantage_std": 0.9998434036970139, + "completion_length": 1238.020881652832, + "epoch": 0.5622857142857143, + "grad_norm": 1.146223783493042, + "kl": 0.2194671630859375, + "lambda_div_used": 0.6, + "learning_rate": 1.0070165611810855e-07, + "loss": 0.0088, + "reward": 0.17629980109632015, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.17629980109632015, + "reward_after_std": 0.7229134701192379, + "reward_before_mean": 0.664001327008009, + "reward_before_std": 0.6515200156718493, + "reward_change_max": 0.003348402678966522, + "reward_change_mean": -0.48770153522491455, + "reward_change_min": -0.759284932166338, + "reward_change_std": 0.31521399691700935, + "reward_std": 0.7229134850203991, + "rewards/cosine_scaled_reward": -0.02216600440442562, + "rewards/format_reward": 0.7083333358168602, + "step": 492 + }, + { + "advantage_max": 1.845920279622078, + "advantage_mean": -2.421438827227007e-08, + "advantage_min": -0.9289551004767418, + "advantage_std": 0.9998549744486809, + "completion_length": 1406.0417022705078, + "epoch": 0.5634285714285714, + "grad_norm": 1.184366226196289, + "kl": 0.219512939453125, + "lambda_div_used": 0.6, + "learning_rate": 1.005372381963547e-07, + "loss": 0.0088, + "reward": 0.3451455421745777, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.3451455421745777, + "reward_after_std": 0.794960230588913, + "reward_before_mean": 0.914349190890789, + "reward_before_std": 0.6987705379724503, + "reward_change_max": 0.0, + "reward_change_mean": -0.5692036896944046, + "reward_change_min": -0.9593402594327927, + "reward_change_std": 0.36596825532615185, + "reward_std": 0.7949602715671062, + "rewards/cosine_scaled_reward": 0.019674593582749367, + "rewards/format_reward": 0.8750000149011612, + "step": 493 + }, + { + "advantage_max": 1.8998329788446426, + "advantage_mean": -8.07146260939362e-09, + "advantage_min": -0.7808930799365044, + "advantage_std": 0.9998975768685341, + "completion_length": 1432.9375381469727, + "epoch": 0.5645714285714286, + "grad_norm": 1.0612187385559082, + "kl": 0.2517433166503906, + "lambda_div_used": 0.6, + "learning_rate": 1.0039472645551372e-07, + "loss": 0.01, + "reward": 0.3418361786752939, + "reward_advantage_correlation": 1.0, + "reward_after_mean": 0.3418361786752939, + "reward_after_std": 1.0622280165553093, + "reward_before_mean": 0.8451494723558426, + "reward_before_std": 0.9670507088303566, + "reward_change_max": 0.0, + "reward_change_mean": -0.5033133029937744, + "reward_change_min": -0.9100279286503792, + "reward_change_std": 0.33215487375855446, + "reward_std": 1.0622280314564705, + "rewards/cosine_scaled_reward": -0.03575861267745495, + "rewards/format_reward": 0.9166666865348816, + "step": 494 + }, + { + "advantage_max": 1.8994756937026978, + "advantage_mean": -9.934107758624577e-09, + "advantage_min": -0.8431757986545563, + "advantage_std": 0.999841496348381, + "completion_length": 1628.3542098999023, + "epoch": 0.5657142857142857, + "grad_norm": 1.2267202138900757, + "kl": 0.2889556884765625, + "lambda_div_used": 0.6, + "learning_rate": 1.002741278414069e-07, + "loss": 0.0116, + "reward": 0.2072940650396049, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.2072940650396049, + "reward_after_std": 0.8287488594651222, + "reward_before_mean": 0.689543791115284, + "reward_before_std": 0.7430235873907804, + "reward_change_max": 0.0, + "reward_change_mean": -0.4822497144341469, + "reward_change_min": -0.8137565180659294, + "reward_change_std": 0.3069156575948, + "reward_std": 0.828748881816864, + "rewards/cosine_scaled_reward": -0.040644790045917034, + "rewards/format_reward": 0.7708333376795053, + "step": 495 + }, + { + "advantage_max": 1.8756288141012192, + "advantage_mean": -2.9414272129102415e-08, + "advantage_min": -0.8122064806520939, + "advantage_std": 0.9998170509934425, + "completion_length": 1611.4375076293945, + "epoch": 0.5668571428571428, + "grad_norm": 0.8443682193756104, + "kl": 0.2679557800292969, + "lambda_div_used": 0.6, + "learning_rate": 1.0017544823184055e-07, + "loss": 0.0107, + "reward": 0.3717600470408797, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.3717600470408797, + "reward_after_std": 0.6946693547070026, + "reward_before_mean": 0.9796913452446461, + "reward_before_std": 0.5551566991489381, + "reward_change_max": 9.44286584854126e-05, + "reward_change_mean": -0.6079313308000565, + "reward_change_min": -0.9462003000080585, + "reward_change_std": 0.37002974562346935, + "reward_std": 0.694669384509325, + "rewards/cosine_scaled_reward": 0.07317899935878813, + "rewards/format_reward": 0.8333333395421505, + "step": 496 + }, + { + "advantage_max": 1.8541768789291382, + "advantage_mean": -1.241763691872677e-09, + "advantage_min": -0.912086084485054, + "advantage_std": 0.9998577758669853, + "completion_length": 1330.3542175292969, + "epoch": 0.568, + "grad_norm": 0.8634279370307922, + "kl": 0.2939910888671875, + "lambda_div_used": 0.6, + "learning_rate": 1.0009869243631952e-07, + "loss": 0.0117, + "reward": 0.4358122395351529, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.4358122395351529, + "reward_after_std": 0.8337152674794197, + "reward_before_mean": 1.0514396652579308, + "reward_before_std": 0.7460405230522156, + "reward_change_max": 0.0, + "reward_change_mean": -0.6156274164095521, + "reward_change_min": -1.005971860140562, + "reward_change_std": 0.39167843107134104, + "reward_std": 0.8337152861058712, + "rewards/cosine_scaled_reward": 0.09863647632300854, + "rewards/format_reward": 0.8541666772216558, + "step": 497 + }, + { + "advantage_max": 1.8234520852565765, + "advantage_mean": 2.421438694000244e-08, + "advantage_min": -0.9808195158839226, + "advantage_std": 0.9998212158679962, + "completion_length": 1489.4792251586914, + "epoch": 0.5691428571428572, + "grad_norm": 1.3871195316314697, + "kl": 0.23724365234375, + "lambda_div_used": 0.6, + "learning_rate": 1.000438641958131e-07, + "loss": 0.0095, + "reward": 0.07590579707175493, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.07590579707175493, + "reward_after_std": 0.8010249249637127, + "reward_before_mean": 0.49792688991874456, + "reward_before_std": 0.7750258985906839, + "reward_change_max": 0.001229986548423767, + "reward_change_mean": -0.42202106304466724, + "reward_change_min": -0.7636589221656322, + "reward_change_std": 0.29837763123214245, + "reward_std": 0.801024928689003, + "rewards/cosine_scaled_reward": -0.09478656761348248, + "rewards/format_reward": 0.6875000204890966, + "step": 498 + }, + { + "advantage_max": 1.8711726665496826, + "advantage_mean": -1.7384688466570708e-08, + "advantage_min": -0.9095111042261124, + "advantage_std": 0.9998540803790092, + "completion_length": 1537.0416946411133, + "epoch": 0.5702857142857143, + "grad_norm": 1.4712544679641724, + "kl": 0.21712112426757812, + "lambda_div_used": 0.6, + "learning_rate": 1.0001096618257236e-07, + "loss": 0.0087, + "reward": 0.2286053616553545, + "reward_advantage_correlation": 0.9999999999999999, + "reward_after_mean": 0.2286053616553545, + "reward_after_std": 0.8291343785822392, + "reward_before_mean": 0.7263824446126819, + "reward_before_std": 0.7433257922530174, + "reward_change_max": 0.0, + "reward_change_mean": -0.4977770820260048, + "reward_change_min": -0.8215654790401459, + "reward_change_std": 0.3198694474995136, + "reward_std": 0.8291344232857227, + "rewards/cosine_scaled_reward": -0.04305878991726786, + "rewards/format_reward": 0.8125000111758709, + "step": 499 + }, + { + "advantage_max": 1.8898231089115143, + "advantage_mean": -9.934107980669182e-09, + "advantage_min": -0.8911553248763084, + "advantage_std": 0.9998495951294899, + "completion_length": 1224.2291793823242, + "epoch": 0.5714285714285714, + "grad_norm": 1.1404680013656616, + "kl": 0.22232437133789062, + "lambda_div_used": 0.6, + "learning_rate": 1e-07, + "loss": 0.0089, + "reward": 0.28170950431376696, + "reward_advantage_correlation": 0.9999999999999998, + "reward_after_mean": 0.28170950431376696, + "reward_after_std": 0.8098886050283909, + "reward_before_mean": 0.8126306012272835, + "reward_before_std": 0.7228812500834465, + "reward_change_max": 0.0, + "reward_change_mean": -0.5309210903942585, + "reward_change_min": -0.8794347941875458, + "reward_change_std": 0.3362457137554884, + "reward_std": 0.8098886422812939, + "rewards/cosine_scaled_reward": -0.04160138592123985, + "rewards/format_reward": 0.8958333507180214, + "step": 500 + }, + { + "epoch": 0.5714285714285714, + "step": 500, + "total_flos": 0.0, + "train_loss": 0.0016669491224483837, + "train_runtime": 56169.988, + "train_samples_per_second": 0.427, + "train_steps_per_second": 0.009 + } + ], + "logging_steps": 1, + "max_steps": 500, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 50, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 6, + "trial_name": null, + "trial_params": null +}