{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.5714285714285714, "eval_steps": 500, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "advantage_max": 1.5351008400321007, "advantage_mean": 1.6142925107764938e-08, "advantage_min": -0.7533128149807453, "advantage_std": 0.8219119422137737, "completion_length": 2571.2083587646484, "epoch": 0.001142857142857143, "grad_norm": 0.11749051511287689, "kl": 0.0, "lambda_div_used": 0.5, "learning_rate": 2e-08, "loss": 0.0601, "reward": -0.03908593417145312, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.03908593417145312, "reward_after_std": 0.8219119422137737, "reward_before_mean": 0.4897647276520729, "reward_before_std": 0.8290339298546314, "reward_change_max": 0.0007017925381660461, "reward_change_mean": -0.5288506411015987, "reward_change_min": -1.0365500748157501, "reward_change_std": 0.4204680975526571, "reward_std": 0.8219119869172573, "rewards/cosine_scaled_reward": -0.015534311532974243, "rewards/format_reward": 0.5208333488553762, "step": 1 }, { "advantage_max": 0.9172319918870926, "advantage_mean": 1.8626452047421083e-08, "advantage_min": -0.43226177990436554, "advantage_std": 0.4922399129718542, "completion_length": 2804.395881652832, "epoch": 0.002285714285714286, "grad_norm": 0.06315363943576813, "kl": 0.0, "lambda_div_used": 0.5, "learning_rate": 4e-08, "loss": 0.0237, "reward": -0.21404163353145123, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.21404163353145123, "reward_after_std": 0.4922399129718542, "reward_before_mean": 0.27539755403995514, "reward_before_std": 0.42092561535537243, "reward_change_max": 0.001632794737815857, "reward_change_mean": -0.48943919129669666, "reward_change_min": -0.7970554456114769, "reward_change_std": 0.3251637788489461, "reward_std": 0.4922399166971445, "rewards/cosine_scaled_reward": -0.04980122856795788, "rewards/format_reward": 0.37500000558793545, "step": 2 }, { "advantage_max": 0.9185556918382645, "advantage_mean": 1.8626452102932234e-08, "advantage_min": -0.4186497926712036, "advantage_std": 0.49275562539696693, "completion_length": 3346.6458740234375, "epoch": 0.0034285714285714284, "grad_norm": 0.08308498561382294, "kl": 4.464387893676758e-05, "lambda_div_used": 0.5, "learning_rate": 6e-08, "loss": 0.0288, "reward": -0.4993674159049988, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.4993674159049988, "reward_after_std": 0.49275562167167664, "reward_before_mean": -0.2508781077340245, "reward_before_std": 0.5146373584866524, "reward_change_max": 0.0006987899541854858, "reward_change_mean": -0.24848931096494198, "reward_change_min": -0.5916367769241333, "reward_change_std": 0.23015559278428555, "reward_std": 0.49275562912225723, "rewards/cosine_scaled_reward": -0.18793905060738325, "rewards/format_reward": 0.1250000037252903, "step": 3 }, { "advantage_max": 1.6362370401620865, "advantage_mean": 2.4835269951672956e-09, "advantage_min": -0.692343682050705, "advantage_std": 0.8636160232126713, "completion_length": 2088.5208587646484, "epoch": 0.004571428571428572, "grad_norm": 0.14335113763809204, "kl": 4.871189594268799e-05, "lambda_div_used": 0.5, "learning_rate": 8e-08, "loss": 0.05, "reward": 0.0002058250829577446, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.0002058250829577446, "reward_after_std": 0.8636160232126713, "reward_before_mean": 0.5427898876369, "reward_before_std": 0.8250438794493675, "reward_change_max": 0.0, "reward_change_mean": -0.5425840523093939, "reward_change_min": -1.0156159922480583, "reward_change_std": 0.4061375632882118, "reward_std": 0.8636160306632519, "rewards/cosine_scaled_reward": -0.061938409227877855, "rewards/format_reward": 0.6666666753590107, "step": 4 }, { "advantage_max": 1.400401022285223, "advantage_mean": -1.4280279680978225e-08, "advantage_min": -0.5432489290833473, "advantage_std": 0.726759284734726, "completion_length": 3397.4583435058594, "epoch": 0.005714285714285714, "grad_norm": 0.1702452301979065, "kl": 4.617869853973389e-05, "lambda_div_used": 0.5, "learning_rate": 1e-07, "loss": 0.0404, "reward": -0.4024368515238166, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.4024368515238166, "reward_after_std": 0.7267592810094357, "reward_before_mean": -0.15027659479528666, "reward_before_std": 0.735253743827343, "reward_change_max": 0.0012489557266235352, "reward_change_mean": -0.25216028839349747, "reward_change_min": -0.6197184585034847, "reward_change_std": 0.25399384275078773, "reward_std": 0.7267593070864677, "rewards/cosine_scaled_reward": -0.179304969497025, "rewards/format_reward": 0.2083333358168602, "step": 5 }, { "advantage_max": 1.512929029762745, "advantage_mean": 9.934107758624577e-09, "advantage_min": -0.4930929020047188, "advantage_std": 0.7648793570697308, "completion_length": 3090.3333587646484, "epoch": 0.006857142857142857, "grad_norm": 0.14592202007770538, "kl": 4.194676876068115e-05, "lambda_div_used": 0.5, "learning_rate": 1.2e-07, "loss": 0.0437, "reward": -0.35319698275998235, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.35319698275998235, "reward_after_std": 0.7648793533444405, "reward_before_mean": -0.08074576873332262, "reward_before_std": 0.7240379452705383, "reward_change_max": 0.0034214183688163757, "reward_change_mean": -0.27245120890438557, "reward_change_min": -0.5420073866844177, "reward_change_std": 0.21257336740382016, "reward_std": 0.7648793831467628, "rewards/cosine_scaled_reward": -0.17578955832868814, "rewards/format_reward": 0.2708333358168602, "step": 6 }, { "advantage_max": 1.2516018003225327, "advantage_mean": -2.4835269396561444e-09, "advantage_min": -0.6208689622581005, "advantage_std": 0.6690351134166121, "completion_length": 3066.7708587646484, "epoch": 0.008, "grad_norm": 0.0987529531121254, "kl": 2.1673738956451416e-05, "lambda_div_used": 0.5, "learning_rate": 1.4e-07, "loss": 0.0107, "reward": -0.22521874122321606, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.22521874122321606, "reward_after_std": 0.6690351143479347, "reward_before_mean": 0.20237011834979057, "reward_before_std": 0.6675894744694233, "reward_change_max": 0.00040956586599349976, "reward_change_mean": -0.42758889915421605, "reward_change_min": -0.8027475290000439, "reward_change_std": 0.3406977616250515, "reward_std": 0.6690351590514183, "rewards/cosine_scaled_reward": -0.18006493523716927, "rewards/format_reward": 0.5625000055879354, "step": 7 }, { "advantage_max": 1.648353137075901, "advantage_mean": -1.738468857759301e-08, "advantage_min": -0.6211307123303413, "advantage_std": 0.8522039614617825, "completion_length": 2743.3333892822266, "epoch": 0.009142857142857144, "grad_norm": 0.13271355628967285, "kl": 3.2164156436920166e-05, "lambda_div_used": 0.5, "learning_rate": 1.6e-07, "loss": 0.0114, "reward": 0.09091248735785484, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.09091248735785484, "reward_after_std": 0.8522039651870728, "reward_before_mean": 0.7095784271750745, "reward_before_std": 0.7119280910119414, "reward_change_max": 0.0019819363951683044, "reward_change_mean": -0.6186658814549446, "reward_change_min": -1.01032629981637, "reward_change_std": 0.40532769449055195, "reward_std": 0.8522040024399757, "rewards/cosine_scaled_reward": 0.11520583834499121, "rewards/format_reward": 0.47916666977107525, "step": 8 }, { "advantage_max": 1.3571566194295883, "advantage_mean": -7.450580541412677e-09, "advantage_min": -0.6235329136252403, "advantage_std": 0.7175218462944031, "completion_length": 3166.604217529297, "epoch": 0.010285714285714285, "grad_norm": 0.1199144497513771, "kl": 4.5552849769592285e-05, "lambda_div_used": 0.5, "learning_rate": 1.8e-07, "loss": 0.0728, "reward": -0.2478786800056696, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.2478786800056696, "reward_after_std": 0.7175218313932419, "reward_before_mean": 0.13642565836198628, "reward_before_std": 0.7206410467624664, "reward_change_max": 0.0011469870805740356, "reward_change_mean": -0.3843043278902769, "reward_change_min": -0.7558578066527843, "reward_change_std": 0.31058289762586355, "reward_std": 0.7175218351185322, "rewards/cosine_scaled_reward": -0.07762051094323397, "rewards/format_reward": 0.2916666753590107, "step": 9 }, { "advantage_max": 0.8546838238835335, "advantage_mean": 5.587935669737476e-09, "advantage_min": -0.44723255559802055, "advantage_std": 0.4613385181874037, "completion_length": 2646.687515258789, "epoch": 0.011428571428571429, "grad_norm": 0.05081493407487869, "kl": 2.671964466571808e-05, "lambda_div_used": 0.5, "learning_rate": 2e-07, "loss": 0.0109, "reward": -0.3753599179908633, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.3753599179908633, "reward_after_std": 0.46133850887417793, "reward_before_mean": -0.012189794331789017, "reward_before_std": 0.4536908529698849, "reward_change_max": 0.0008832141757011414, "reward_change_mean": -0.36317012226209044, "reward_change_min": -0.6532888412475586, "reward_change_std": 0.2652512276545167, "reward_std": 0.46133851259946823, "rewards/cosine_scaled_reward": -0.1935949008911848, "rewards/format_reward": 0.37500000558793545, "step": 10 }, { "advantage_max": 1.4129405990242958, "advantage_mean": 1.3659398279131096e-08, "advantage_min": -0.5573535785079002, "advantage_std": 0.7481464147567749, "completion_length": 3410.3541870117188, "epoch": 0.012571428571428572, "grad_norm": 0.14011387526988983, "kl": 3.7103891372680664e-05, "lambda_div_used": 0.5, "learning_rate": 2.1999999999999998e-07, "loss": 0.0477, "reward": -0.3787131551653147, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.3787131551653147, "reward_after_std": 0.7481464073061943, "reward_before_mean": -0.11150018568150699, "reward_before_std": 0.7933910526335239, "reward_change_max": 0.0027255788445472717, "reward_change_mean": -0.26721295714378357, "reward_change_min": -0.7168385572731495, "reward_change_std": 0.2900985237210989, "reward_std": 0.7481464371085167, "rewards/cosine_scaled_reward": -0.17033343017101288, "rewards/format_reward": 0.2291666716337204, "step": 11 }, { "advantage_max": 1.2863394618034363, "advantage_mean": -1.2417633032946185e-09, "advantage_min": -0.4983537904918194, "advantage_std": 0.6565282978117466, "completion_length": 2601.0834045410156, "epoch": 0.013714285714285714, "grad_norm": 0.08084140717983246, "kl": 3.930681850761175e-05, "lambda_div_used": 0.5, "learning_rate": 2.4e-07, "loss": 0.014, "reward": -0.05203055217862129, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.05203055217862129, "reward_after_std": 0.6565283127129078, "reward_before_mean": 0.5094301174394786, "reward_before_std": 0.5098936408758163, "reward_change_max": 0.0, "reward_change_mean": -0.561460705474019, "reward_change_min": -0.8758684396743774, "reward_change_std": 0.3374221920967102, "reward_std": 0.656528327614069, "rewards/cosine_scaled_reward": -0.07861827686429024, "rewards/format_reward": 0.6666666716337204, "step": 12 }, { "advantage_max": 1.375102460384369, "advantage_mean": 1.5522043039783995e-08, "advantage_min": -0.6102774068713188, "advantage_std": 0.7467358633875847, "completion_length": 3022.5833587646484, "epoch": 0.014857142857142857, "grad_norm": 0.14866332709789276, "kl": 3.300607204437256e-05, "lambda_div_used": 0.5, "learning_rate": 2.6e-07, "loss": 0.0596, "reward": -0.22573862690478563, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.22573862690478563, "reward_after_std": 0.7467358633875847, "reward_before_mean": 0.17446773871779442, "reward_before_std": 0.7973835095763206, "reward_change_max": 0.0012500211596488953, "reward_change_mean": -0.40020634327083826, "reward_change_min": -0.9575489908456802, "reward_change_std": 0.3800930418074131, "reward_std": 0.746735867112875, "rewards/cosine_scaled_reward": -0.06901614367961884, "rewards/format_reward": 0.3125000037252903, "step": 13 }, { "advantage_max": 1.6066002435982227, "advantage_mean": 6.829698695476338e-09, "advantage_min": -0.5994165241718292, "advantage_std": 0.8288106508553028, "completion_length": 2782.5833892822266, "epoch": 0.016, "grad_norm": 0.17648428678512573, "kl": 3.495439887046814e-05, "lambda_div_used": 0.5, "learning_rate": 2.8e-07, "loss": 0.0814, "reward": -0.2345301266759634, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.2345301266759634, "reward_after_std": 0.8288106359541416, "reward_before_mean": 0.12168530002236366, "reward_before_std": 0.7971222475171089, "reward_change_max": 0.0022324323654174805, "reward_change_mean": -0.35621544159948826, "reward_change_min": -0.7587636262178421, "reward_change_std": 0.3039133660495281, "reward_std": 0.828810652717948, "rewards/cosine_scaled_reward": -0.1266573565080762, "rewards/format_reward": 0.37500000558793545, "step": 14 }, { "advantage_max": 1.2302554100751877, "advantage_mean": 1.6763807453301638e-08, "advantage_min": -0.4796036444604397, "advantage_std": 0.6268669404089451, "completion_length": 2698.1041984558105, "epoch": 0.017142857142857144, "grad_norm": 0.08298429101705551, "kl": 2.790987491607666e-05, "lambda_div_used": 0.5, "learning_rate": 3e-07, "loss": 0.0127, "reward": -0.13335783407092094, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.13335783407092094, "reward_after_std": 0.6268669404089451, "reward_before_mean": 0.37199926376342773, "reward_before_std": 0.4883376806974411, "reward_change_max": 0.0012028142809867859, "reward_change_mean": -0.5053570671007037, "reward_change_min": -0.7833587303757668, "reward_change_std": 0.3134935852140188, "reward_std": 0.6268669553101063, "rewards/cosine_scaled_reward": -0.022333701490424573, "rewards/format_reward": 0.4166666716337204, "step": 15 }, { "advantage_max": 0.58426259085536, "advantage_mean": 2.297262391426358e-08, "advantage_min": -0.2842987850308418, "advantage_std": 0.31660328805446625, "completion_length": 3563.5625, "epoch": 0.018285714285714287, "grad_norm": 0.0470166839659214, "kl": 3.844499588012695e-05, "lambda_div_used": 0.5, "learning_rate": 3.2e-07, "loss": 0.0043, "reward": -0.6709575429558754, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.6709575429558754, "reward_after_std": 0.3166032861918211, "reward_before_mean": -0.5079018995165825, "reward_before_std": 0.33846721425652504, "reward_change_max": 0.0015065670013427734, "reward_change_mean": -0.1630556397140026, "reward_change_min": -0.36466383934020996, "reward_change_std": 0.15320970956236124, "reward_std": 0.31660328805446625, "rewards/cosine_scaled_reward": -0.26436761766672134, "rewards/format_reward": 0.02083333395421505, "step": 16 }, { "advantage_max": 1.545443370938301, "advantage_mean": -9.934107758624577e-09, "advantage_min": -0.5599127858877182, "advantage_std": 0.8097105287015438, "completion_length": 2066.9375534057617, "epoch": 0.019428571428571427, "grad_norm": 0.1167609766125679, "kl": 2.705492079257965e-05, "lambda_div_used": 0.5, "learning_rate": 3.4000000000000003e-07, "loss": 0.0423, "reward": -0.019218791276216507, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.019218791276216507, "reward_after_std": 0.8097105361521244, "reward_before_mean": 0.5264171995222569, "reward_before_std": 0.7449488136917353, "reward_change_max": 0.0, "reward_change_mean": -0.5456360150128603, "reward_change_min": -1.1235605031251907, "reward_change_std": 0.39495558850467205, "reward_std": 0.8097105547785759, "rewards/cosine_scaled_reward": -0.03887474234215915, "rewards/format_reward": 0.6041666697710752, "step": 17 }, { "advantage_max": 1.6779102236032486, "advantage_mean": 9.93410786964688e-09, "advantage_min": -0.6428487151861191, "advantage_std": 0.862780749797821, "completion_length": 3074.3958892822266, "epoch": 0.02057142857142857, "grad_norm": 0.17488229274749756, "kl": 2.652546390891075e-05, "lambda_div_used": 0.5, "learning_rate": 3.6e-07, "loss": 0.0487, "reward": -0.19576671486720443, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.19576671486720443, "reward_after_std": 0.8627807684242725, "reward_before_mean": 0.18335069436579943, "reward_before_std": 0.8314383886754513, "reward_change_max": 0.0014890357851982117, "reward_change_mean": -0.3791173882782459, "reward_change_min": -0.7256179824471474, "reward_change_std": 0.28984588757157326, "reward_std": 0.8627808056771755, "rewards/cosine_scaled_reward": -0.08540799282491207, "rewards/format_reward": 0.35416667349636555, "step": 18 }, { "advantage_max": 2.1795709654688835, "advantage_mean": 1.862645149230957e-09, "advantage_min": -0.8414489030838013, "advantage_std": 1.1247562803328037, "completion_length": 3021.4583892822266, "epoch": 0.021714285714285714, "grad_norm": 0.20591142773628235, "kl": 2.354755997657776e-05, "lambda_div_used": 0.5, "learning_rate": 3.7999999999999996e-07, "loss": 0.0303, "reward": 0.12146518751978874, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.12146518751978874, "reward_after_std": 1.124756295233965, "reward_before_mean": 0.6802486801752821, "reward_before_std": 1.0683305263519287, "reward_change_max": 0.0007704496383666992, "reward_change_mean": -0.5587835060432553, "reward_change_min": -1.0479483902454376, "reward_change_std": 0.41728440672159195, "reward_std": 1.1247563175857067, "rewards/cosine_scaled_reward": 0.13179101014975458, "rewards/format_reward": 0.41666667722165585, "step": 19 }, { "advantage_max": 1.4769879020750523, "advantage_mean": -1.8626453157644107e-09, "advantage_min": -0.6367698088288307, "advantage_std": 0.7875895667821169, "completion_length": 2265.1666946411133, "epoch": 0.022857142857142857, "grad_norm": 0.12089281529188156, "kl": 1.477077603340149e-05, "lambda_div_used": 0.5, "learning_rate": 4e-07, "loss": 0.0535, "reward": 0.04904268682003021, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.04904268682003021, "reward_after_std": 0.787589592859149, "reward_before_mean": 0.658054169267416, "reward_before_std": 0.7119925869628787, "reward_change_max": 0.0, "reward_change_mean": -0.6090115122497082, "reward_change_min": -1.0665437504649162, "reward_change_std": 0.43563584610819817, "reward_std": 0.7875896263867617, "rewards/cosine_scaled_reward": -0.025139580480754375, "rewards/format_reward": 0.7083333432674408, "step": 20 }, { "advantage_max": 1.2946437485516071, "advantage_mean": 1.117587122845265e-08, "advantage_min": -0.6569844149053097, "advantage_std": 0.689958855509758, "completion_length": 2700.2916870117188, "epoch": 0.024, "grad_norm": 0.0916251540184021, "kl": 3.0294060707092285e-05, "lambda_div_used": 0.5, "learning_rate": 4.1999999999999995e-07, "loss": 0.0295, "reward": -0.12843544664792717, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.12843544664792717, "reward_after_std": 0.6899588741362095, "reward_before_mean": 0.3676719907671213, "reward_before_std": 0.673046289011836, "reward_change_max": 0.0006017163395881653, "reward_change_mean": -0.49610744789242744, "reward_change_min": -0.898295234888792, "reward_change_std": 0.35666080191731453, "reward_std": 0.6899588778614998, "rewards/cosine_scaled_reward": -0.024497329257428646, "rewards/format_reward": 0.4166666716337204, "step": 21 }, { "advantage_max": 1.4664242267608643, "advantage_mean": -2.6077032533322608e-08, "advantage_min": -0.6705172508955002, "advantage_std": 0.7772407494485378, "completion_length": 1715.1666870117188, "epoch": 0.025142857142857144, "grad_norm": 0.09739455580711365, "kl": 1.611793413758278e-05, "lambda_div_used": 0.5, "learning_rate": 4.3999999999999997e-07, "loss": -0.0015, "reward": 0.16622705105692148, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.16622705105692148, "reward_after_std": 0.7772407718002796, "reward_before_mean": 0.8788988022133708, "reward_before_std": 0.6709907222539186, "reward_change_max": 0.0, "reward_change_mean": -0.7126717567443848, "reward_change_min": -1.252291426062584, "reward_change_std": 0.47464586794376373, "reward_std": 0.7772407941520214, "rewards/cosine_scaled_reward": 0.054032716900110245, "rewards/format_reward": 0.7708333395421505, "step": 22 }, { "advantage_max": 1.510243035852909, "advantage_mean": 1.2417632477834672e-09, "advantage_min": -0.6073393523693085, "advantage_std": 0.7824924364686012, "completion_length": 2540.9166870117188, "epoch": 0.026285714285714287, "grad_norm": 0.09775416553020477, "kl": 2.3233238607645035e-05, "lambda_div_used": 0.5, "learning_rate": 4.6e-07, "loss": 0.0514, "reward": -0.19087476283311844, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.19087476283311844, "reward_after_std": 0.7824924141168594, "reward_before_mean": 0.21587533503770828, "reward_before_std": 0.7406318411231041, "reward_change_max": 0.0006575360894203186, "reward_change_mean": -0.4067500773817301, "reward_change_min": -0.8022553063929081, "reward_change_std": 0.3064673077315092, "reward_std": 0.782492458820343, "rewards/cosine_scaled_reward": -0.12122901016846299, "rewards/format_reward": 0.4583333395421505, "step": 23 }, { "advantage_max": 1.8014312759041786, "advantage_mean": 4.967053990334591e-09, "advantage_min": -0.9172608628869057, "advantage_std": 0.9784793332219124, "completion_length": 2844.6459045410156, "epoch": 0.027428571428571427, "grad_norm": 0.15517903864383698, "kl": 2.2795749828219414e-05, "lambda_div_used": 0.5, "learning_rate": 4.8e-07, "loss": 0.0567, "reward": 0.015449069440364838, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.015449069440364838, "reward_after_std": 0.9784793667495251, "reward_before_mean": 0.5478918794542551, "reward_before_std": 1.0509838238358498, "reward_change_max": 0.0003214925527572632, "reward_change_mean": -0.5324427876621485, "reward_change_min": -1.174141988158226, "reward_change_std": 0.4900853671133518, "reward_std": 0.9784793853759766, "rewards/cosine_scaled_reward": 0.03436260763555765, "rewards/format_reward": 0.4791666828095913, "step": 24 }, { "advantage_max": 1.2840170040726662, "advantage_mean": 5.587935614226325e-09, "advantage_min": -0.6295462027192116, "advantage_std": 0.680108830332756, "completion_length": 2679.562530517578, "epoch": 0.02857142857142857, "grad_norm": 0.09606768935918808, "kl": 2.7861446142196655e-05, "lambda_div_used": 0.5, "learning_rate": 5e-07, "loss": -0.0088, "reward": -0.2081345096230507, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.2081345096230507, "reward_after_std": 0.6801088340580463, "reward_before_mean": 0.21977244596928358, "reward_before_std": 0.6630591489374638, "reward_change_max": 0.000768907368183136, "reward_change_mean": -0.42790696490556, "reward_change_min": -0.7866359949111938, "reward_change_std": 0.3270989526063204, "reward_std": 0.6801088377833366, "rewards/cosine_scaled_reward": -0.10886377561837435, "rewards/format_reward": 0.43750001303851604, "step": 25 }, { "advantage_max": 1.1658898368477821, "advantage_mean": -7.450581041013038e-09, "advantage_min": -0.5475185066461563, "advantage_std": 0.6162115931510925, "completion_length": 2960.6458740234375, "epoch": 0.029714285714285714, "grad_norm": 0.07693036645650864, "kl": 2.6431865990161896e-05, "lambda_div_used": 0.5, "learning_rate": 5.2e-07, "loss": 0.0079, "reward": -0.09086161851882935, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.09086161851882935, "reward_after_std": 0.6162116080522537, "reward_before_mean": 0.4600065350532532, "reward_before_std": 0.5263635087758303, "reward_change_max": 0.0, "reward_change_mean": -0.5508681740611792, "reward_change_min": -0.8693654052913189, "reward_change_std": 0.3519774377346039, "reward_std": 0.6162116266787052, "rewards/cosine_scaled_reward": 0.011253247037529945, "rewards/format_reward": 0.4375000074505806, "step": 26 }, { "advantage_max": 1.5078487060964108, "advantage_mean": 1.3814618671226242e-08, "advantage_min": -0.6412480399012566, "advantage_std": 0.8038202319294214, "completion_length": 2972.000030517578, "epoch": 0.030857142857142857, "grad_norm": 0.13779900968074799, "kl": 1.5037134289741516e-05, "lambda_div_used": 0.5, "learning_rate": 5.4e-07, "loss": 0.0427, "reward": -0.1060659121721983, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.1060659121721983, "reward_after_std": 0.8038202319294214, "reward_before_mean": 0.37545553129166365, "reward_before_std": 0.8012076523154974, "reward_change_max": 0.0, "reward_change_mean": -0.4815214276313782, "reward_change_min": -0.9643566869199276, "reward_change_std": 0.3837998528033495, "reward_std": 0.803820263594389, "rewards/cosine_scaled_reward": -0.04143891017884016, "rewards/format_reward": 0.45833334140479565, "step": 27 }, { "advantage_max": 1.7663781940937042, "advantage_mean": -7.45058070794613e-09, "advantage_min": -0.7497513294219971, "advantage_std": 0.9309034496545792, "completion_length": 2767.375030517578, "epoch": 0.032, "grad_norm": 0.11623068898916245, "kl": 2.4037901312112808e-05, "lambda_div_used": 0.5, "learning_rate": 5.6e-07, "loss": 0.0502, "reward": -0.036286541260778904, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.036286541260778904, "reward_after_std": 0.930903472006321, "reward_before_mean": 0.46089379489421844, "reward_before_std": 0.9374263137578964, "reward_change_max": 0.0007840320467948914, "reward_change_mean": -0.4971803342923522, "reward_change_min": -1.002158623188734, "reward_change_std": 0.40878670290112495, "reward_std": 0.9309034869074821, "rewards/cosine_scaled_reward": -0.009136438369750977, "rewards/format_reward": 0.47916667349636555, "step": 28 }, { "advantage_max": 1.0969897732138634, "advantage_mean": 1.6142925329809543e-08, "advantage_min": -0.45380792766809464, "advantage_std": 0.5798213481903076, "completion_length": 3435.354217529297, "epoch": 0.03314285714285714, "grad_norm": 0.11971724778413773, "kl": 1.8674880266189575e-05, "lambda_div_used": 0.5, "learning_rate": 5.8e-07, "loss": 0.0462, "reward": -0.4915079523343593, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.4915079523343593, "reward_after_std": 0.5798213705420494, "reward_before_mean": -0.26571009401232004, "reward_before_std": 0.6099032871425152, "reward_change_max": 0.0003291517496109009, "reward_change_mean": -0.2257978618144989, "reward_change_min": -0.5952579416334629, "reward_change_std": 0.234037593472749, "reward_std": 0.57982137799263, "rewards/cosine_scaled_reward": -0.2161883795633912, "rewards/format_reward": 0.1666666716337204, "step": 29 }, { "advantage_max": 2.382244497537613, "advantage_mean": -6.208815683805824e-10, "advantage_min": -0.8910982012748718, "advantage_std": 1.2278216630220413, "completion_length": 3001.479232788086, "epoch": 0.03428571428571429, "grad_norm": 0.21962173283100128, "kl": 1.8961261957883835e-05, "lambda_div_used": 0.5, "learning_rate": 6e-07, "loss": 0.0829, "reward": -0.011109492421383038, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.011109492421383038, "reward_after_std": 1.2278216481208801, "reward_before_mean": 0.4033150505274534, "reward_before_std": 1.2304754592478275, "reward_change_max": 0.00011484324932098389, "reward_change_mean": -0.4144245618954301, "reward_change_min": -0.9832869358360767, "reward_change_std": 0.39574938639998436, "reward_std": 1.2278216779232025, "rewards/cosine_scaled_reward": -0.017092485912144184, "rewards/format_reward": 0.4375000111758709, "step": 30 }, { "advantage_max": 1.7745858430862427, "advantage_mean": 1.8005570368018198e-08, "advantage_min": -0.7045384347438812, "advantage_std": 0.9255342036485672, "completion_length": 2991.625045776367, "epoch": 0.03542857142857143, "grad_norm": 0.1190023124217987, "kl": 1.7091631889343262e-05, "lambda_div_used": 0.5, "learning_rate": 6.2e-07, "loss": 0.0341, "reward": -0.016999364597722888, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.016999364597722888, "reward_after_std": 0.925534226000309, "reward_before_mean": 0.4908587606623769, "reward_before_std": 0.8883664254099131, "reward_change_max": 0.00013752281665802002, "reward_change_mean": -0.5078581348061562, "reward_change_min": -1.0069617629051208, "reward_change_std": 0.3849845137447119, "reward_std": 0.9255342334508896, "rewards/cosine_scaled_reward": 0.037096042186021805, "rewards/format_reward": 0.41666666977107525, "step": 31 }, { "advantage_max": 1.1209936514496803, "advantage_mean": -9.93410786964688e-09, "advantage_min": -0.5845097005367279, "advantage_std": 0.6096477992832661, "completion_length": 3136.5833435058594, "epoch": 0.036571428571428574, "grad_norm": 0.10291552543640137, "kl": 3.565289080142975e-05, "lambda_div_used": 0.5, "learning_rate": 6.4e-07, "loss": 0.0208, "reward": -0.18031363934278488, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.18031363934278488, "reward_after_std": 0.6096478067338467, "reward_before_mean": 0.30608561262488365, "reward_before_std": 0.6181067805737257, "reward_change_max": 0.0007964596152305603, "reward_change_mean": -0.48639929108321667, "reward_change_min": -0.8966321311891079, "reward_change_std": 0.355242476798594, "reward_std": 0.609647810459137, "rewards/cosine_scaled_reward": -0.024040542542934418, "rewards/format_reward": 0.3541666716337204, "step": 32 }, { "advantage_max": 1.741414237767458, "advantage_mean": 4.967053657267684e-09, "advantage_min": -0.6895862258970737, "advantage_std": 0.9135888814926147, "completion_length": 3273.8750610351562, "epoch": 0.037714285714285714, "grad_norm": 0.1362524926662445, "kl": 2.4221837520599365e-05, "lambda_div_used": 0.5, "learning_rate": 6.6e-07, "loss": 0.0346, "reward": -0.21659856289625168, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.21659856289625168, "reward_after_std": 0.913588859140873, "reward_before_mean": 0.13308776635676622, "reward_before_std": 0.9547240622341633, "reward_change_max": 0.0021802037954330444, "reward_change_mean": -0.34968633856624365, "reward_change_min": -0.6969276033341885, "reward_change_std": 0.3053355095908046, "reward_std": 0.9135888814926147, "rewards/cosine_scaled_reward": -0.0897061238065362, "rewards/format_reward": 0.31250000931322575, "step": 33 }, { "advantage_max": 1.7108439281582832, "advantage_mean": 3.725290353973065e-09, "advantage_min": -0.7600312046706676, "advantage_std": 0.8962564840912819, "completion_length": 2601.0208892822266, "epoch": 0.038857142857142854, "grad_norm": 0.1575985848903656, "kl": 9.97595489025116e-05, "lambda_div_used": 0.5, "learning_rate": 6.800000000000001e-07, "loss": 0.0179, "reward": 0.06414370238780975, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.06414370238780975, "reward_after_std": 0.8962565250694752, "reward_before_mean": 0.649968970566988, "reward_before_std": 0.8475272879004478, "reward_change_max": 0.0, "reward_change_mean": -0.5858252439647913, "reward_change_min": -1.081942304968834, "reward_change_std": 0.4181617796421051, "reward_std": 0.8962565287947655, "rewards/cosine_scaled_reward": 0.07498448248952627, "rewards/format_reward": 0.5000000111758709, "step": 34 }, { "advantage_max": 1.5702840462327003, "advantage_mean": 2.545615118698663e-08, "advantage_min": -0.6366428211331367, "advantage_std": 0.8293478488922119, "completion_length": 3007.437530517578, "epoch": 0.04, "grad_norm": 0.12913337349891663, "kl": 4.92781400680542e-05, "lambda_div_used": 0.5, "learning_rate": 7e-07, "loss": 0.0438, "reward": -0.24781744490610436, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.24781744490610436, "reward_after_std": 0.8293478675186634, "reward_before_mean": 0.10291525349020958, "reward_before_std": 0.8493822924792767, "reward_change_max": 0.0025837719440460205, "reward_change_mean": -0.350732677616179, "reward_change_min": -0.8603302016854286, "reward_change_std": 0.34615642856806517, "reward_std": 0.8293478898704052, "rewards/cosine_scaled_reward": -0.10479237232357264, "rewards/format_reward": 0.3125000074505806, "step": 35 }, { "advantage_max": 0.7410223707556725, "advantage_mean": 1.8626452158443385e-08, "advantage_min": -0.31461289897561073, "advantage_std": 0.39646704867482185, "completion_length": 3507.7708435058594, "epoch": 0.04114285714285714, "grad_norm": 0.07298173755407333, "kl": 6.047636270523071e-05, "lambda_div_used": 0.5, "learning_rate": 7.2e-07, "loss": 0.0098, "reward": -0.6106727570295334, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.6106727570295334, "reward_after_std": 0.3964670542627573, "reward_before_mean": -0.42390722688287497, "reward_before_std": 0.4142027962952852, "reward_change_max": 0.0024718865752220154, "reward_change_mean": -0.18676553736440837, "reward_change_min": -0.42391372472047806, "reward_change_std": 0.17731763934716582, "reward_std": 0.39646706730127335, "rewards/cosine_scaled_reward": -0.26403695228509605, "rewards/format_reward": 0.1041666679084301, "step": 36 }, { "advantage_max": 0.8747889474034309, "advantage_mean": 6.208817349140361e-09, "advantage_min": -0.40808849036693573, "advantage_std": 0.4734865203499794, "completion_length": 3295.0833435058594, "epoch": 0.04228571428571429, "grad_norm": 0.06901846081018448, "kl": 3.345310688018799e-05, "lambda_div_used": 0.5, "learning_rate": 7.4e-07, "loss": 0.0204, "reward": -0.4728499799966812, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.4728499799966812, "reward_after_std": 0.4734865240752697, "reward_before_mean": -0.1951841153204441, "reward_before_std": 0.501446358859539, "reward_change_max": 0.0024179965257644653, "reward_change_mean": -0.2776658684015274, "reward_change_min": -0.5783885680139065, "reward_change_std": 0.24316345155239105, "reward_std": 0.4734865352511406, "rewards/cosine_scaled_reward": -0.2017587386071682, "rewards/format_reward": 0.2083333358168602, "step": 37 }, { "advantage_max": 0.7613074332475662, "advantage_mean": 1.2417634698280722e-08, "advantage_min": -0.42001959681510925, "advantage_std": 0.4193967394530773, "completion_length": 3289.9375, "epoch": 0.04342857142857143, "grad_norm": 0.05465118587017059, "kl": 6.181374192237854e-05, "lambda_div_used": 0.5, "learning_rate": 7.599999999999999e-07, "loss": 0.0025, "reward": -0.48476463556289673, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.48476463556289673, "reward_after_std": 0.419396735727787, "reward_before_mean": -0.19598917290568352, "reward_before_std": 0.4489164873957634, "reward_change_max": 0.0023155659437179565, "reward_change_mean": -0.28877546079456806, "reward_change_min": -0.5613481365144253, "reward_change_std": 0.23745440039783716, "reward_std": 0.4193967394530773, "rewards/cosine_scaled_reward": -0.17091125436127186, "rewards/format_reward": 0.14583333395421505, "step": 38 }, { "advantage_max": 1.2476415075361729, "advantage_mean": 1.303851654421706e-08, "advantage_min": -0.5395600944757462, "advantage_std": 0.6443872451782227, "completion_length": 2855.854217529297, "epoch": 0.044571428571428574, "grad_norm": 0.09057861566543579, "kl": 0.00010902388021349907, "lambda_div_used": 0.5, "learning_rate": 7.799999999999999e-07, "loss": 0.0132, "reward": -0.06797056319192052, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.06797056319192052, "reward_after_std": 0.6443872451782227, "reward_before_mean": 0.4895414039492607, "reward_before_std": 0.5221749180927873, "reward_change_max": 0.0, "reward_change_mean": -0.5575119545683265, "reward_change_min": -0.824012566357851, "reward_change_std": 0.33742869179695845, "reward_std": 0.6443872600793839, "rewards/cosine_scaled_reward": -0.00522929901489988, "rewards/format_reward": 0.5000000111758709, "step": 39 }, { "advantage_max": 1.4988975450396538, "advantage_mean": -4.967053435223079e-09, "advantage_min": -0.6433758027851582, "advantage_std": 0.7838446795940399, "completion_length": 2304.4167098999023, "epoch": 0.045714285714285714, "grad_norm": 0.10686811059713364, "kl": 0.0002674385905265808, "lambda_div_used": 0.5, "learning_rate": 8e-07, "loss": 0.0178, "reward": -0.020974524319171906, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.020974524319171906, "reward_after_std": 0.7838446721434593, "reward_before_mean": 0.5286793448030949, "reward_before_std": 0.7248776257038116, "reward_change_max": 0.0, "reward_change_mean": -0.5496538653969765, "reward_change_min": -0.9812478795647621, "reward_change_std": 0.3794911988079548, "reward_std": 0.7838447019457817, "rewards/cosine_scaled_reward": -0.058577004820108414, "rewards/format_reward": 0.645833345130086, "step": 40 }, { "advantage_max": 1.211952231824398, "advantage_mean": 9.934107980669182e-09, "advantage_min": -0.6004071980714798, "advantage_std": 0.6527913548052311, "completion_length": 2990.8958740234375, "epoch": 0.046857142857142854, "grad_norm": 0.11691441386938095, "kl": 8.746236562728882e-05, "lambda_div_used": 0.5, "learning_rate": 8.199999999999999e-07, "loss": 0.0022, "reward": -0.2903154147788882, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.2903154147788882, "reward_after_std": 0.6527913585305214, "reward_before_mean": 0.0847369134426117, "reward_before_std": 0.6745379194617271, "reward_change_max": 0.0022248774766921997, "reward_change_mean": -0.3750523333437741, "reward_change_min": -0.7458742596209049, "reward_change_std": 0.31590295769274235, "reward_std": 0.6527913697063923, "rewards/cosine_scaled_reward": -0.16596487676724792, "rewards/format_reward": 0.4166666753590107, "step": 41 }, { "advantage_max": 0.7290397174656391, "advantage_mean": 1.6142925440831846e-08, "advantage_min": -0.30367110669612885, "advantage_std": 0.38293253630399704, "completion_length": 2846.270839691162, "epoch": 0.048, "grad_norm": 0.0523265041410923, "kl": 6.805360317230225e-05, "lambda_div_used": 0.5, "learning_rate": 8.399999999999999e-07, "loss": 0.0103, "reward": -0.5473397932946682, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.5473397932946682, "reward_after_std": 0.38293252885341644, "reward_before_mean": -0.30693099461495876, "reward_before_std": 0.3625225257128477, "reward_change_max": 0.002422921359539032, "reward_change_mean": -0.24040879029780626, "reward_change_min": -0.4838433638215065, "reward_change_std": 0.1895816596224904, "reward_std": 0.38293253630399704, "rewards/cosine_scaled_reward": -0.29929884150624275, "rewards/format_reward": 0.2916666679084301, "step": 42 }, { "advantage_max": 1.2590354792773724, "advantage_mean": 6.829699361610153e-09, "advantage_min": -0.5609464049339294, "advantage_std": 0.653152123093605, "completion_length": 3096.708366394043, "epoch": 0.04914285714285714, "grad_norm": 0.11091286689043045, "kl": 6.282329559326172e-05, "lambda_div_used": 0.5, "learning_rate": 8.599999999999999e-07, "loss": -0.0118, "reward": -0.2702128039672971, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.2702128039672971, "reward_after_std": 0.653152123093605, "reward_before_mean": 0.1145242340862751, "reward_before_std": 0.6108989454805851, "reward_change_max": 0.0005005002021789551, "reward_change_mean": -0.3847370594739914, "reward_change_min": -0.6752089485526085, "reward_change_std": 0.2768531898036599, "reward_std": 0.6531521566212177, "rewards/cosine_scaled_reward": -0.06773788295686245, "rewards/format_reward": 0.25, "step": 43 }, { "advantage_max": 1.2386278919875622, "advantage_mean": -7.450580763457282e-09, "advantage_min": -0.5172981098294258, "advantage_std": 0.6598590333014727, "completion_length": 2765.3542098999023, "epoch": 0.05028571428571429, "grad_norm": 0.11159113794565201, "kl": 0.00022670626640319824, "lambda_div_used": 0.5, "learning_rate": 8.799999999999999e-07, "loss": 0.0116, "reward": -0.055530715733766556, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.055530715733766556, "reward_after_std": 0.6598590407520533, "reward_before_mean": 0.5103710060939193, "reward_before_std": 0.5851284889504313, "reward_change_max": 0.0, "reward_change_mean": -0.5659017502330244, "reward_change_min": -0.9489397816359997, "reward_change_std": 0.38827237067744136, "reward_std": 0.6598590649664402, "rewards/cosine_scaled_reward": 0.0364355007186532, "rewards/format_reward": 0.43750000558793545, "step": 44 }, { "advantage_max": 1.0946906879544258, "advantage_mean": 1.0554989549049765e-08, "advantage_min": -0.48534848541021347, "advantage_std": 0.5852056853473186, "completion_length": 3439.000030517578, "epoch": 0.05142857142857143, "grad_norm": 0.0995851531624794, "kl": 0.00010732375085353851, "lambda_div_used": 0.5, "learning_rate": 9e-07, "loss": 0.017, "reward": -0.437751529738307, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.437751529738307, "reward_after_std": 0.5852056816220284, "reward_before_mean": -0.16415877640247345, "reward_before_std": 0.6127915233373642, "reward_change_max": 0.001485571265220642, "reward_change_mean": -0.27359276497736573, "reward_change_min": -0.6492436826229095, "reward_change_std": 0.2626556186005473, "reward_std": 0.5852056965231895, "rewards/cosine_scaled_reward": -0.17582939192652702, "rewards/format_reward": 0.1875000037252903, "step": 45 }, { "advantage_max": 0.6511989608407021, "advantage_mean": 2.23517424569053e-08, "advantage_min": -0.3454531729221344, "advantage_std": 0.3550384156405926, "completion_length": 3257.6041717529297, "epoch": 0.052571428571428575, "grad_norm": 0.052597712725400925, "kl": 0.0002651810646057129, "lambda_div_used": 0.5, "learning_rate": 9.2e-07, "loss": 0.0014, "reward": -0.5496700319345109, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.5496700319345109, "reward_after_std": 0.3550384044647217, "reward_before_mean": -0.2950664572417736, "reward_before_std": 0.3601315375417471, "reward_change_max": 0.0012530013918876648, "reward_change_mean": -0.25460357405245304, "reward_change_min": -0.49163252115249634, "reward_change_std": 0.20036437083035707, "reward_std": 0.35503840632736683, "rewards/cosine_scaled_reward": -0.2204498965293169, "rewards/format_reward": 0.14583333395421505, "step": 46 }, { "advantage_max": 1.4671212919056416, "advantage_mean": -8.071462664904772e-09, "advantage_min": -0.8597921542823315, "advantage_std": 0.8293537385761738, "completion_length": 2995.979217529297, "epoch": 0.053714285714285714, "grad_norm": 0.14203596115112305, "kl": 9.060092270374298e-05, "lambda_div_used": 0.5, "learning_rate": 9.399999999999999e-07, "loss": 0.0581, "reward": 0.15175165981054306, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.15175165981054306, "reward_after_std": 0.8293537385761738, "reward_before_mean": 0.8535946235060692, "reward_before_std": 0.8959902841597795, "reward_change_max": 0.0, "reward_change_mean": -0.7018429655581713, "reward_change_min": -1.2576069086790085, "reward_change_std": 0.5387048032134771, "reward_std": 0.8293537646532059, "rewards/cosine_scaled_reward": 0.16638064198195934, "rewards/format_reward": 0.5208333507180214, "step": 47 }, { "advantage_max": 1.550131119787693, "advantage_mean": 1.7384689132704523e-08, "advantage_min": -0.7010627537965775, "advantage_std": 0.8304248489439487, "completion_length": 2845.750030517578, "epoch": 0.054857142857142854, "grad_norm": 0.11722289025783539, "kl": 0.0005335649475455284, "lambda_div_used": 0.5, "learning_rate": 9.6e-07, "loss": 0.0032, "reward": -0.13437421433627605, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.13437421433627605, "reward_after_std": 0.8304248340427876, "reward_before_mean": 0.3132213608769234, "reward_before_std": 0.860728744417429, "reward_change_max": 0.0034469887614250183, "reward_change_mean": -0.4475955702364445, "reward_change_min": -0.9456360414624214, "reward_change_std": 0.3944186642765999, "reward_std": 0.8304248489439487, "rewards/cosine_scaled_reward": -0.07255600206553936, "rewards/format_reward": 0.45833333395421505, "step": 48 }, { "advantage_max": 1.5654122084379196, "advantage_mean": 1.117587122845265e-08, "advantage_min": -0.716593436896801, "advantage_std": 0.8443465456366539, "completion_length": 2267.2292137145996, "epoch": 0.056, "grad_norm": 0.11271828413009644, "kl": 0.0002801865339279175, "lambda_div_used": 0.5, "learning_rate": 9.8e-07, "loss": 0.0784, "reward": -0.05247187614440918, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.05247187614440918, "reward_after_std": 0.8443465456366539, "reward_before_mean": 0.464739790186286, "reward_before_std": 0.8581503331661224, "reward_change_max": 0.0022785961627960205, "reward_change_mean": -0.5172116560861468, "reward_change_min": -1.1284822821617126, "reward_change_std": 0.44013802148401737, "reward_std": 0.8443465568125248, "rewards/cosine_scaled_reward": -0.059296777937561274, "rewards/format_reward": 0.5833333376795053, "step": 49 }, { "advantage_max": 1.1785499043762684, "advantage_mean": -3.725290298461914e-09, "advantage_min": -0.5394556485116482, "advantage_std": 0.6275908350944519, "completion_length": 3042.1041984558105, "epoch": 0.05714285714285714, "grad_norm": 0.12018999457359314, "kl": 0.00027485471218824387, "lambda_div_used": 0.5, "learning_rate": 1e-06, "loss": 0.0332, "reward": -0.20314022013917565, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.20314022013917565, "reward_after_std": 0.6275908537209034, "reward_before_mean": 0.25351108238101006, "reward_before_std": 0.5844974275678396, "reward_change_max": 0.0004190877079963684, "reward_change_mean": -0.45665127877146006, "reward_change_min": -0.8546877354383469, "reward_change_std": 0.3474463615566492, "reward_std": 0.6275908723473549, "rewards/cosine_scaled_reward": -0.019077795557677746, "rewards/format_reward": 0.2916666716337204, "step": 50 }, { "advantage_max": 1.1401083320379257, "advantage_mean": 8.69234451084111e-09, "advantage_min": -0.6032663956284523, "advantage_std": 0.6303308606147766, "completion_length": 2341.1250534057617, "epoch": 0.05828571428571429, "grad_norm": 0.10779228806495667, "kl": 0.000632166862487793, "lambda_div_used": 0.5, "learning_rate": 9.999890338174275e-07, "loss": 0.0377, "reward": -0.12168696755543351, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.12168696755543351, "reward_after_std": 0.6303308419883251, "reward_before_mean": 0.4071807861328125, "reward_before_std": 0.6401997022330761, "reward_change_max": 0.0, "reward_change_mean": -0.5288677718490362, "reward_change_min": -0.9466118700802326, "reward_change_std": 0.3940194044262171, "reward_std": 0.6303308643400669, "rewards/cosine_scaled_reward": -0.07765959948301315, "rewards/format_reward": 0.5625000074505806, "step": 51 }, { "advantage_max": 1.657617561519146, "advantage_mean": -6.208817349140361e-10, "advantage_min": -0.8769890516996384, "advantage_std": 0.8941908292472363, "completion_length": 2828.6042251586914, "epoch": 0.05942857142857143, "grad_norm": 0.10411342233419418, "kl": 0.0005970411002635956, "lambda_div_used": 0.5, "learning_rate": 9.999561358041868e-07, "loss": 0.0157, "reward": 0.11933974362909794, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.11933974362909794, "reward_after_std": 0.8941908441483974, "reward_before_mean": 0.7608568891882896, "reward_before_std": 0.8951385281980038, "reward_change_max": 0.000269867479801178, "reward_change_mean": -0.6415171585977077, "reward_change_min": -1.175170797854662, "reward_change_std": 0.47852752916514874, "reward_std": 0.8941908627748489, "rewards/cosine_scaled_reward": 0.13042843155562878, "rewards/format_reward": 0.5000000149011612, "step": 52 }, { "advantage_max": 1.5903307870030403, "advantage_mean": -8.071462054282108e-09, "advantage_min": -0.5871336311101913, "advantage_std": 0.817248024046421, "completion_length": 2820.187530517578, "epoch": 0.060571428571428575, "grad_norm": 0.11234336346387863, "kl": 0.00043966108933091164, "lambda_div_used": 0.5, "learning_rate": 9.999013075636804e-07, "loss": -0.0115, "reward": 0.0432483796030283, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.0432483796030283, "reward_after_std": 0.8172480165958405, "reward_before_mean": 0.6357301553944126, "reward_before_std": 0.6847620755434036, "reward_change_max": 0.0, "reward_change_mean": -0.5924818031489849, "reward_change_min": -0.9367171190679073, "reward_change_std": 0.36971727199852467, "reward_std": 0.8172480203211308, "rewards/cosine_scaled_reward": 0.036615074845030904, "rewards/format_reward": 0.5625000055879354, "step": 53 }, { "advantage_max": 2.0655234158039093, "advantage_mean": -5.587935614226325e-09, "advantage_min": -0.9336753264069557, "advantage_std": 1.1151320487260818, "completion_length": 2985.6250610351562, "epoch": 0.061714285714285715, "grad_norm": 0.16910512745380402, "kl": 0.00020236149430274963, "lambda_div_used": 0.5, "learning_rate": 9.998245517681593e-07, "loss": 0.044, "reward": 0.15036086877807975, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.15036086877807975, "reward_after_std": 1.1151320710778236, "reward_before_mean": 0.7494669770821929, "reward_before_std": 1.1877865009009838, "reward_change_max": 0.0017044693231582642, "reward_change_mean": -0.5991061069071293, "reward_change_min": -1.2863488122820854, "reward_change_std": 0.5509431846439838, "reward_std": 1.115132100880146, "rewards/cosine_scaled_reward": 0.10390015179291368, "rewards/format_reward": 0.541666679084301, "step": 54 }, { "advantage_max": 1.346567988395691, "advantage_mean": 5.5879357807597785e-09, "advantage_min": -0.5341115295886993, "advantage_std": 0.6975418590009212, "completion_length": 3084.5834045410156, "epoch": 0.06285714285714286, "grad_norm": 0.10880491882562637, "kl": 0.0010903775691986084, "lambda_div_used": 0.5, "learning_rate": 9.997258721585931e-07, "loss": 0.0464, "reward": -0.2912944480776787, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.2912944480776787, "reward_after_std": 0.697541855275631, "reward_before_mean": 0.06275918334722519, "reward_before_std": 0.6657360903918743, "reward_change_max": 0.0007643029093742371, "reward_change_mean": -0.35405362490564585, "reward_change_min": -0.6719070784747601, "reward_change_std": 0.2700015977025032, "reward_std": 0.697541881352663, "rewards/cosine_scaled_reward": -0.10403707949444652, "rewards/format_reward": 0.2708333358168602, "step": 55 }, { "advantage_max": 1.1837703846395016, "advantage_mean": 1.0554989493538613e-08, "advantage_min": -0.6579623222351074, "advantage_std": 0.6466393321752548, "completion_length": 2916.8333892822266, "epoch": 0.064, "grad_norm": 0.0897069051861763, "kl": 0.0005567669868469238, "lambda_div_used": 0.5, "learning_rate": 9.996052735444862e-07, "loss": 0.0412, "reward": -0.1802707426249981, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.1802707426249981, "reward_after_std": 0.646639347076416, "reward_before_mean": 0.29126008972525597, "reward_before_std": 0.6771029680967331, "reward_change_max": 0.0011851340532302856, "reward_change_mean": -0.47153082955628633, "reward_change_min": -0.8577333986759186, "reward_change_std": 0.365943206474185, "reward_std": 0.6466393582522869, "rewards/cosine_scaled_reward": -0.07311996631324291, "rewards/format_reward": 0.4375000111758709, "step": 56 }, { "advantage_max": 1.5917630940675735, "advantage_mean": 1.6763806898190126e-08, "advantage_min": -0.5033136904239655, "advantage_std": 0.8053608201444149, "completion_length": 3434.687530517578, "epoch": 0.06514285714285714, "grad_norm": 0.11615301668643951, "kl": 0.000165596604347229, "lambda_div_used": 0.5, "learning_rate": 9.994627618036452e-07, "loss": 0.0071, "reward": -0.34377744421362877, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.34377744421362877, "reward_after_std": 0.805360808968544, "reward_before_mean": -0.07328794337809086, "reward_before_std": 0.7592337466776371, "reward_change_max": 0.00013331323862075806, "reward_change_mean": -0.2704894933849573, "reward_change_min": -0.5694965869188309, "reward_change_std": 0.2236005710437894, "reward_std": 0.8053608499467373, "rewards/cosine_scaled_reward": -0.16164396703243256, "rewards/format_reward": 0.2500000037252903, "step": 57 }, { "advantage_max": 1.8052620589733124, "advantage_mean": 3.7252904094842165e-09, "advantage_min": -0.7408896759152412, "advantage_std": 0.9437650218605995, "completion_length": 2223.4167251586914, "epoch": 0.06628571428571428, "grad_norm": 0.17151595652103424, "kl": 0.004196107387542725, "lambda_div_used": 0.5, "learning_rate": 9.992983438818915e-07, "loss": 0.0393, "reward": 0.08948480966500938, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.08948480966500938, "reward_after_std": 0.9437649697065353, "reward_before_mean": 0.6831958554685116, "reward_before_std": 0.8700358737260103, "reward_change_max": 0.0, "reward_change_mean": -0.5937110781669617, "reward_change_min": -1.0060609802603722, "reward_change_std": 0.41046774573624134, "reward_std": 0.9437649697065353, "rewards/cosine_scaled_reward": -0.02298540365882218, "rewards/format_reward": 0.7291666753590107, "step": 58 }, { "advantage_max": 1.175541877746582, "advantage_mean": -1.6763807009212428e-08, "advantage_min": -0.44569920375943184, "advantage_std": 0.6143423058092594, "completion_length": 2941.7083587646484, "epoch": 0.06742857142857143, "grad_norm": 0.08178117871284485, "kl": 0.000935891643166542, "lambda_div_used": 0.5, "learning_rate": 9.991120277927223e-07, "loss": 0.0342, "reward": -0.2478409237228334, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.2478409237228334, "reward_after_std": 0.6143423058092594, "reward_before_mean": 0.16964629292488098, "reward_before_std": 0.5480784140527248, "reward_change_max": 0.000612989068031311, "reward_change_mean": -0.4174872115254402, "reward_change_min": -0.8169004134833813, "reward_change_std": 0.31252420227974653, "reward_std": 0.6143423169851303, "rewards/cosine_scaled_reward": -0.06101020169444382, "rewards/format_reward": 0.2916666679084301, "step": 59 }, { "advantage_max": 1.0423083528876305, "advantage_mean": 1.4901161526914564e-08, "advantage_min": -0.4296622648835182, "advantage_std": 0.5458662435412407, "completion_length": 2984.916679382324, "epoch": 0.06857142857142857, "grad_norm": 0.07768469303846359, "kl": 0.0006950497627258301, "lambda_div_used": 0.5, "learning_rate": 9.989038226169207e-07, "loss": 0.0063, "reward": -0.40611574915237725, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.40611574915237725, "reward_after_std": 0.5458662435412407, "reward_before_mean": -0.0968768410384655, "reward_before_std": 0.5283457562327385, "reward_change_max": 0.002580612897872925, "reward_change_mean": -0.30923892557621, "reward_change_min": -0.6607633791863918, "reward_change_std": 0.24943595007061958, "reward_std": 0.5458662621676922, "rewards/cosine_scaled_reward": -0.2046884261071682, "rewards/format_reward": 0.31250000186264515, "step": 60 }, { "advantage_max": 1.457154531031847, "advantage_mean": 9.313225801665936e-09, "advantage_min": -0.7158756963908672, "advantage_std": 0.7909952085465193, "completion_length": 3087.0208740234375, "epoch": 0.06971428571428571, "grad_norm": 0.16649970412254333, "kl": 0.0007555186748504639, "lambda_div_used": 0.5, "learning_rate": 9.98673738502114e-07, "loss": 0.0935, "reward": -0.13823260087519884, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.13823260087519884, "reward_after_std": 0.790995180606842, "reward_before_mean": 0.3234656620770693, "reward_before_std": 0.8312350269407034, "reward_change_max": 0.0023474469780921936, "reward_change_mean": -0.46169828064739704, "reward_change_min": -0.9537935554981232, "reward_change_std": 0.4016226176172495, "reward_std": 0.790995180606842, "rewards/cosine_scaled_reward": -0.0674338429234922, "rewards/format_reward": 0.45833334513008595, "step": 61 }, { "advantage_max": 2.0991614311933517, "advantage_mean": -1.738468857759301e-08, "advantage_min": -0.7645231634378433, "advantage_std": 1.0837591513991356, "completion_length": 2622.6458740234375, "epoch": 0.07085714285714285, "grad_norm": 0.16811510920524597, "kl": 0.015173658728599548, "lambda_div_used": 0.5, "learning_rate": 9.98421786662277e-07, "loss": 0.0233, "reward": 0.20936205855105072, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.20936205855105072, "reward_after_std": 1.0837591513991356, "reward_before_mean": 0.8541646478697658, "reward_before_std": 0.9757980816066265, "reward_change_max": 0.0, "reward_change_mean": -0.6448025852441788, "reward_change_min": -1.1601449958980083, "reward_change_std": 0.456626171246171, "reward_std": 1.0837591886520386, "rewards/cosine_scaled_reward": 0.0937489839270711, "rewards/format_reward": 0.6666666809469461, "step": 62 }, { "advantage_max": 1.5456999205052853, "advantage_mean": -6.20881729362921e-09, "advantage_min": -0.6523657143115997, "advantage_std": 0.7971435803920031, "completion_length": 2413.3958740234375, "epoch": 0.072, "grad_norm": 0.12558011710643768, "kl": 0.0017941594123840332, "lambda_div_used": 0.5, "learning_rate": 9.981479793771866e-07, "loss": 0.0628, "reward": 0.025665222201496363, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.025665222201496363, "reward_after_std": 0.7971435710787773, "reward_before_mean": 0.6093635726720095, "reward_before_std": 0.6957045514136553, "reward_change_max": 0.0003859475255012512, "reward_change_mean": -0.5836983378976583, "reward_change_min": -0.9405001066625118, "reward_change_std": 0.3708849251270294, "reward_std": 0.7971436083316803, "rewards/cosine_scaled_reward": 0.013015098869800568, "rewards/format_reward": 0.5833333469927311, "step": 63 }, { "advantage_max": 1.5050344467163086, "advantage_mean": 1.8005570256995895e-08, "advantage_min": -0.6849210783839226, "advantage_std": 0.8289222978055477, "completion_length": 3141.7500610351562, "epoch": 0.07314285714285715, "grad_norm": 0.16174478828907013, "kl": 0.0019003748893737793, "lambda_div_used": 0.5, "learning_rate": 9.97852329991824e-07, "loss": 0.1196, "reward": -0.21013362589292228, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.21013362589292228, "reward_after_std": 0.8289222978055477, "reward_before_mean": 0.1825589847867377, "reward_before_std": 0.923391830176115, "reward_change_max": 0.0005897730588912964, "reward_change_mean": -0.39269259478896856, "reward_change_min": -0.9992674365639687, "reward_change_std": 0.4153337189927697, "reward_std": 0.8289223089814186, "rewards/cosine_scaled_reward": -0.05455384007655084, "rewards/format_reward": 0.29166667722165585, "step": 64 }, { "advantage_max": 1.3541830480098724, "advantage_mean": -1.2417631367611648e-09, "advantage_min": -0.5969244241714478, "advantage_std": 0.7039464600384235, "completion_length": 2772.8333587646484, "epoch": 0.07428571428571429, "grad_norm": 0.09448474645614624, "kl": 0.003227710723876953, "lambda_div_used": 0.5, "learning_rate": 9.975348529157229e-07, "loss": 0.0235, "reward": -0.26149132908903994, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.26149132908903994, "reward_after_std": 0.7039464488625526, "reward_before_mean": 0.11393103189766407, "reward_before_std": 0.6785321645438671, "reward_change_max": 0.002308964729309082, "reward_change_mean": -0.3754223734140396, "reward_change_min": -0.7083380743861198, "reward_change_std": 0.293591745197773, "reward_std": 0.7039464600384235, "rewards/cosine_scaled_reward": -0.15136783104389906, "rewards/format_reward": 0.41666666977107525, "step": 65 }, { "advantage_max": 1.297847893089056, "advantage_mean": 5.5879357807597785e-09, "advantage_min": -0.4594452455639839, "advantage_std": 0.6709181014448404, "completion_length": 2317.3125038146973, "epoch": 0.07542857142857143, "grad_norm": 0.09020698070526123, "kl": 0.002762317657470703, "lambda_div_used": 0.5, "learning_rate": 9.971955636222684e-07, "loss": -0.0007, "reward": -0.0659542977809906, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.0659542977809906, "reward_after_std": 0.6709181014448404, "reward_before_mean": 0.478666216135025, "reward_before_std": 0.5349580403417349, "reward_change_max": 0.0030061379075050354, "reward_change_mean": -0.5446205204352736, "reward_change_min": -0.9633432440459728, "reward_change_std": 0.36170417233370245, "reward_std": 0.6709181163460016, "rewards/cosine_scaled_reward": 0.010166438878513873, "rewards/format_reward": 0.4583333358168602, "step": 66 }, { "advantage_max": 1.126870758831501, "advantage_mean": 1.8626452047421083e-08, "advantage_min": -0.4127872511744499, "advantage_std": 0.5886235721409321, "completion_length": 3524.2916870117188, "epoch": 0.07657142857142857, "grad_norm": 0.10760419815778732, "kl": 0.0019077062606811523, "lambda_div_used": 0.5, "learning_rate": 9.968344786479415e-07, "loss": 0.0454, "reward": -0.5646808911114931, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.5646808911114931, "reward_after_std": 0.5886235684156418, "reward_before_mean": -0.4027576297521591, "reward_before_std": 0.6220518052577972, "reward_change_max": 0.0007558688521385193, "reward_change_mean": -0.16192326415330172, "reward_change_min": -0.5088565908372402, "reward_change_std": 0.20086207846179605, "reward_std": 0.5886235684156418, "rewards/cosine_scaled_reward": -0.24304548278450966, "rewards/format_reward": 0.0833333358168602, "step": 67 }, { "advantage_max": 1.5252962484955788, "advantage_mean": 2.4835271617007493e-09, "advantage_min": -0.7342267334461212, "advantage_std": 0.835030809044838, "completion_length": 2333.3541946411133, "epoch": 0.07771428571428571, "grad_norm": 0.15104389190673828, "kl": 0.004139900207519531, "lambda_div_used": 0.5, "learning_rate": 9.964516155915151e-07, "loss": 0.0705, "reward": -0.011033048038370907, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.011033048038370907, "reward_after_std": 0.8350308053195477, "reward_before_mean": 0.5420394577085972, "reward_before_std": 0.8811622634530067, "reward_change_max": 0.0, "reward_change_mean": -0.5530724860727787, "reward_change_min": -1.1742777340114117, "reward_change_std": 0.4708987697958946, "reward_std": 0.8350308425724506, "rewards/cosine_scaled_reward": -0.010230285115540028, "rewards/format_reward": 0.5625000074505806, "step": 68 }, { "advantage_max": 1.0369067564606667, "advantage_mean": 2.4214387495113954e-08, "advantage_min": -0.382175724953413, "advantage_std": 0.5327570177614689, "completion_length": 2721.0833740234375, "epoch": 0.07885714285714286, "grad_norm": 0.07868275046348572, "kl": 0.003982067108154297, "lambda_div_used": 0.5, "learning_rate": 9.960469931131936e-07, "loss": 0.0352, "reward": -0.43468883633613586, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.43468883633613586, "reward_after_std": 0.5327570252120495, "reward_before_mean": -0.1535684810951352, "reward_before_std": 0.4849686697125435, "reward_change_max": 0.0007972046732902527, "reward_change_mean": -0.2811203598976135, "reward_change_min": -0.5471938513219357, "reward_change_std": 0.20546625927090645, "reward_std": 0.5327570587396622, "rewards/cosine_scaled_reward": -0.24345091171562672, "rewards/format_reward": 0.3333333358168602, "step": 69 }, { "advantage_max": 1.3918376974761486, "advantage_mean": -6.208819014474898e-10, "advantage_min": -0.49868446588516235, "advantage_std": 0.7050085607916117, "completion_length": 3057.2291870117188, "epoch": 0.08, "grad_norm": 0.1143859475851059, "kl": 0.0013420581817626953, "lambda_div_used": 0.5, "learning_rate": 9.956206309337066e-07, "loss": 0.0225, "reward": -0.2947548748925328, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.2947548748925328, "reward_after_std": 0.7050085607916117, "reward_before_mean": 0.04767380794510245, "reward_before_std": 0.6246827412396669, "reward_change_max": 0.0009760409593582153, "reward_change_mean": -0.34242871031165123, "reward_change_min": -0.5683609507977962, "reward_change_std": 0.24109898321330547, "reward_std": 0.7050085626542568, "rewards/cosine_scaled_reward": -0.1532464288175106, "rewards/format_reward": 0.3541666679084301, "step": 70 }, { "advantage_max": 1.0536888763308525, "advantage_mean": 1.862645426786713e-09, "advantage_min": -0.5757294371724129, "advantage_std": 0.5784401223063469, "completion_length": 2708.5625228881836, "epoch": 0.08114285714285714, "grad_norm": 0.10196925699710846, "kl": 0.005338191986083984, "lambda_div_used": 0.5, "learning_rate": 9.951725498333448e-07, "loss": 0.0312, "reward": -0.1822344735264778, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.1822344735264778, "reward_after_std": 0.5784401334822178, "reward_before_mean": 0.3108774647116661, "reward_before_std": 0.5767860356718302, "reward_change_max": 0.003112994134426117, "reward_change_mean": -0.4931119680404663, "reward_change_min": -0.8853396885097027, "reward_change_std": 0.36358125135302544, "reward_std": 0.5784401521086693, "rewards/cosine_scaled_reward": -0.042477929033339024, "rewards/format_reward": 0.3958333395421505, "step": 71 }, { "advantage_max": 1.6563706696033478, "advantage_mean": 1.924733389335742e-08, "advantage_min": -0.6117210127413273, "advantage_std": 0.8583654910326004, "completion_length": 3154.312530517578, "epoch": 0.08228571428571428, "grad_norm": 0.145771786570549, "kl": 0.004016876220703125, "lambda_div_used": 0.5, "learning_rate": 9.947027716509488e-07, "loss": 0.0721, "reward": -0.3233127495041117, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.3233127495041117, "reward_after_std": 0.8583654500544071, "reward_before_mean": -0.04943877179175615, "reward_before_std": 0.8772860243916512, "reward_change_max": 0.005304671823978424, "reward_change_mean": -0.27387397922575474, "reward_change_min": -0.6673308126628399, "reward_change_std": 0.284614821895957, "reward_std": 0.8583654649555683, "rewards/cosine_scaled_reward": -0.18096939055249095, "rewards/format_reward": 0.3125000111758709, "step": 72 }, { "advantage_max": 1.26315114274621, "advantage_mean": 4.3461718668424965e-09, "advantage_min": -0.4227764904499054, "advantage_std": 0.639428773894906, "completion_length": 3549.5208740234375, "epoch": 0.08342857142857144, "grad_norm": 0.09522831439971924, "kl": 0.0009833574295043945, "lambda_div_used": 0.5, "learning_rate": 9.942113192828444e-07, "loss": 0.0254, "reward": -0.5094889532774687, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.5094889532774687, "reward_after_std": 0.6394287925213575, "reward_before_mean": -0.3250623978674412, "reward_before_std": 0.6202701851725578, "reward_change_max": 0.0, "reward_change_mean": -0.18442655354738235, "reward_change_min": -0.386491771787405, "reward_change_std": 0.1530038034543395, "reward_std": 0.6394288036972284, "rewards/cosine_scaled_reward": -0.2041978659108281, "rewards/format_reward": 0.0833333358168602, "step": 73 }, { "advantage_max": 1.29529245570302, "advantage_mean": 2.048909719665204e-08, "advantage_min": -0.43963947519659996, "advantage_std": 0.6664884742349386, "completion_length": 3175.500030517578, "epoch": 0.08457142857142858, "grad_norm": 0.12290780991315842, "kl": 0.003965795040130615, "lambda_div_used": 0.5, "learning_rate": 9.93698216681727e-07, "loss": 0.0581, "reward": -0.2856890633702278, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.2856890633702278, "reward_after_std": 0.6664884742349386, "reward_before_mean": 0.08298108261078596, "reward_before_std": 0.6028281692415476, "reward_change_max": 0.0005967244505882263, "reward_change_mean": -0.3686701231636107, "reward_change_min": -0.7452529221773148, "reward_change_std": 0.27662853663787246, "reward_std": 0.6664884965866804, "rewards/cosine_scaled_reward": -0.08350946195423603, "rewards/format_reward": 0.2500000037252903, "step": 74 }, { "advantage_max": 1.0755420252680779, "advantage_mean": 0.0, "advantage_min": -0.41619032248854637, "advantage_std": 0.5525711067020893, "completion_length": 3297.1666870117188, "epoch": 0.08571428571428572, "grad_norm": 0.09391221404075623, "kl": 0.0034029483795166016, "lambda_div_used": 0.5, "learning_rate": 9.931634888554935e-07, "loss": 0.023, "reward": -0.20981414895504713, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.20981414895504713, "reward_after_std": 0.5525711178779602, "reward_before_mean": 0.2553990473970771, "reward_before_std": 0.43019257858395576, "reward_change_max": 0.0018067359924316406, "reward_change_mean": -0.46521320194005966, "reward_change_min": -0.7154612056910992, "reward_change_std": 0.28662352077662945, "reward_std": 0.5525711290538311, "rewards/cosine_scaled_reward": -0.04938381724059582, "rewards/format_reward": 0.35416666977107525, "step": 75 }, { "advantage_max": 1.1918527409434319, "advantage_mean": 1.691902684619606e-08, "advantage_min": -0.5500158071517944, "advantage_std": 0.6363635919988155, "completion_length": 3000.937530517578, "epoch": 0.08685714285714285, "grad_norm": 0.10816562920808792, "kl": 0.0012085437774658203, "lambda_div_used": 0.5, "learning_rate": 9.926071618660237e-07, "loss": 0.0431, "reward": -0.2598666944541037, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.2598666944541037, "reward_after_std": 0.6363636236637831, "reward_before_mean": 0.14593160874210298, "reward_before_std": 0.6384630594402552, "reward_change_max": 0.0007710233330726624, "reward_change_mean": -0.4057982945814729, "reward_change_min": -0.7503902576863766, "reward_change_std": 0.3121058507822454, "reward_std": 0.6363636441528797, "rewards/cosine_scaled_reward": -0.1457842094823718, "rewards/format_reward": 0.43750001676380634, "step": 76 }, { "advantage_max": 0.9090688228607178, "advantage_mean": -3.725290242950763e-09, "advantage_min": -0.5261832289397717, "advantage_std": 0.5043393410742283, "completion_length": 3174.2083740234375, "epoch": 0.088, "grad_norm": 0.11574254930019379, "kl": 0.0021175146102905273, "lambda_div_used": 0.5, "learning_rate": 9.9202926282791e-07, "loss": 0.0272, "reward": -0.34673725441098213, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.34673725441098213, "reward_after_std": 0.5043393447995186, "reward_before_mean": 0.03295615315437317, "reward_before_std": 0.5356946103274822, "reward_change_max": 0.0012821629643440247, "reward_change_mean": -0.37969343923032284, "reward_change_min": -0.6977962926030159, "reward_change_std": 0.30058483220636845, "reward_std": 0.5043393522500992, "rewards/cosine_scaled_reward": -0.12935525551438332, "rewards/format_reward": 0.29166667722165585, "step": 77 }, { "advantage_max": 1.8381619974970818, "advantage_mean": 2.4835269396561444e-09, "advantage_min": -0.7710757628083229, "advantage_std": 0.9732430391013622, "completion_length": 3186.3958587646484, "epoch": 0.08914285714285715, "grad_norm": 0.15152576565742493, "kl": 0.0022534728050231934, "lambda_div_used": 0.5, "learning_rate": 9.91429819907136e-07, "loss": 0.0564, "reward": -0.15935248951427639, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.15935248951427639, "reward_after_std": 0.9732430167496204, "reward_before_mean": 0.22409775853157043, "reward_before_std": 1.021384745836258, "reward_change_max": 0.003303632140159607, "reward_change_mean": -0.38345025200396776, "reward_change_min": -0.810688778758049, "reward_change_std": 0.3610359411686659, "reward_std": 0.9732430540025234, "rewards/cosine_scaled_reward": -0.03378446213901043, "rewards/format_reward": 0.29166667349636555, "step": 78 }, { "advantage_max": 1.4568165950477123, "advantage_mean": -4.346172088887101e-09, "advantage_min": -0.5802691504359245, "advantage_std": 0.7698210962116718, "completion_length": 2612.4583740234375, "epoch": 0.09028571428571429, "grad_norm": 0.10133597254753113, "kl": 0.004894614219665527, "lambda_div_used": 0.5, "learning_rate": 9.908088623197048e-07, "loss": 0.019, "reward": -0.19068989902734756, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.19068989902734756, "reward_after_std": 0.7698210999369621, "reward_before_mean": 0.2276688851416111, "reward_before_std": 0.7653483748435974, "reward_change_max": 0.0016639381647109985, "reward_change_mean": -0.41835877299308777, "reward_change_min": -0.8724069856107235, "reward_change_std": 0.3391466625034809, "reward_std": 0.7698211185634136, "rewards/cosine_scaled_reward": -0.12574889697134495, "rewards/format_reward": 0.4791666679084301, "step": 79 }, { "advantage_max": 1.4858717247843742, "advantage_mean": 8.692344288796505e-09, "advantage_min": -0.59293033182621, "advantage_std": 0.8019508235156536, "completion_length": 3314.4791870117188, "epoch": 0.09142857142857143, "grad_norm": 0.1212073266506195, "kl": 0.003013134002685547, "lambda_div_used": 0.5, "learning_rate": 9.901664203302124e-07, "loss": 0.0403, "reward": -0.3154444256797433, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.3154444256797433, "reward_after_std": 0.8019508309662342, "reward_before_mean": -0.0058077238500118256, "reward_before_std": 0.8834160603582859, "reward_change_max": 0.004029706120491028, "reward_change_mean": -0.30963670555502176, "reward_change_min": -0.9396452568471432, "reward_change_std": 0.37205731589347124, "reward_std": 0.8019508682191372, "rewards/cosine_scaled_reward": -0.13832053111400455, "rewards/format_reward": 0.2708333358168602, "step": 80 }, { "advantage_max": 1.2512575164437294, "advantage_mean": -1.862645149230957e-09, "advantage_min": -0.5687988735735416, "advantage_std": 0.6636236608028412, "completion_length": 3201.5833740234375, "epoch": 0.09257142857142857, "grad_norm": 0.1467030793428421, "kl": 0.009012222290039062, "lambda_div_used": 0.5, "learning_rate": 9.895025252503755e-07, "loss": 0.0302, "reward": -0.29607150983065367, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.29607150983065367, "reward_after_std": 0.6636236682534218, "reward_before_mean": 0.06822922918945551, "reward_before_std": 0.6745604537427425, "reward_change_max": 0.0014805421233177185, "reward_change_mean": -0.36430073343217373, "reward_change_min": -0.7246239744126797, "reward_change_std": 0.29929828830063343, "reward_std": 0.6636237092316151, "rewards/cosine_scaled_reward": -0.1325520584359765, "rewards/format_reward": 0.3333333395421505, "step": 81 }, { "advantage_max": 1.4132002517580986, "advantage_mean": 3.725290520506519e-09, "advantage_min": -0.6526295244693756, "advantage_std": 0.7535315006971359, "completion_length": 2793.104202270508, "epoch": 0.09371428571428571, "grad_norm": 0.10430161654949188, "kl": 0.006566286087036133, "lambda_div_used": 0.5, "learning_rate": 9.888172094375033e-07, "loss": 0.0376, "reward": -0.020408831536769867, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.020408831536769867, "reward_after_std": 0.7535315155982971, "reward_before_mean": 0.5454386062920094, "reward_before_std": 0.7008488159626722, "reward_change_max": 0.0014094933867454529, "reward_change_mean": -0.565847460180521, "reward_change_min": -0.9649605080485344, "reward_change_std": 0.40595896914601326, "reward_std": 0.7535315193235874, "rewards/cosine_scaled_reward": 0.05396931990981102, "rewards/format_reward": 0.4375000074505806, "step": 82 }, { "advantage_max": 1.1509655825793743, "advantage_mean": 1.4280280014045132e-08, "advantage_min": -0.4061584994196892, "advantage_std": 0.5890420638024807, "completion_length": 2931.104179382324, "epoch": 0.09485714285714286, "grad_norm": 0.08529522269964218, "kl": 0.003251791000366211, "lambda_div_used": 0.5, "learning_rate": 9.881105062929221e-07, "loss": -0.0002, "reward": -0.3930655550211668, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.3930655550211668, "reward_after_std": 0.5890420712530613, "reward_before_mean": -0.09176901169121265, "reward_before_std": 0.5422711819410324, "reward_change_max": 0.0007033348083496094, "reward_change_mean": -0.3012965607922524, "reward_change_min": -0.5879498608410358, "reward_change_std": 0.22128173056989908, "reward_std": 0.5890420861542225, "rewards/cosine_scaled_reward": -0.181301174685359, "rewards/format_reward": 0.27083333395421505, "step": 83 }, { "advantage_max": 1.3313812613487244, "advantage_mean": 1.3659397946064189e-08, "advantage_min": -0.5420710518956184, "advantage_std": 0.708248311188072, "completion_length": 3138.3958587646484, "epoch": 0.096, "grad_norm": 0.10056845843791962, "kl": 0.0016238689422607422, "lambda_div_used": 0.5, "learning_rate": 9.873824502603459e-07, "loss": 0.0243, "reward": -0.1541997790336609, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.1541997790336609, "reward_after_std": 0.7082483097910881, "reward_before_mean": 0.31638404354453087, "reward_before_std": 0.6747408444061875, "reward_change_max": 0.0010876953601837158, "reward_change_mean": -0.4705838493537158, "reward_change_min": -0.8658628650009632, "reward_change_std": 0.35376761644147336, "reward_std": 0.7082483172416687, "rewards/cosine_scaled_reward": -0.008474626112729311, "rewards/format_reward": 0.33333334140479565, "step": 84 }, { "advantage_max": 1.905438233166933, "advantage_mean": 9.313225857177088e-09, "advantage_min": -0.6710572019219398, "advantage_std": 0.9832526743412018, "completion_length": 3167.2500610351562, "epoch": 0.09714285714285714, "grad_norm": 0.16388022899627686, "kl": 0.0030014514923095703, "lambda_div_used": 0.5, "learning_rate": 9.866330768241983e-07, "loss": 0.0552, "reward": -0.16078246012330055, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.16078246012330055, "reward_after_std": 0.9832526911050081, "reward_before_mean": 0.20949235220905393, "reward_before_std": 0.968382814899087, "reward_change_max": 0.0020551979541778564, "reward_change_mean": -0.3702748082578182, "reward_change_min": -0.7871052846312523, "reward_change_std": 0.33008442260324955, "reward_std": 0.9832527544349432, "rewards/cosine_scaled_reward": -0.08275382965803146, "rewards/format_reward": 0.37500000558793545, "step": 85 }, { "advantage_max": 1.339600756764412, "advantage_mean": 4.967053768289986e-09, "advantage_min": -0.5849123671650887, "advantage_std": 0.6978941671550274, "completion_length": 3120.5, "epoch": 0.09828571428571428, "grad_norm": 0.14190861582756042, "kl": 0.003871917724609375, "lambda_div_used": 0.5, "learning_rate": 9.85862422507884e-07, "loss": 0.0345, "reward": -0.2158273388631642, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.2158273388631642, "reward_after_std": 0.6978941857814789, "reward_before_mean": 0.2014560904353857, "reward_before_std": 0.6507901139557362, "reward_change_max": 0.0007161274552345276, "reward_change_mean": -0.41728341206908226, "reward_change_min": -0.740577656775713, "reward_change_std": 0.3097661882638931, "reward_std": 0.697894211858511, "rewards/cosine_scaled_reward": -0.08677197038196027, "rewards/format_reward": 0.3750000074505806, "step": 86 }, { "advantage_max": 1.4886211231350899, "advantage_mean": 9.934107758624577e-09, "advantage_min": -0.7102038748562336, "advantage_std": 0.7915925346314907, "completion_length": 2863.166717529297, "epoch": 0.09942857142857142, "grad_norm": 0.13210627436637878, "kl": 0.007967233657836914, "lambda_div_used": 0.5, "learning_rate": 9.850705248720068e-07, "loss": -0.0141, "reward": -0.13769594300538301, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.13769594300538301, "reward_after_std": 0.7915925309062004, "reward_before_mean": 0.3190428577363491, "reward_before_std": 0.7937264069914818, "reward_change_max": 0.0012602433562278748, "reward_change_mean": -0.4567387877032161, "reward_change_min": -0.8872962296009064, "reward_change_std": 0.36563306488096714, "reward_std": 0.7915925495326519, "rewards/cosine_scaled_reward": -0.10089525021612644, "rewards/format_reward": 0.5208333507180214, "step": 87 }, { "advantage_max": 2.120447114109993, "advantage_mean": -5.587935447692871e-09, "advantage_min": -0.865127693861723, "advantage_std": 1.1003176234662533, "completion_length": 3047.0000610351562, "epoch": 0.10057142857142858, "grad_norm": 0.22280767560005188, "kl": 0.009730815887451172, "lambda_div_used": 0.5, "learning_rate": 9.8425742251254e-07, "loss": 0.0637, "reward": -0.025142807513475418, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.025142807513475418, "reward_after_std": 1.100317608565092, "reward_before_mean": 0.4224190632812679, "reward_before_std": 1.0945978332310915, "reward_change_max": 0.0009044483304023743, "reward_change_mean": -0.447561863809824, "reward_change_min": -0.8933478519320488, "reward_change_std": 0.3938889466226101, "reward_std": 1.1003176383674145, "rewards/cosine_scaled_reward": -0.01795714534819126, "rewards/format_reward": 0.4583333469927311, "step": 88 }, { "advantage_max": 1.064413994550705, "advantage_mean": -8.692344843908018e-09, "advantage_min": -0.6095254570245743, "advantage_std": 0.5904275216162205, "completion_length": 3318.4583435058594, "epoch": 0.10171428571428572, "grad_norm": 0.0975460335612297, "kl": 0.004989147186279297, "lambda_div_used": 0.5, "learning_rate": 9.83423155058946e-07, "loss": 0.0134, "reward": -0.29414668679237366, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.29414668679237366, "reward_after_std": 0.5904275141656399, "reward_before_mean": 0.10153248719871044, "reward_before_std": 0.6402346752583981, "reward_change_max": 0.001469351351261139, "reward_change_mean": -0.39567921683192253, "reward_change_min": -0.7922942489385605, "reward_change_std": 0.335257139056921, "reward_std": 0.590427540242672, "rewards/cosine_scaled_reward": -0.06381708104163408, "rewards/format_reward": 0.22916666977107525, "step": 89 }, { "advantage_max": 1.1351367458701134, "advantage_mean": 2.4835269396561444e-09, "advantage_min": -0.5059534460306168, "advantage_std": 0.5984435714781284, "completion_length": 2699.354202270508, "epoch": 0.10285714285714286, "grad_norm": 0.10836907476186752, "kl": 0.016954421997070312, "lambda_div_used": 0.5, "learning_rate": 9.825677631722435e-07, "loss": 0.0272, "reward": -0.37371181324124336, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.37371181324124336, "reward_after_std": 0.5984435267746449, "reward_before_mean": -0.06075846217572689, "reward_before_std": 0.6041407622396946, "reward_change_max": 0.0012149438261985779, "reward_change_mean": -0.31295335572212934, "reward_change_min": -0.637012179940939, "reward_change_std": 0.257972976192832, "reward_std": 0.5984435491263866, "rewards/cosine_scaled_reward": -0.2282959033473162, "rewards/format_reward": 0.39583334513008595, "step": 90 }, { "advantage_max": 1.5629900321364403, "advantage_mean": -8.071462387349015e-09, "advantage_min": -0.6664563044905663, "advantage_std": 0.8228764943778515, "completion_length": 3135.5834045410156, "epoch": 0.104, "grad_norm": 0.15579403936862946, "kl": 0.004920244216918945, "lambda_div_used": 0.5, "learning_rate": 9.816912885430258e-07, "loss": 0.0272, "reward": -0.1463136593811214, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.1463136593811214, "reward_after_std": 0.8228764794766903, "reward_before_mean": 0.29082170128822327, "reward_before_std": 0.8140651769936085, "reward_change_max": 0.00046353042125701904, "reward_change_mean": -0.4371353592723608, "reward_change_min": -0.8551024608314037, "reward_change_std": 0.35122772585600615, "reward_std": 0.8228764906525612, "rewards/cosine_scaled_reward": -0.07333916798233986, "rewards/format_reward": 0.43750000558793545, "step": 91 }, { "advantage_max": 1.1421417072415352, "advantage_mean": 3.725290520506519e-09, "advantage_min": -0.6064575128257275, "advantage_std": 0.6331437919288874, "completion_length": 3024.729202270508, "epoch": 0.10514285714285715, "grad_norm": 0.19823330640792847, "kl": 0.09358072280883789, "lambda_div_used": 0.5, "learning_rate": 9.807937738894303e-07, "loss": 0.0235, "reward": -0.25559084489941597, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.25559084489941597, "reward_after_std": 0.633143799379468, "reward_before_mean": 0.1625400371849537, "reward_before_std": 0.6757265739142895, "reward_change_max": 0.0014690980315208435, "reward_change_mean": -0.418130905367434, "reward_change_min": -0.8377382159233093, "reward_change_std": 0.363980152644217, "reward_std": 0.6331438161432743, "rewards/cosine_scaled_reward": -0.1270633153617382, "rewards/format_reward": 0.4166666753590107, "step": 92 }, { "advantage_max": 0.7856199890375137, "advantage_mean": 2.297262391426358e-08, "advantage_min": -0.33980754390358925, "advantage_std": 0.41527827456593513, "completion_length": 3444.3958740234375, "epoch": 0.10628571428571429, "grad_norm": 0.07124119251966476, "kl": 0.005066871643066406, "lambda_div_used": 0.5, "learning_rate": 9.798752629550546e-07, "loss": 0.0207, "reward": -0.6138800643384457, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.6138800643384457, "reward_after_std": 0.41527827456593513, "reward_before_mean": -0.436697356402874, "reward_before_std": 0.4258286878466606, "reward_change_max": 0.002537757158279419, "reward_change_mean": -0.17718270793557167, "reward_change_min": -0.4357273578643799, "reward_change_std": 0.1735476478934288, "reward_std": 0.41527828946709633, "rewards/cosine_scaled_reward": -0.27043201215565205, "rewards/format_reward": 0.1041666679084301, "step": 93 }, { "advantage_max": 1.3759141564369202, "advantage_mean": -6.208819014474898e-10, "advantage_min": -0.5324600636959076, "advantage_std": 0.7108637019991875, "completion_length": 3188.0416870117188, "epoch": 0.10742857142857143, "grad_norm": 0.10782060027122498, "kl": 0.00946807861328125, "lambda_div_used": 0.5, "learning_rate": 9.78935800506826e-07, "loss": 0.0183, "reward": -0.3538913428783417, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.3538913428783417, "reward_after_std": 0.7108637019991875, "reward_before_mean": -0.05700744315981865, "reward_before_std": 0.7008998841047287, "reward_change_max": 0.00010148435831069946, "reward_change_mean": -0.2968839295208454, "reward_change_min": -0.6375745087862015, "reward_change_std": 0.2521350774914026, "reward_std": 0.7108637318015099, "rewards/cosine_scaled_reward": -0.16392040066421032, "rewards/format_reward": 0.27083333767950535, "step": 94 }, { "advantage_max": 1.2005148455500603, "advantage_mean": 2.7318796669284495e-08, "advantage_min": -0.5142731741070747, "advantage_std": 0.6438871584832668, "completion_length": 3527.0625, "epoch": 0.10857142857142857, "grad_norm": 0.10404420644044876, "kl": 0.002773284912109375, "lambda_div_used": 0.5, "learning_rate": 9.779754323328192e-07, "loss": 0.0085, "reward": -0.41304099559783936, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.41304099559783936, "reward_after_std": 0.6438871622085571, "reward_before_mean": -0.13589461334049702, "reward_before_std": 0.6911821234971285, "reward_change_max": 0.0043373703956604, "reward_change_mean": -0.277146372012794, "reward_change_min": -0.6644494272768497, "reward_change_std": 0.27538349106907845, "reward_std": 0.6438871771097183, "rewards/cosine_scaled_reward": -0.16169731132686138, "rewards/format_reward": 0.1875000037252903, "step": 95 }, { "advantage_max": 1.451909989118576, "advantage_mean": 3.7252904094842165e-09, "advantage_min": -0.6129937767982483, "advantage_std": 0.7681609587743878, "completion_length": 3130.5208740234375, "epoch": 0.10971428571428571, "grad_norm": 0.13211211562156677, "kl": 0.007869243621826172, "lambda_div_used": 0.5, "learning_rate": 9.769942052400235e-07, "loss": 0.0646, "reward": -0.2191711962223053, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.2191711962223053, "reward_after_std": 0.7681609885767102, "reward_before_mean": 0.17698761075735092, "reward_before_std": 0.7817029012367129, "reward_change_max": 0.0002824366092681885, "reward_change_mean": -0.39615882001817226, "reward_change_min": -0.8239215202629566, "reward_change_std": 0.3346262890845537, "reward_std": 0.7681610230356455, "rewards/cosine_scaled_reward": -0.06775618996471167, "rewards/format_reward": 0.3125000037252903, "step": 96 }, { "advantage_max": 1.2568126060068607, "advantage_mean": 6.829699139565548e-09, "advantage_min": -0.5489920303225517, "advantage_std": 0.6930233538150787, "completion_length": 3283.2083435058594, "epoch": 0.11085714285714286, "grad_norm": 0.1216772198677063, "kl": 0.00519561767578125, "lambda_div_used": 0.5, "learning_rate": 9.759921670520634e-07, "loss": 0.0276, "reward": -0.2372879907488823, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.2372879907488823, "reward_after_std": 0.6930233649909496, "reward_before_mean": 0.17726046219468117, "reward_before_std": 0.727190125733614, "reward_change_max": 0.0016672611236572266, "reward_change_mean": -0.41454842686653137, "reward_change_min": -0.9322738572955132, "reward_change_std": 0.39632497169077396, "reward_std": 0.693023394793272, "rewards/cosine_scaled_reward": -0.057203130796551704, "rewards/format_reward": 0.2916666679084301, "step": 97 }, { "advantage_max": 1.1389856860041618, "advantage_mean": 1.0554989604560916e-08, "advantage_min": -0.5510528981685638, "advantage_std": 0.6010327264666557, "completion_length": 3024.937530517578, "epoch": 0.112, "grad_norm": 0.10726859420537949, "kl": 0.004727840423583984, "lambda_div_used": 0.5, "learning_rate": 9.749693666068663e-07, "loss": 0.0654, "reward": -0.31532357865944505, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.31532357865944505, "reward_after_std": 0.6010327264666557, "reward_before_mean": 0.05080095771700144, "reward_before_std": 0.5885812118649483, "reward_change_max": 0.0008204132318496704, "reward_change_mean": -0.3661245256662369, "reward_change_min": -0.6775789931416512, "reward_change_std": 0.2766888150945306, "reward_std": 0.6010327376425266, "rewards/cosine_scaled_reward": -0.15168285369873047, "rewards/format_reward": 0.35416668094694614, "step": 98 }, { "advantage_max": 1.061452217400074, "advantage_mean": 1.3659397946064189e-08, "advantage_min": -0.5463209822773933, "advantage_std": 0.5901384837925434, "completion_length": 2968.5416870117188, "epoch": 0.11314285714285714, "grad_norm": 0.12127479910850525, "kl": 0.008264541625976562, "lambda_div_used": 0.5, "learning_rate": 9.739258537542835e-07, "loss": 0.0124, "reward": -0.3719342704862356, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.3719342704862356, "reward_after_std": 0.5901384837925434, "reward_before_mean": -0.04033519048243761, "reward_before_std": 0.6591959446668625, "reward_change_max": 0.0008135139942169189, "reward_change_mean": -0.3315990660339594, "reward_change_min": -0.7541460432112217, "reward_change_std": 0.3205462880432606, "reward_std": 0.5901384949684143, "rewards/cosine_scaled_reward": -0.15558426547795534, "rewards/format_reward": 0.2708333358168602, "step": 99 }, { "advantage_max": 1.375246461480856, "advantage_mean": 1.3659398168108794e-08, "advantage_min": -0.6167638674378395, "advantage_std": 0.7283363081514835, "completion_length": 2946.0208740234375, "epoch": 0.11428571428571428, "grad_norm": 0.08893745392560959, "kl": 0.009603500366210938, "lambda_div_used": 0.5, "learning_rate": 9.728616793536587e-07, "loss": 0.0463, "reward": -0.23348002135753632, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.23348002135753632, "reward_after_std": 0.7283363118767738, "reward_before_mean": 0.16290868259966373, "reward_before_std": 0.7378918267786503, "reward_change_max": 0.0015308186411857605, "reward_change_mean": -0.39638871792703867, "reward_change_min": -0.7486597262322903, "reward_change_std": 0.31783370301127434, "reward_std": 0.728336326777935, "rewards/cosine_scaled_reward": -0.08521231822669506, "rewards/format_reward": 0.33333333395421505, "step": 100 }, { "advantage_max": 1.0726460739970207, "advantage_mean": 2.8560559472978753e-08, "advantage_min": -0.40579839795827866, "advantage_std": 0.5493858084082603, "completion_length": 2935.854232788086, "epoch": 0.11542857142857142, "grad_norm": 0.0939834713935852, "kl": 0.006221771240234375, "lambda_div_used": 0.5, "learning_rate": 9.717768952713511e-07, "loss": 0.0479, "reward": -0.2715232199989259, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.2715232199989259, "reward_after_std": 0.5493858009576797, "reward_before_mean": 0.14305181056261063, "reward_before_std": 0.4525942765176296, "reward_change_max": 0.0, "reward_change_mean": -0.4145750030875206, "reward_change_min": -0.69618084654212, "reward_change_std": 0.25990021973848343, "reward_std": 0.5493858084082603, "rewards/cosine_scaled_reward": -0.09514076914638281, "rewards/format_reward": 0.3333333395421505, "step": 101 }, { "advantage_max": 1.60531947016716, "advantage_mean": 2.4835269396561444e-09, "advantage_min": -0.7529873549938202, "advantage_std": 0.8508248142898083, "completion_length": 2944.1459045410156, "epoch": 0.11657142857142858, "grad_norm": 0.15680734813213348, "kl": 0.01442718505859375, "lambda_div_used": 0.5, "learning_rate": 9.706715543782064e-07, "loss": 0.0135, "reward": -0.0667097344994545, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.0667097344994545, "reward_after_std": 0.8508248310536146, "reward_before_mean": 0.4317157156765461, "reward_before_std": 0.8481930792331696, "reward_change_max": 0.0004781857132911682, "reward_change_mean": -0.4984254566952586, "reward_change_min": -0.9218480922281742, "reward_change_std": 0.3881477224640548, "reward_std": 0.8508248627185822, "rewards/cosine_scaled_reward": -0.04455881821922958, "rewards/format_reward": 0.5208333414047956, "step": 102 }, { "advantage_max": 1.4402986355125904, "advantage_mean": -9.313226190243995e-09, "advantage_min": -0.591527882963419, "advantage_std": 0.7503146361559629, "completion_length": 3138.3958892822266, "epoch": 0.11771428571428572, "grad_norm": 0.14295694231987, "kl": 0.010210037231445312, "lambda_div_used": 0.5, "learning_rate": 9.695457105469804e-07, "loss": 0.0599, "reward": -0.2733981416095048, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.2733981416095048, "reward_after_std": 0.7503146324306726, "reward_before_mean": 0.07834052480757236, "reward_before_std": 0.7342335116118193, "reward_change_max": 0.003089629113674164, "reward_change_mean": -0.35173869086429477, "reward_change_min": -0.7178602293133736, "reward_change_std": 0.29554732143878937, "reward_std": 0.7503146436065435, "rewards/cosine_scaled_reward": -0.17957975156605244, "rewards/format_reward": 0.4375000111758709, "step": 103 }, { "advantage_max": 0.9793983772397041, "advantage_mean": 4.967053435223079e-09, "advantage_min": -0.4478438273072243, "advantage_std": 0.5177636370062828, "completion_length": 2624.9166717529297, "epoch": 0.11885714285714286, "grad_norm": 0.11209587007761002, "kl": 0.0073070526123046875, "lambda_div_used": 0.5, "learning_rate": 9.683994186497132e-07, "loss": 0.0123, "reward": -0.28253607312217355, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.28253607312217355, "reward_after_std": 0.5177636258304119, "reward_before_mean": 0.1391938179731369, "reward_before_std": 0.47612931579351425, "reward_change_max": 6.999820470809937e-05, "reward_change_mean": -0.42172990553081036, "reward_change_min": -0.7518970184028149, "reward_change_std": 0.2917201966047287, "reward_std": 0.517763651907444, "rewards/cosine_scaled_reward": -0.13873642776161432, "rewards/format_reward": 0.4166666679084301, "step": 104 }, { "advantage_max": 1.5320292636752129, "advantage_mean": 2.1109978987077227e-08, "advantage_min": -0.7886748686432838, "advantage_std": 0.8270585872232914, "completion_length": 2966.0833892822266, "epoch": 0.12, "grad_norm": 0.1404057890176773, "kl": 0.009779930114746094, "lambda_div_used": 0.5, "learning_rate": 9.672327345550543e-07, "loss": 0.0745, "reward": -0.13192696264013648, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.13192696264013648, "reward_after_std": 0.8270585723221302, "reward_before_mean": 0.3213406689465046, "reward_before_std": 0.8794911131262779, "reward_change_max": 0.0026951581239700317, "reward_change_mean": -0.45326761342585087, "reward_change_min": -0.9718286655843258, "reward_change_std": 0.41331026889383793, "reward_std": 0.8270585872232914, "rewards/cosine_scaled_reward": -0.016412993194535375, "rewards/format_reward": 0.35416668094694614, "step": 105 }, { "advantage_max": 1.729747325181961, "advantage_mean": -4.967053546245381e-09, "advantage_min": -0.7270858883857727, "advantage_std": 0.9044813252985477, "completion_length": 2396.541717529297, "epoch": 0.12114285714285715, "grad_norm": 0.09820910543203354, "kl": 0.007421970367431641, "lambda_div_used": 0.5, "learning_rate": 9.66045715125541e-07, "loss": -0.0122, "reward": 0.2950130708049983, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.2950130708049983, "reward_after_std": 0.9044813551008701, "reward_before_mean": 1.0742302685976028, "reward_before_std": 0.777211144566536, "reward_change_max": 0.0, "reward_change_mean": -0.7792172282934189, "reward_change_min": -1.333798922598362, "reward_change_std": 0.5166942048817873, "reward_std": 0.9044813700020313, "rewards/cosine_scaled_reward": 0.17253179172985256, "rewards/format_reward": 0.7291666679084301, "step": 106 }, { "advantage_max": 1.1745030768215656, "advantage_mean": 4.3461723664428575e-09, "advantage_min": -0.48186010867357254, "advantage_std": 0.610499557107687, "completion_length": 2884.916702270508, "epoch": 0.12228571428571429, "grad_norm": 0.09980223327875137, "kl": 0.007454872131347656, "lambda_div_used": 0.5, "learning_rate": 9.648384182148252e-07, "loss": 0.0344, "reward": -0.09255528822541237, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.09255528822541237, "reward_after_std": 0.6104995384812355, "reward_before_mean": 0.4557968080043793, "reward_before_std": 0.4952854886651039, "reward_change_max": 0.0, "reward_change_mean": -0.5483520794659853, "reward_change_min": -0.876308511942625, "reward_change_std": 0.3524962291121483, "reward_std": 0.610499557107687, "rewards/cosine_scaled_reward": -0.02210160903632641, "rewards/format_reward": 0.5000000037252903, "step": 107 }, { "advantage_max": 1.6280250921845436, "advantage_mean": -2.4835271617007493e-09, "advantage_min": -0.7997777834534645, "advantage_std": 0.8667001165449619, "completion_length": 2976.479217529297, "epoch": 0.12342857142857143, "grad_norm": 6.980273723602295, "kl": 0.41866397857666016, "lambda_div_used": 0.5, "learning_rate": 9.636109026648554e-07, "loss": 0.072, "reward": -0.14665891602635384, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.14665891602635384, "reward_after_std": 0.8667001202702522, "reward_before_mean": 0.27829501312226057, "reward_before_std": 0.9044629707932472, "reward_change_max": 0.001452334225177765, "reward_change_mean": -0.4249539338052273, "reward_change_min": -0.9039922542870045, "reward_change_std": 0.38425787910819054, "reward_std": 0.866700142621994, "rewards/cosine_scaled_reward": -0.0587691655382514, "rewards/format_reward": 0.3958333469927311, "step": 108 }, { "advantage_max": 1.293064869940281, "advantage_mean": 8.692344288796505e-09, "advantage_min": -0.5089701935648918, "advantage_std": 0.6711876504123211, "completion_length": 3053.7708740234375, "epoch": 0.12457142857142857, "grad_norm": 0.09334102272987366, "kl": 0.005675315856933594, "lambda_div_used": 0.5, "learning_rate": 9.623632283030077e-07, "loss": 0.0178, "reward": -0.24590441305190325, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.24590441305190325, "reward_after_std": 0.6711876504123211, "reward_before_mean": 0.15391119476407766, "reward_before_std": 0.628238283097744, "reward_change_max": 0.0, "reward_change_mean": -0.3998156199231744, "reward_change_min": -0.7111305296421051, "reward_change_std": 0.27578442357480526, "reward_std": 0.6711876578629017, "rewards/cosine_scaled_reward": -0.08971107471734285, "rewards/format_reward": 0.3333333395421505, "step": 109 }, { "advantage_max": 1.5743919759988785, "advantage_mean": 1.3659398279131096e-08, "advantage_min": -0.6370358616113663, "advantage_std": 0.8361404724419117, "completion_length": 3043.0208740234375, "epoch": 0.12571428571428572, "grad_norm": 0.12074672430753708, "kl": 0.009097099304199219, "lambda_div_used": 0.5, "learning_rate": 9.610954559391704e-07, "loss": 0.0472, "reward": -0.17771138809621334, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.17771138809621334, "reward_after_std": 0.8361404724419117, "reward_before_mean": 0.23106246162205935, "reward_before_std": 0.8586977384984493, "reward_change_max": 0.0, "reward_change_mean": -0.4087738338857889, "reward_change_min": -0.9307141229510307, "reward_change_std": 0.3709069453179836, "reward_std": 0.8361404910683632, "rewards/cosine_scaled_reward": -0.08238544082269073, "rewards/format_reward": 0.39583333767950535, "step": 110 }, { "advantage_max": 1.231549710035324, "advantage_mean": 7.450580929990736e-09, "advantage_min": -0.690878614783287, "advantage_std": 0.6834078840911388, "completion_length": 3437.875030517578, "epoch": 0.12685714285714286, "grad_norm": 0.12677572667598724, "kl": 0.0095977783203125, "lambda_div_used": 0.5, "learning_rate": 9.598076473627796e-07, "loss": 0.0257, "reward": -0.21460825204849243, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.21460825204849243, "reward_after_std": 0.6834078989923, "reward_before_mean": 0.22259395569562912, "reward_before_std": 0.7492858357727528, "reward_change_max": 0.0012400075793266296, "reward_change_mean": -0.4372022282332182, "reward_change_min": -0.8609942458570004, "reward_change_std": 0.37953019607812166, "reward_std": 0.6834079250693321, "rewards/cosine_scaled_reward": -0.013703018426895142, "rewards/format_reward": 0.2500000074505806, "step": 111 }, { "advantage_max": 1.390410177409649, "advantage_mean": 2.6697914878859308e-08, "advantage_min": -0.5769059509038925, "advantage_std": 0.7328597828745842, "completion_length": 3354.541717529297, "epoch": 0.128, "grad_norm": 0.13290849328041077, "kl": 0.005608558654785156, "lambda_div_used": 0.5, "learning_rate": 9.58499865339809e-07, "loss": 0.0329, "reward": 0.019102992489933968, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.019102992489933968, "reward_after_std": 0.7328597791492939, "reward_before_mean": 0.6243925159797072, "reward_before_std": 0.6275855414569378, "reward_change_max": 0.0005017220973968506, "reward_change_mean": -0.6052894797176123, "reward_change_min": -0.9622363597154617, "reward_change_std": 0.40619419515132904, "reward_std": 0.7328597903251648, "rewards/cosine_scaled_reward": 0.12469624355435371, "rewards/format_reward": 0.3750000037252903, "step": 112 }, { "advantage_max": 1.474353551864624, "advantage_mean": 9.313226190243995e-09, "advantage_min": -0.5791169889271259, "advantage_std": 0.7638243734836578, "completion_length": 2959.8750610351562, "epoch": 0.12914285714285714, "grad_norm": 0.2007800042629242, "kl": 0.011220932006835938, "lambda_div_used": 0.5, "learning_rate": 9.571721736097088e-07, "loss": 0.0969, "reward": -0.2861519819125533, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.2861519819125533, "reward_after_std": 0.7638243734836578, "reward_before_mean": 0.049024675972759724, "reward_before_std": 0.7449628822505474, "reward_change_max": 0.00041237473487854004, "reward_change_mean": -0.33517665788531303, "reward_change_min": -0.7155702412128448, "reward_change_std": 0.2762441807426512, "reward_std": 0.7638244070112705, "rewards/cosine_scaled_reward": -0.17340433155186474, "rewards/format_reward": 0.3958333432674408, "step": 113 }, { "advantage_max": 1.3445461466908455, "advantage_mean": 3.725290742551124e-09, "advantage_min": -0.4992591068148613, "advantage_std": 0.699359804391861, "completion_length": 2696.500068664551, "epoch": 0.13028571428571428, "grad_norm": 0.10839706659317017, "kl": 0.006946563720703125, "lambda_div_used": 0.5, "learning_rate": 9.55824636882301e-07, "loss": 0.0255, "reward": -0.26514838729053736, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.26514838729053736, "reward_after_std": 0.6993598081171513, "reward_before_mean": 0.11108050495386124, "reward_before_std": 0.6611098386347294, "reward_change_max": 0.001189887523651123, "reward_change_mean": -0.3762288950383663, "reward_change_min": -0.7401161417365074, "reward_change_std": 0.29690456204116344, "reward_std": 0.6993598081171513, "rewards/cosine_scaled_reward": -0.21529308333992958, "rewards/format_reward": 0.5416666772216558, "step": 114 }, { "advantage_max": 1.2387553304433823, "advantage_mean": -3.3306690738754696e-16, "advantage_min": -0.4966389983892441, "advantage_std": 0.6462426483631134, "completion_length": 2998.0208740234375, "epoch": 0.13142857142857142, "grad_norm": 0.08023947477340698, "kl": 0.007321357727050781, "lambda_div_used": 0.5, "learning_rate": 9.54457320834625e-07, "loss": 0.0372, "reward": -0.3627615012228489, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.3627615012228489, "reward_after_std": 0.6462426148355007, "reward_before_mean": -0.05127234756946564, "reward_before_std": 0.6328899078071117, "reward_change_max": 0.001132287085056305, "reward_change_mean": -0.311489156447351, "reward_change_min": -0.6855304539203644, "reward_change_std": 0.26617162115871906, "reward_std": 0.6462426371872425, "rewards/cosine_scaled_reward": -0.17146951146423817, "rewards/format_reward": 0.2916666716337204, "step": 115 }, { "advantage_max": 1.5859102010726929, "advantage_mean": 2.220446049250313e-16, "advantage_min": -0.5750259384512901, "advantage_std": 0.8160628713667393, "completion_length": 3465.000030517578, "epoch": 0.13257142857142856, "grad_norm": 0.1530701220035553, "kl": 0.007579803466796875, "lambda_div_used": 0.5, "learning_rate": 9.530702921077358e-07, "loss": 0.0211, "reward": -0.32605776842683554, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.32605776842683554, "reward_after_std": 0.8160628564655781, "reward_before_mean": -0.04084068536758423, "reward_before_std": 0.8062328286468983, "reward_change_max": 0.0008537173271179199, "reward_change_mean": -0.28521708957850933, "reward_change_min": -0.5912484526634216, "reward_change_std": 0.2459753742441535, "reward_std": 0.8160628788173199, "rewards/cosine_scaled_reward": -0.1037536843214184, "rewards/format_reward": 0.1666666716337204, "step": 116 }, { "advantage_max": 1.0990208461880684, "advantage_mean": 2.359350581571107e-08, "advantage_min": -0.3993668518960476, "advantage_std": 0.5640300773084164, "completion_length": 3263.916717529297, "epoch": 0.1337142857142857, "grad_norm": 0.08252304047346115, "kl": 0.010234832763671875, "lambda_div_used": 0.5, "learning_rate": 9.516636183034564e-07, "loss": 0.0192, "reward": -0.4662362337112427, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.4662362337112427, "reward_after_std": 0.5640300661325455, "reward_before_mean": -0.22322246711701155, "reward_before_std": 0.5282933078706264, "reward_change_max": 0.0006141439080238342, "reward_change_mean": -0.24301375821232796, "reward_change_min": -0.5001362860202789, "reward_change_std": 0.19844415225088596, "reward_std": 0.5640300773084164, "rewards/cosine_scaled_reward": -0.22619456890970469, "rewards/format_reward": 0.22916666977107525, "step": 117 }, { "advantage_max": 1.5211386159062386, "advantage_mean": -2.483527050678447e-09, "advantage_min": -0.755425862967968, "advantage_std": 0.8458346724510193, "completion_length": 3119.2083435058594, "epoch": 0.13485714285714287, "grad_norm": 0.15521512925624847, "kl": 0.00604248046875, "lambda_div_used": 0.5, "learning_rate": 9.502373679810839e-07, "loss": 0.0558, "reward": -0.12143947370350361, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.12143947370350361, "reward_after_std": 0.8458346650004387, "reward_before_mean": 0.3443289026618004, "reward_before_std": 0.9570517912507057, "reward_change_max": 0.001078963279724121, "reward_change_mean": -0.4657684173434973, "reward_change_min": -1.147033091634512, "reward_change_std": 0.46905333921313286, "reward_std": 0.8458347134292126, "rewards/cosine_scaled_reward": 0.01591446064412594, "rewards/format_reward": 0.3125000037252903, "step": 118 }, { "advantage_max": 1.3600810691714287, "advantage_mean": 1.2417638028949796e-09, "advantage_min": -0.7468793988227844, "advantage_std": 0.7505389116704464, "completion_length": 2491.5833740234375, "epoch": 0.136, "grad_norm": 0.26733919978141785, "kl": 0.108642578125, "lambda_div_used": 0.5, "learning_rate": 9.487916106540465e-07, "loss": 0.0267, "reward": -0.01912129484117031, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.01912129484117031, "reward_after_std": 0.750538919121027, "reward_before_mean": 0.5529840067028999, "reward_before_std": 0.7970850095152855, "reward_change_max": 0.006842672824859619, "reward_change_mean": -0.5721052885055542, "reward_change_min": -1.084109291434288, "reward_change_std": 0.4520927872508764, "reward_std": 0.7505389600992203, "rewards/cosine_scaled_reward": 0.016075339168310165, "rewards/format_reward": 0.5208333432674408, "step": 119 }, { "advantage_max": 1.5714844167232513, "advantage_mean": -3.1044086745701804e-09, "advantage_min": -0.6521871276199818, "advantage_std": 0.8299883455038071, "completion_length": 2594.375045776367, "epoch": 0.13714285714285715, "grad_norm": 0.1550913006067276, "kl": 0.01110076904296875, "lambda_div_used": 0.5, "learning_rate": 9.473264167865171e-07, "loss": -0.0004, "reward": 0.07176335965050384, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.07176335965050384, "reward_after_std": 0.8299883455038071, "reward_before_mean": 0.6873215809464455, "reward_before_std": 0.7673061080276966, "reward_change_max": 0.0006802454590797424, "reward_change_mean": -0.6155582182109356, "reward_change_min": -1.1995624154806137, "reward_change_std": 0.4468677435070276, "reward_std": 0.8299883641302586, "rewards/cosine_scaled_reward": 0.062410795129835606, "rewards/format_reward": 0.562500013038516, "step": 120 }, { "advantage_max": 1.3429175913333893, "advantage_mean": 6.208816794028849e-10, "advantage_min": -0.591683205217123, "advantage_std": 0.698270071297884, "completion_length": 2149.8958740234375, "epoch": 0.1382857142857143, "grad_norm": 0.2023954689502716, "kl": 0.013248443603515625, "lambda_div_used": 0.5, "learning_rate": 9.458418577899774e-07, "loss": 0.0709, "reward": -0.12103883270174265, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.12103883270174265, "reward_after_std": 0.698270071297884, "reward_before_mean": 0.3712943270802498, "reward_before_std": 0.6273719593882561, "reward_change_max": 0.0, "reward_change_mean": -0.49233318492770195, "reward_change_min": -0.8092544637620449, "reward_change_std": 0.31992023810744286, "reward_std": 0.698270071297884, "rewards/cosine_scaled_reward": -0.1268528364598751, "rewards/format_reward": 0.6250000055879354, "step": 121 }, { "advantage_max": 1.5647741705179214, "advantage_mean": -6.208815128694312e-10, "advantage_min": -0.7578405737876892, "advantage_std": 0.8595303483307362, "completion_length": 2891.0416717529297, "epoch": 0.13942857142857143, "grad_norm": 0.15973925590515137, "kl": 0.008546829223632812, "lambda_div_used": 0.5, "learning_rate": 9.443380060197385e-07, "loss": 0.0531, "reward": -0.08176559396088123, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.08176559396088123, "reward_after_std": 0.8595303483307362, "reward_before_mean": 0.4093674011528492, "reward_before_std": 0.9295827485620975, "reward_change_max": 0.001126319169998169, "reward_change_mean": -0.4911329858005047, "reward_change_min": -1.1455222107470036, "reward_change_std": 0.4624634627252817, "reward_std": 0.8595303595066071, "rewards/cosine_scaled_reward": -0.014066309202462435, "rewards/format_reward": 0.43750000931322575, "step": 122 }, { "advantage_max": 1.4153951033949852, "advantage_mean": 6.829699139565548e-09, "advantage_min": -0.5769023820757866, "advantage_std": 0.7529639042913914, "completion_length": 3128.0625610351562, "epoch": 0.14057142857142857, "grad_norm": 0.133604496717453, "kl": 0.007900238037109375, "lambda_div_used": 0.5, "learning_rate": 9.428149347714143e-07, "loss": 0.0562, "reward": -0.24905523657798767, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.24905523657798767, "reward_after_std": 0.7529638931155205, "reward_before_mean": 0.1262472216039896, "reward_before_std": 0.7704966329038143, "reward_change_max": 0.001382298767566681, "reward_change_mean": -0.37530247680842876, "reward_change_min": -0.8108473680913448, "reward_change_std": 0.34216398000717163, "reward_std": 0.7529639154672623, "rewards/cosine_scaled_reward": -0.11395972780883312, "rewards/format_reward": 0.35416666977107525, "step": 123 }, { "advantage_max": 1.3590355888009071, "advantage_mean": -4.967053879312289e-09, "advantage_min": -0.5283945128321648, "advantage_std": 0.703603483736515, "completion_length": 2643.3125534057617, "epoch": 0.1417142857142857, "grad_norm": 0.10262785851955414, "kl": 0.010004043579101562, "lambda_div_used": 0.5, "learning_rate": 9.412727182773486e-07, "loss": 0.0245, "reward": -0.10296567948535085, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.10296567948535085, "reward_after_std": 0.7036034874618053, "reward_before_mean": 0.4043073896318674, "reward_before_std": 0.5893766656517982, "reward_change_max": 0.0, "reward_change_mean": -0.5072730649262667, "reward_change_min": -0.900951974093914, "reward_change_std": 0.35020011104643345, "reward_std": 0.7036035098135471, "rewards/cosine_scaled_reward": -0.027012981940060854, "rewards/format_reward": 0.45833334140479565, "step": 124 }, { "advantage_max": 0.9600905813276768, "advantage_mean": 1.8005570145973593e-08, "advantage_min": -0.5070587620139122, "advantage_std": 0.5214144960045815, "completion_length": 2769.6666870117188, "epoch": 0.14285714285714285, "grad_norm": 0.05761028453707695, "kl": 0.00707244873046875, "lambda_div_used": 0.5, "learning_rate": 9.397114317029974e-07, "loss": 0.0083, "reward": -0.2994791641831398, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.2994791641831398, "reward_after_std": 0.5214144885540009, "reward_before_mean": 0.10765105485916138, "reward_before_std": 0.5211964379996061, "reward_change_max": 0.001614697277545929, "reward_change_mean": -0.40713020414114, "reward_change_min": -0.7412909679114819, "reward_change_std": 0.3003601636737585, "reward_std": 0.5214144997298717, "rewards/cosine_scaled_reward": -0.09200781211256981, "rewards/format_reward": 0.2916666679084301, "step": 125 }, { "advantage_max": 1.369416281580925, "advantage_mean": 4.3461718668424965e-09, "advantage_min": -0.5342150218784809, "advantage_std": 0.718287467956543, "completion_length": 2917.666732788086, "epoch": 0.144, "grad_norm": 0.11026092618703842, "kl": 0.0053558349609375, "lambda_div_used": 0.5, "learning_rate": 9.381311511432658e-07, "loss": 0.0133, "reward": -0.14198202081024647, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.14198202081024647, "reward_after_std": 0.7182874493300915, "reward_before_mean": 0.3325531389564276, "reward_before_std": 0.6610416546463966, "reward_change_max": 0.00305301696062088, "reward_change_mean": -0.4745351132005453, "reward_change_min": -0.8417270965874195, "reward_change_std": 0.34801490139216185, "reward_std": 0.7182874642312527, "rewards/cosine_scaled_reward": -0.07330678962171078, "rewards/format_reward": 0.47916667349636555, "step": 126 }, { "advantage_max": 1.3165396451950073, "advantage_mean": -9.93410786964688e-09, "advantage_min": -0.5570442825555801, "advantage_std": 0.7028694860637188, "completion_length": 3262.8541870117188, "epoch": 0.14514285714285713, "grad_norm": 0.11621111631393433, "kl": 0.00928497314453125, "lambda_div_used": 0.5, "learning_rate": 9.36531953618799e-07, "loss": 0.041, "reward": -0.4189454587176442, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.4189454587176442, "reward_after_std": 0.7028694786131382, "reward_before_mean": -0.16826728964224458, "reward_before_std": 0.7615924999117851, "reward_change_max": 0.0017796605825424194, "reward_change_mean": -0.2506781928241253, "reward_change_min": -0.7534624300897121, "reward_change_std": 0.301827160641551, "reward_std": 0.7028694786131382, "rewards/cosine_scaled_reward": -0.22996697621420026, "rewards/format_reward": 0.2916666716337204, "step": 127 }, { "advantage_max": 1.6697375662624836, "advantage_mean": 2.173086055545781e-08, "advantage_min": -0.7176847979426384, "advantage_std": 0.9009743053466082, "completion_length": 2810.4583740234375, "epoch": 0.1462857142857143, "grad_norm": 0.11985532194375992, "kl": 0.007503509521484375, "lambda_div_used": 0.5, "learning_rate": 9.34913917072228e-07, "loss": 0.0328, "reward": -0.04913006443530321, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.04913006443530321, "reward_after_std": 0.9009743090718985, "reward_before_mean": 0.452212794451043, "reward_before_std": 0.9385515972971916, "reward_change_max": 0.0008388608694076538, "reward_change_mean": -0.5013428647071123, "reward_change_min": -0.9899672120809555, "reward_change_std": 0.42710711527615786, "reward_std": 0.9009743295609951, "rewards/cosine_scaled_reward": 0.01777306676376611, "rewards/format_reward": 0.4166666716337204, "step": 128 }, { "advantage_max": 1.5713545978069305, "advantage_mean": 1.117587078436344e-08, "advantage_min": -0.5929481983184814, "advantage_std": 0.8208220899105072, "completion_length": 3373.1458435058594, "epoch": 0.14742857142857144, "grad_norm": 0.21055659651756287, "kl": 0.01023101806640625, "lambda_div_used": 0.5, "learning_rate": 9.332771203643714e-07, "loss": 0.0635, "reward": -0.34678191784769297, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.34678191784769297, "reward_after_std": 0.8208221010863781, "reward_before_mean": -0.07890627905726433, "reward_before_std": 0.8488923981785774, "reward_change_max": 0.0015719234943389893, "reward_change_mean": -0.26787563413381577, "reward_change_min": -0.6541193760931492, "reward_change_std": 0.2729021832346916, "reward_std": 0.8208221010863781, "rewards/cosine_scaled_reward": -0.13320314860902727, "rewards/format_reward": 0.1875000037252903, "step": 129 }, { "advantage_max": 1.3760394155979156, "advantage_mean": 3.7252904094842165e-09, "advantage_min": -0.5131339877843857, "advantage_std": 0.7071768157184124, "completion_length": 3168.5833740234375, "epoch": 0.14857142857142858, "grad_norm": 0.10870277136564255, "kl": 0.009489059448242188, "lambda_div_used": 0.5, "learning_rate": 9.316216432703916e-07, "loss": 0.0555, "reward": -0.3918801546096802, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.3918801546096802, "reward_after_std": 0.70717678591609, "reward_before_mean": -0.1307674515992403, "reward_before_std": 0.6902662832289934, "reward_change_max": 0.0022120848298072815, "reward_change_mean": -0.26111270394176245, "reward_change_min": -0.5894866921007633, "reward_change_std": 0.2326252982020378, "reward_std": 0.7071768119931221, "rewards/cosine_scaled_reward": -0.16955040022730827, "rewards/format_reward": 0.2083333358168602, "step": 130 }, { "advantage_max": 1.5219105705618858, "advantage_mean": 2.0489097529718947e-08, "advantage_min": -0.7303340062499046, "advantage_std": 0.8300552815198898, "completion_length": 2928.6875762939453, "epoch": 0.14971428571428572, "grad_norm": 0.1419273018836975, "kl": 0.01134490966796875, "lambda_div_used": 0.5, "learning_rate": 9.299475664759068e-07, "loss": 0.0631, "reward": -0.060884641483426094, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.060884641483426094, "reward_after_std": 0.8300552740693092, "reward_before_mean": 0.4547289116308093, "reward_before_std": 0.8692549020051956, "reward_change_max": 0.0035802796483039856, "reward_change_mean": -0.5156135559082031, "reward_change_min": -1.0946178510785103, "reward_change_std": 0.44368345849215984, "reward_std": 0.8300552740693092, "rewards/cosine_scaled_reward": 0.029447784181684256, "rewards/format_reward": 0.39583334140479565, "step": 131 }, { "advantage_max": 2.0168206430971622, "advantage_mean": 3.104408841103634e-09, "advantage_min": -0.8502785339951515, "advantage_std": 1.0592867005616426, "completion_length": 2758.062515258789, "epoch": 0.15085714285714286, "grad_norm": 0.17530718445777893, "kl": 0.00824737548828125, "lambda_div_used": 0.5, "learning_rate": 9.282549715730579e-07, "loss": 0.0628, "reward": 0.035275431582704186, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.035275431582704186, "reward_after_std": 1.0592866968363523, "reward_before_mean": 0.543465293943882, "reward_before_std": 1.0686840042471886, "reward_change_max": 0.0006739124655723572, "reward_change_mean": -0.5081898486241698, "reward_change_min": -1.1129607036709785, "reward_change_std": 0.44032883644104004, "reward_std": 1.0592867471277714, "rewards/cosine_scaled_reward": 0.052982633154897485, "rewards/format_reward": 0.43750000558793545, "step": 132 }, { "advantage_max": 0.970925759524107, "advantage_mean": 3.2285850104507574e-08, "advantage_min": -0.3871290944516659, "advantage_std": 0.5082768350839615, "completion_length": 3336.2291870117188, "epoch": 0.152, "grad_norm": 0.08025325834751129, "kl": 0.012659072875976562, "lambda_div_used": 0.5, "learning_rate": 9.265439410565328e-07, "loss": 0.013, "reward": -0.49410490319132805, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.49410490319132805, "reward_after_std": 0.5082768425345421, "reward_before_mean": -0.248309426009655, "reward_before_std": 0.5010492280125618, "reward_change_max": 0.0004126057028770447, "reward_change_mean": -0.24579549115151167, "reward_change_min": -0.5601531192660332, "reward_change_std": 0.21191422455012798, "reward_std": 0.508276853710413, "rewards/cosine_scaled_reward": -0.2179047055542469, "rewards/format_reward": 0.18750000186264515, "step": 133 }, { "advantage_max": 1.5898833870887756, "advantage_mean": 4.035731165918932e-09, "advantage_min": -0.6322812959551811, "advantage_std": 0.8210309743881226, "completion_length": 2692.0416870117188, "epoch": 0.15314285714285714, "grad_norm": 0.16357369720935822, "kl": 0.0129547119140625, "lambda_div_used": 0.5, "learning_rate": 9.248145583195447e-07, "loss": 0.0494, "reward": -0.11071780603379011, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.11071780603379011, "reward_after_std": 0.8210309743881226, "reward_before_mean": 0.354950032196939, "reward_before_std": 0.7637733723968267, "reward_change_max": 4.401057958602905e-05, "reward_change_mean": -0.4656678568571806, "reward_change_min": -0.8694765791296959, "reward_change_std": 0.3447660394012928, "reward_std": 0.8210310265421867, "rewards/cosine_scaled_reward": -0.06210831506177783, "rewards/format_reward": 0.4791666716337204, "step": 134 }, { "advantage_max": 1.6711772456765175, "advantage_mean": 1.862645193639878e-08, "advantage_min": -0.806564062833786, "advantage_std": 0.8973346874117851, "completion_length": 2051.729217529297, "epoch": 0.15428571428571428, "grad_norm": 0.16736926138401031, "kl": 0.010089874267578125, "lambda_div_used": 0.5, "learning_rate": 9.230669076497687e-07, "loss": 0.0708, "reward": 0.21743118949234486, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.21743118949234486, "reward_after_std": 0.89733468554914, "reward_before_mean": 0.9405843168497086, "reward_before_std": 0.8591349758207798, "reward_change_max": 0.0, "reward_change_mean": -0.7231530882418156, "reward_change_min": -1.226291723549366, "reward_change_std": 0.5035424418747425, "reward_std": 0.8973347377032042, "rewards/cosine_scaled_reward": 0.12654214911162853, "rewards/format_reward": 0.687500013038516, "step": 135 }, { "advantage_max": 1.704109400510788, "advantage_mean": -1.6763806842678974e-08, "advantage_min": -0.7793970480561256, "advantage_std": 0.9227009601891041, "completion_length": 2986.1458892822266, "epoch": 0.15542857142857142, "grad_norm": 0.1529574692249298, "kl": 0.0151824951171875, "lambda_div_used": 0.5, "learning_rate": 9.213010742252327e-07, "loss": 0.0333, "reward": 0.06355301290750504, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.06355301290750504, "reward_after_std": 0.9227009415626526, "reward_before_mean": 0.6477234736084938, "reward_before_std": 0.9388607256114483, "reward_change_max": 0.006334125995635986, "reward_change_mean": -0.5841704942286015, "reward_change_min": -1.2264491878449917, "reward_change_std": 0.4949139221571386, "reward_std": 0.9227009601891041, "rewards/cosine_scaled_reward": 0.0842784009873867, "rewards/format_reward": 0.4791666753590107, "step": 136 }, { "advantage_max": 1.199564404785633, "advantage_mean": 1.738468857759301e-08, "advantage_min": -0.5005557537078857, "advantage_std": 0.6346891317516565, "completion_length": 3282.6458740234375, "epoch": 0.15657142857142858, "grad_norm": 0.11640360951423645, "kl": 0.0130615234375, "lambda_div_used": 0.5, "learning_rate": 9.195171441101668e-07, "loss": 0.0435, "reward": -0.4002666026353836, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.4002666026353836, "reward_after_std": 0.6346891317516565, "reward_before_mean": -0.1202960298396647, "reward_before_std": 0.6554977763444185, "reward_change_max": 0.001410163938999176, "reward_change_mean": -0.2799705620855093, "reward_change_min": -0.6201891824603081, "reward_change_std": 0.2587748169898987, "reward_std": 0.634689137339592, "rewards/cosine_scaled_reward": -0.18514801934361458, "rewards/format_reward": 0.25000000558793545, "step": 137 }, { "advantage_max": 1.3507948219776154, "advantage_mean": 2.4835269396561444e-09, "advantage_min": -0.49056804925203323, "advantage_std": 0.6830014102160931, "completion_length": 2713.395896911621, "epoch": 0.15771428571428572, "grad_norm": 0.1140730232000351, "kl": 0.0122528076171875, "lambda_div_used": 0.5, "learning_rate": 9.177152042508077e-07, "loss": 0.0291, "reward": -0.12246760074049234, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.12246760074049234, "reward_after_std": 0.6830014064908028, "reward_before_mean": 0.3673200160264969, "reward_before_std": 0.5412693619728088, "reward_change_max": 0.00021785497665405273, "reward_change_mean": -0.48978761956095695, "reward_change_min": -0.7547291815280914, "reward_change_std": 0.2954326942563057, "reward_std": 0.6830014288425446, "rewards/cosine_scaled_reward": -0.05592333839740604, "rewards/format_reward": 0.47916668094694614, "step": 138 }, { "advantage_max": 1.6214874014258385, "advantage_mean": 3.725290520506519e-09, "advantage_min": -0.5952094718813896, "advantage_std": 0.8399164713919163, "completion_length": 3305.8333740234375, "epoch": 0.15885714285714286, "grad_norm": 0.1574862152338028, "kl": 0.01418304443359375, "lambda_div_used": 0.5, "learning_rate": 9.158953424711624e-07, "loss": 0.0403, "reward": -0.3565631117671728, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.3565631117671728, "reward_after_std": 0.8399164713919163, "reward_before_mean": -0.1034796079620719, "reward_before_std": 0.8613853231072426, "reward_change_max": 0.0007909610867500305, "reward_change_mean": -0.2530834935605526, "reward_change_min": -0.65073337033391, "reward_change_std": 0.26929632388055325, "reward_std": 0.8399164862930775, "rewards/cosine_scaled_reward": -0.166323134675622, "rewards/format_reward": 0.22916666977107525, "step": 139 }, { "advantage_max": 1.4743750020861626, "advantage_mean": 3.725290298461914e-09, "advantage_min": -0.5049934312701225, "advantage_std": 0.7563006207346916, "completion_length": 3242.5833740234375, "epoch": 0.16, "grad_norm": 0.3012092411518097, "kl": 0.01766204833984375, "lambda_div_used": 0.5, "learning_rate": 9.140576474687263e-07, "loss": 0.0329, "reward": -0.19406265020370483, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.19406265020370483, "reward_after_std": 0.7563006244599819, "reward_before_mean": 0.20776143670082092, "reward_before_std": 0.6639534346759319, "reward_change_max": 0.0021727383136749268, "reward_change_mean": -0.40182408690452576, "reward_change_min": -0.7207726016640663, "reward_change_std": 0.28624863363802433, "reward_std": 0.7563006617128849, "rewards/cosine_scaled_reward": -0.03153595281764865, "rewards/format_reward": 0.27083333767950535, "step": 140 }, { "advantage_max": 1.5581751950085163, "advantage_mean": 1.4280279625467074e-08, "advantage_min": -0.7472601998597383, "advantage_std": 0.842759732156992, "completion_length": 2783.8126068115234, "epoch": 0.16114285714285714, "grad_norm": 0.14586445689201355, "kl": 0.01715850830078125, "lambda_div_used": 0.5, "learning_rate": 9.122022088101613e-07, "loss": 0.0588, "reward": -0.013555251061916351, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.013555251061916351, "reward_after_std": 0.842759732156992, "reward_before_mean": 0.5331528140231967, "reward_before_std": 0.8482963526621461, "reward_change_max": 0.0022571608424186707, "reward_change_mean": -0.5467080399394035, "reward_change_min": -1.014662615954876, "reward_change_std": 0.4375216653570533, "reward_std": 0.8427597507834435, "rewards/cosine_scaled_reward": -0.014673600438982248, "rewards/format_reward": 0.5625000093132257, "step": 141 }, { "advantage_max": 1.253233052790165, "advantage_mean": 1.8626450382086546e-09, "advantage_min": -0.611926406621933, "advantage_std": 0.6612989082932472, "completion_length": 2920.5625610351562, "epoch": 0.16228571428571428, "grad_norm": 0.11870459467172623, "kl": 0.014678955078125, "lambda_div_used": 0.5, "learning_rate": 9.103291169269299e-07, "loss": 0.0349, "reward": -0.07781748473644257, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.07781748473644257, "reward_after_std": 0.6612989269196987, "reward_before_mean": 0.468132134526968, "reward_before_std": 0.6124387122690678, "reward_change_max": 0.0016920417547225952, "reward_change_mean": -0.5459496257826686, "reward_change_min": -0.9166505560278893, "reward_change_std": 0.36288318363949656, "reward_std": 0.6612989380955696, "rewards/cosine_scaled_reward": -0.0576006043702364, "rewards/format_reward": 0.583333345130086, "step": 142 }, { "advantage_max": 1.2389843165874481, "advantage_mean": 1.6142925496342997e-08, "advantage_min": -0.6687506064772606, "advantage_std": 0.6784803830087185, "completion_length": 2588.562545776367, "epoch": 0.16342857142857142, "grad_norm": 0.21533679962158203, "kl": 0.01389312744140625, "lambda_div_used": 0.5, "learning_rate": 9.084384631108882e-07, "loss": 0.0792, "reward": -0.2079525962471962, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.2079525962471962, "reward_after_std": 0.6784803830087185, "reward_before_mean": 0.21878846874460578, "reward_before_std": 0.7084575332701206, "reward_change_max": 0.0006280243396759033, "reward_change_mean": -0.42674104776233435, "reward_change_min": -0.8186662420630455, "reward_change_std": 0.34944797586649656, "reward_std": 0.6784803867340088, "rewards/cosine_scaled_reward": -0.1406057756394148, "rewards/format_reward": 0.5000000149011612, "step": 143 }, { "advantage_max": 1.3775924891233444, "advantage_mean": 1.4901161637936866e-08, "advantage_min": -0.5184234380722046, "advantage_std": 0.7250764183700085, "completion_length": 3109.2292098999023, "epoch": 0.16457142857142856, "grad_norm": 0.12948386371135712, "kl": 0.01511383056640625, "lambda_div_used": 0.5, "learning_rate": 9.065303395098358e-07, "loss": -0.0036, "reward": -0.367202827706933, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.367202827706933, "reward_after_std": 0.7250763960182667, "reward_before_mean": -0.08332528173923492, "reward_before_std": 0.7478654459118843, "reward_change_max": 0.0013224631547927856, "reward_change_mean": -0.2838775431737304, "reward_change_min": -0.7693227715790272, "reward_change_std": 0.29384620860219, "reward_std": 0.7250764183700085, "rewards/cosine_scaled_reward": -0.20832931413315237, "rewards/format_reward": 0.33333334140479565, "step": 144 }, { "advantage_max": 1.552978865802288, "advantage_mean": 4.656612678788363e-09, "advantage_min": -0.5855300799012184, "advantage_std": 0.8020382151007652, "completion_length": 2332.750045776367, "epoch": 0.1657142857142857, "grad_norm": 0.10903175920248032, "kl": 0.014812469482421875, "lambda_div_used": 0.5, "learning_rate": 9.046048391230247e-07, "loss": 0.0034, "reward": 0.07218926632776856, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.07218926632776856, "reward_after_std": 0.8020382151007652, "reward_before_mean": 0.6919207079336047, "reward_before_std": 0.6887411586940289, "reward_change_max": 0.001665949821472168, "reward_change_mean": -0.619731426006183, "reward_change_min": -0.9985717423260212, "reward_change_std": 0.3817316296044737, "reward_std": 0.8020382300019264, "rewards/cosine_scaled_reward": 0.033460333943367004, "rewards/format_reward": 0.6250000037252903, "step": 145 }, { "advantage_max": 1.0583246350288391, "advantage_mean": 2.173086055545781e-08, "advantage_min": -0.4859006591141224, "advantage_std": 0.5605090521275997, "completion_length": 2781.2500762939453, "epoch": 0.16685714285714287, "grad_norm": 0.12265187501907349, "kl": 0.01261138916015625, "lambda_div_used": 0.5, "learning_rate": 9.026620557966279e-07, "loss": 0.0693, "reward": -0.36917710676789284, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.36917710676789284, "reward_after_std": 0.5605090446770191, "reward_before_mean": -0.03538452973589301, "reward_before_std": 0.5493589676916599, "reward_change_max": 0.0036334246397018433, "reward_change_mean": -0.3337925784289837, "reward_change_min": -0.6612095944583416, "reward_change_std": 0.26808320358395576, "reward_std": 0.56050905585289, "rewards/cosine_scaled_reward": -0.2676922781392932, "rewards/format_reward": 0.5000000093132257, "step": 146 }, { "advantage_max": 1.379384882748127, "advantage_mean": 2.110997909809953e-08, "advantage_min": -0.6559456661343575, "advantage_std": 0.7488440778106451, "completion_length": 3024.6458892822266, "epoch": 0.168, "grad_norm": 0.2351156324148178, "kl": 0.0204620361328125, "lambda_div_used": 0.5, "learning_rate": 9.007020842191634e-07, "loss": 0.0832, "reward": -0.27363124303519726, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.27363124303519726, "reward_after_std": 0.7488440666347742, "reward_before_mean": 0.08099634572863579, "reward_before_std": 0.813685305416584, "reward_change_max": 0.0024464577436447144, "reward_change_mean": -0.35462760739028454, "reward_change_min": -0.9023499675095081, "reward_change_std": 0.3578293425962329, "reward_std": 0.7488440815359354, "rewards/cosine_scaled_reward": -0.08450182341039181, "rewards/format_reward": 0.2500000037252903, "step": 147 }, { "advantage_max": 1.430250957608223, "advantage_mean": 2.483526828633842e-09, "advantage_min": -0.6970910802483559, "advantage_std": 0.7667691125534475, "completion_length": 2556.3958587646484, "epoch": 0.16914285714285715, "grad_norm": 0.11008276790380478, "kl": 0.017719268798828125, "lambda_div_used": 0.5, "learning_rate": 8.987250199168808e-07, "loss": 0.0022, "reward": -0.10380987264215946, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.10380987264215946, "reward_after_std": 0.7667691130191088, "reward_before_mean": 0.3883102908730507, "reward_before_std": 0.7692638714797795, "reward_change_max": 0.0, "reward_change_mean": -0.4921201700344682, "reward_change_min": -0.9062949493527412, "reward_change_std": 0.37739391159266233, "reward_std": 0.7667691316455603, "rewards/cosine_scaled_reward": -0.0870948564261198, "rewards/format_reward": 0.562500013038516, "step": 148 }, { "advantage_max": 1.5377417542040348, "advantage_mean": 1.6763807175745882e-08, "advantage_min": -0.7391314059495926, "advantage_std": 0.8271317277103662, "completion_length": 2969.500045776367, "epoch": 0.1702857142857143, "grad_norm": 0.13372498750686646, "kl": 0.01287078857421875, "lambda_div_used": 0.5, "learning_rate": 8.967309592491052e-07, "loss": 0.0276, "reward": -0.15246623707935214, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.15246623707935214, "reward_after_std": 0.8271317090839148, "reward_before_mean": 0.2821824178099632, "reward_before_std": 0.8760084733366966, "reward_change_max": 0.0019507110118865967, "reward_change_mean": -0.43464863393455744, "reward_change_min": -1.0092763751745224, "reward_change_std": 0.3935320507735014, "reward_std": 0.8271317090839148, "rewards/cosine_scaled_reward": -0.09849213063716888, "rewards/format_reward": 0.47916668094694614, "step": 149 }, { "advantage_max": 1.601058579981327, "advantage_mean": 8.69234451084111e-09, "advantage_min": -0.7131126075983047, "advantage_std": 0.8711162880063057, "completion_length": 3100.9792404174805, "epoch": 0.17142857142857143, "grad_norm": 0.1580471247434616, "kl": 0.0231170654296875, "lambda_div_used": 0.5, "learning_rate": 8.9471999940354e-07, "loss": 0.0167, "reward": -0.19248445704579353, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.19248445704579353, "reward_after_std": 0.8711162842810154, "reward_before_mean": 0.19730104506015778, "reward_before_std": 0.9518158473074436, "reward_change_max": 0.0012674406170845032, "reward_change_mean": -0.38978548534214497, "reward_change_min": -0.9178396500647068, "reward_change_std": 0.39312170818448067, "reward_std": 0.8711163178086281, "rewards/cosine_scaled_reward": -0.08884949050843716, "rewards/format_reward": 0.3750000037252903, "step": 150 }, { "advantage_max": 1.4950109869241714, "advantage_mean": 2.220446049250313e-16, "advantage_min": -0.6828820556402206, "advantage_std": 0.7891075238585472, "completion_length": 2729.9583435058594, "epoch": 0.17257142857142857, "grad_norm": 0.19314298033714294, "kl": 0.0204315185546875, "lambda_div_used": 0.5, "learning_rate": 8.926922383915315e-07, "loss": 0.036, "reward": 0.06169239804148674, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.06169239804148674, "reward_after_std": 0.7891075126826763, "reward_before_mean": 0.6801268644630909, "reward_before_std": 0.7161053195595741, "reward_change_max": 0.0, "reward_change_mean": -0.618434488773346, "reward_change_min": -1.1258440501987934, "reward_change_std": 0.4249352663755417, "reward_std": 0.7891075238585472, "rewards/cosine_scaled_reward": 0.05881343060173094, "rewards/format_reward": 0.5625000074505806, "step": 151 }, { "advantage_max": 0.912200003862381, "advantage_mean": 1.4280280014045132e-08, "advantage_min": -0.4642832688987255, "advantage_std": 0.49871931597590446, "completion_length": 2945.000011444092, "epoch": 0.1737142857142857, "grad_norm": 0.09074344485998154, "kl": 0.0227203369140625, "lambda_div_used": 0.5, "learning_rate": 8.906477750432903e-07, "loss": 0.0049, "reward": -0.3652635831385851, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.3652635831385851, "reward_after_std": 0.49871933087706566, "reward_before_mean": -0.0032742172479629517, "reward_before_std": 0.5042455215007067, "reward_change_max": 0.001100011169910431, "reward_change_mean": -0.36198936961591244, "reward_change_min": -0.7148860283195972, "reward_change_std": 0.2905337093397975, "reward_std": 0.49871933087706566, "rewards/cosine_scaled_reward": -0.15788711048662663, "rewards/format_reward": 0.3125, "step": 152 }, { "advantage_max": 1.5115925967693329, "advantage_mean": -1.2417633588057697e-09, "advantage_min": -0.5612894706428051, "advantage_std": 0.770535409450531, "completion_length": 2853.979217529297, "epoch": 0.17485714285714285, "grad_norm": 0.1524999886751175, "kl": 0.0258026123046875, "lambda_div_used": 0.5, "learning_rate": 8.88586709003076e-07, "loss": 0.0004, "reward": -0.2571018021553755, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.2571018021553755, "reward_after_std": 0.770535409450531, "reward_before_mean": 0.09108775481581688, "reward_before_std": 0.7091003078967333, "reward_change_max": 3.434717655181885e-05, "reward_change_mean": -0.34818957280367613, "reward_change_min": -0.7208801098167896, "reward_change_std": 0.2727233390323818, "reward_std": 0.7705354280769825, "rewards/cosine_scaled_reward": -0.14195612538605928, "rewards/format_reward": 0.37500000931322575, "step": 153 }, { "advantage_max": 1.5345972888171673, "advantage_mean": -6.829699139565548e-09, "advantage_min": -0.6860805973410606, "advantage_std": 0.8205587547272444, "completion_length": 3347.916717529297, "epoch": 0.176, "grad_norm": 0.15438421070575714, "kl": 0.014179229736328125, "lambda_div_used": 0.5, "learning_rate": 8.865091407243394e-07, "loss": 0.035, "reward": 0.020980832166969776, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.020980832166969776, "reward_after_std": 0.8205587510019541, "reward_before_mean": 0.6036945916712284, "reward_before_std": 0.787444993853569, "reward_change_max": 0.00190676748752594, "reward_change_mean": -0.5827137199230492, "reward_change_min": -1.0991328060626984, "reward_change_std": 0.45627478789538145, "reward_std": 0.8205587565898895, "rewards/cosine_scaled_reward": 0.07268060557544231, "rewards/format_reward": 0.4583333358168602, "step": 154 }, { "advantage_max": 1.808014616370201, "advantage_mean": 1.7384688910659918e-08, "advantage_min": -0.6731105744838715, "advantage_std": 0.9314933232963085, "completion_length": 2730.687515258789, "epoch": 0.17714285714285713, "grad_norm": 0.1630096137523651, "kl": 0.02112579345703125, "lambda_div_used": 0.5, "learning_rate": 8.844151714648274e-07, "loss": 0.0479, "reward": -0.0624679122120142, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.0624679122120142, "reward_after_std": 0.931493304669857, "reward_before_mean": 0.40027226135134697, "reward_before_std": 0.8790432922542095, "reward_change_max": 0.0011110901832580566, "reward_change_mean": -0.4627401642501354, "reward_change_min": -0.9882379546761513, "reward_change_std": 0.3674464877694845, "reward_std": 0.9314933083951473, "rewards/cosine_scaled_reward": -0.029030536767095327, "rewards/format_reward": 0.4583333395421505, "step": 155 }, { "advantage_max": 1.6551181003451347, "advantage_mean": 1.0554989271494009e-08, "advantage_min": -0.7165040969848633, "advantage_std": 0.8703439943492413, "completion_length": 3049.2291870117188, "epoch": 0.1782857142857143, "grad_norm": 0.25175604224205017, "kl": 0.017902374267578125, "lambda_div_used": 0.5, "learning_rate": 8.823049032816478e-07, "loss": 0.1118, "reward": -0.2424111724831164, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.2424111724831164, "reward_after_std": 0.8703440092504025, "reward_before_mean": 0.09557440504431725, "reward_before_std": 0.9049641638994217, "reward_change_max": 0.0012922286987304688, "reward_change_mean": -0.33798559941351414, "reward_change_min": -0.8197051659226418, "reward_change_std": 0.332876767963171, "reward_std": 0.8703440129756927, "rewards/cosine_scaled_reward": -0.06679612956941128, "rewards/format_reward": 0.22916667349636555, "step": 156 }, { "advantage_max": 1.05646251142025, "advantage_mean": 9.31322552411018e-09, "advantage_min": -0.4026567302644253, "advantage_std": 0.5398902297019958, "completion_length": 3296.625030517578, "epoch": 0.17942857142857144, "grad_norm": 0.11682058125734329, "kl": 0.0221710205078125, "lambda_div_used": 0.5, "learning_rate": 8.801784390262943e-07, "loss": 0.0258, "reward": -0.2805541264824569, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.2805541264824569, "reward_after_std": 0.539890231564641, "reward_before_mean": 0.1279155914671719, "reward_before_std": 0.4479395393282175, "reward_change_max": 0.0005399435758590698, "reward_change_mean": -0.4084697123616934, "reward_change_min": -0.6387323662638664, "reward_change_std": 0.24938340950757265, "reward_std": 0.5398902371525764, "rewards/cosine_scaled_reward": -0.11312553659081459, "rewards/format_reward": 0.3541666679084301, "step": 157 }, { "advantage_max": 1.5963420271873474, "advantage_mean": 6.208817349140361e-10, "advantage_min": -0.7781121879816055, "advantage_std": 0.8665140904486179, "completion_length": 3194.4375915527344, "epoch": 0.18057142857142858, "grad_norm": 0.2124539315700531, "kl": 0.0201416015625, "lambda_div_used": 0.5, "learning_rate": 8.780358823396352e-07, "loss": 0.0665, "reward": 0.07793341856449842, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.07793341856449842, "reward_after_std": 0.8665141090750694, "reward_before_mean": 0.6934919357299805, "reward_before_std": 0.8816309906542301, "reward_change_max": 0.004750244319438934, "reward_change_mean": -0.6155585404485464, "reward_change_min": -1.2228962555527687, "reward_change_std": 0.49037730880081654, "reward_std": 0.8665141351521015, "rewards/cosine_scaled_reward": 0.14882931299507618, "rewards/format_reward": 0.3958333432674408, "step": 158 }, { "advantage_max": 1.2864177525043488, "advantage_mean": 2.0489097030118586e-08, "advantage_min": -0.5405474305152893, "advantage_std": 0.6673975624144077, "completion_length": 3283.8541870117188, "epoch": 0.18171428571428572, "grad_norm": 0.15530923008918762, "kl": 0.0285491943359375, "lambda_div_used": 0.5, "learning_rate": 8.758773376468604e-07, "loss": 0.0649, "reward": -0.4083964992314577, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.4083964992314577, "reward_after_std": 0.6673975735902786, "reward_before_mean": -0.14629693608731031, "reward_before_std": 0.6638858169317245, "reward_change_max": 0.0001917034387588501, "reward_change_mean": -0.2620995473116636, "reward_change_min": -0.5487857535481453, "reward_change_std": 0.23622982390224934, "reward_std": 0.6673975922167301, "rewards/cosine_scaled_reward": -0.18773180805146694, "rewards/format_reward": 0.2291666753590107, "step": 159 }, { "advantage_max": 1.461052566766739, "advantage_mean": 8.692344288796505e-09, "advantage_min": -0.6089429929852486, "advantage_std": 0.7506822571158409, "completion_length": 2841.7708740234375, "epoch": 0.18285714285714286, "grad_norm": 0.2132018804550171, "kl": 0.02667999267578125, "lambda_div_used": 0.5, "learning_rate": 8.737029101523929e-07, "loss": 0.0835, "reward": -0.21343960147351027, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.21343960147351027, "reward_after_std": 0.7506822533905506, "reward_before_mean": 0.18066900130361319, "reward_before_std": 0.7000150382518768, "reward_change_max": 0.0030357539653778076, "reward_change_mean": -0.39410861022770405, "reward_change_min": -0.6988341324031353, "reward_change_std": 0.28755941800773144, "reward_std": 0.7506822720170021, "rewards/cosine_scaled_reward": -0.09716550912708044, "rewards/format_reward": 0.37500000186264515, "step": 160 }, { "advantage_max": 1.3866098821163177, "advantage_mean": 6.208816238917336e-10, "advantage_min": -0.647071972489357, "advantage_std": 0.7460334822535515, "completion_length": 2920.750015258789, "epoch": 0.184, "grad_norm": 0.16971638798713684, "kl": 0.028717041015625, "lambda_div_used": 0.5, "learning_rate": 8.715127058347614e-07, "loss": 0.0448, "reward": -0.035258321557193995, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.035258321557193995, "reward_after_std": 0.7460334822535515, "reward_before_mean": 0.519082885235548, "reward_before_std": 0.7315765358507633, "reward_change_max": 0.004067353904247284, "reward_change_mean": -0.5543412175029516, "reward_change_min": -1.0808284804224968, "reward_change_std": 0.428444167599082, "reward_std": 0.7460335120558739, "rewards/cosine_scaled_reward": 0.009541435167193413, "rewards/format_reward": 0.5000000149011612, "step": 161 }, { "advantage_max": 1.3734740167856216, "advantage_mean": 1.6142925329809543e-08, "advantage_min": -0.5869306847453117, "advantage_std": 0.7363872975111008, "completion_length": 3208.0208740234375, "epoch": 0.18514285714285714, "grad_norm": 0.2380620688199997, "kl": 0.0320892333984375, "lambda_div_used": 0.5, "learning_rate": 8.693068314414344e-07, "loss": 0.0282, "reward": -0.2969482094049454, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.2969482094049454, "reward_after_std": 0.7363873012363911, "reward_before_mean": 0.043372806161642075, "reward_before_std": 0.776290912181139, "reward_change_max": 0.0018128976225852966, "reward_change_mean": -0.3403210146352649, "reward_change_min": -0.8492111563682556, "reward_change_std": 0.3306410340592265, "reward_std": 0.7363873347640038, "rewards/cosine_scaled_reward": -0.13456360204145312, "rewards/format_reward": 0.3125000074505806, "step": 162 }, { "advantage_max": 1.3436215445399284, "advantage_mean": -9.934108091691485e-09, "advantage_min": -0.7455865144729614, "advantage_std": 0.7360293306410313, "completion_length": 2572.437545776367, "epoch": 0.18628571428571428, "grad_norm": 0.13132235407829285, "kl": 0.0261383056640625, "lambda_div_used": 0.5, "learning_rate": 8.670853944836176e-07, "loss": 0.0077, "reward": 0.09643177315592766, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.09643177315592766, "reward_after_std": 0.7360293306410313, "reward_before_mean": 0.7683794125914574, "reward_before_std": 0.7229588590562344, "reward_change_max": 0.0005083903670310974, "reward_change_mean": -0.6719476291909814, "reward_change_min": -1.1431674733757973, "reward_change_std": 0.46183205861598253, "reward_std": 0.7360293418169022, "rewards/cosine_scaled_reward": 0.1029396834783256, "rewards/format_reward": 0.5625000074505806, "step": 163 }, { "advantage_max": 1.4671280607581139, "advantage_mean": 2.5300930905913788e-08, "advantage_min": -0.6642183251678944, "advantage_std": 0.7612626627087593, "completion_length": 2702.4583587646484, "epoch": 0.18742857142857142, "grad_norm": 0.140818789601326, "kl": 0.02923583984375, "lambda_div_used": 0.5, "learning_rate": 8.648485032310144e-07, "loss": -0.0184, "reward": 0.013984514982439578, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.013984514982439578, "reward_after_std": 0.7612626627087593, "reward_before_mean": 0.5976662468165159, "reward_before_std": 0.6583688072860241, "reward_change_max": 0.0011242106556892395, "reward_change_mean": -0.5836816895753145, "reward_change_min": -0.9547205977141857, "reward_change_std": 0.38507608138024807, "reward_std": 0.7612626999616623, "rewards/cosine_scaled_reward": 0.04883309965953231, "rewards/format_reward": 0.5000000055879354, "step": 164 }, { "advantage_max": 1.6381709426641464, "advantage_mean": 2.2351742401394148e-08, "advantage_min": -0.6009313315153122, "advantage_std": 0.850938007235527, "completion_length": 3325.479248046875, "epoch": 0.18857142857142858, "grad_norm": 0.2902795076370239, "kl": 0.0403594970703125, "lambda_div_used": 0.5, "learning_rate": 8.625962667065487e-07, "loss": 0.0991, "reward": -0.3041955577209592, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.3041955577209592, "reward_after_std": 0.8509380016475916, "reward_before_mean": -0.013059889897704124, "reward_before_std": 0.8602779507637024, "reward_change_max": 0.0005616173148155212, "reward_change_mean": -0.29113566502928734, "reward_change_min": -0.7681114263832569, "reward_change_std": 0.2988483002409339, "reward_std": 0.8509380277246237, "rewards/cosine_scaled_reward": -0.12111327843740582, "rewards/format_reward": 0.22916666977107525, "step": 165 }, { "advantage_max": 1.4696976244449615, "advantage_mean": -3.104408785592483e-09, "advantage_min": -0.6678441911935806, "advantage_std": 0.7880504056811333, "completion_length": 3159.5625610351562, "epoch": 0.18971428571428572, "grad_norm": 0.1653311848640442, "kl": 0.027587890625, "lambda_div_used": 0.5, "learning_rate": 8.603287946810513e-07, "loss": 0.0269, "reward": -0.23447778564877808, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.23447778564877808, "reward_after_std": 0.7880504205822945, "reward_before_mean": 0.14275594055652618, "reward_before_std": 0.8306904956698418, "reward_change_max": 0.0057152509689331055, "reward_change_mean": -0.3772337343543768, "reward_change_min": -0.8500324971973896, "reward_change_std": 0.36009896732866764, "reward_std": 0.7880504615604877, "rewards/cosine_scaled_reward": -0.08487202413380146, "rewards/format_reward": 0.3125000111758709, "step": 166 }, { "advantage_max": 1.4749844521284103, "advantage_mean": 1.1102230246251565e-16, "advantage_min": -0.6022154316306114, "advantage_std": 0.7785226926207542, "completion_length": 2614.416717529297, "epoch": 0.19085714285714286, "grad_norm": 0.19377173483371735, "kl": 0.025634765625, "lambda_div_used": 0.5, "learning_rate": 8.580461976679099e-07, "loss": 0.0438, "reward": -0.12003645673394203, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.12003645673394203, "reward_after_std": 0.7785226963460445, "reward_before_mean": 0.3457608614116907, "reward_before_std": 0.7535338178277016, "reward_change_max": 0.0003101229667663574, "reward_change_mean": -0.4657973274588585, "reward_change_min": -0.9503576084971428, "reward_change_std": 0.36564162001013756, "reward_std": 0.7785227224230766, "rewards/cosine_scaled_reward": -0.16045291302725673, "rewards/format_reward": 0.6666666734963655, "step": 167 }, { "advantage_max": 1.6648427098989487, "advantage_mean": 2.483526828633842e-09, "advantage_min": -0.8055135011672974, "advantage_std": 0.8785872720181942, "completion_length": 2906.2708740234375, "epoch": 0.192, "grad_norm": 0.16217641532421112, "kl": 0.0312652587890625, "lambda_div_used": 0.5, "learning_rate": 8.557485869176825e-07, "loss": 0.0122, "reward": 0.11544627044349909, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.11544627044349909, "reward_after_std": 0.8785872720181942, "reward_before_mean": 0.7536546923220158, "reward_before_std": 0.829089242964983, "reward_change_max": 0.000681951642036438, "reward_change_mean": -0.6382084004580975, "reward_change_min": -1.1904875002801418, "reward_change_std": 0.4658457115292549, "reward_std": 0.8785872757434845, "rewards/cosine_scaled_reward": 0.09557732753455639, "rewards/format_reward": 0.5625000111758709, "step": 168 }, { "advantage_max": 1.8562040403485298, "advantage_mean": -1.738468857759301e-08, "advantage_min": -0.6270218789577484, "advantage_std": 0.9351372793316841, "completion_length": 2535.291732788086, "epoch": 0.19314285714285714, "grad_norm": 0.16913841664791107, "kl": 0.04036712646484375, "lambda_div_used": 0.5, "learning_rate": 8.534360744126753e-07, "loss": 0.035, "reward": 0.2956652529537678, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.2956652529537678, "reward_after_std": 0.9351372867822647, "reward_before_mean": 1.0529553515370935, "reward_before_std": 0.701921995729208, "reward_change_max": 0.0003681108355522156, "reward_change_mean": -0.7572900727391243, "reward_change_min": -1.1275853663682938, "reward_change_std": 0.4388972017914057, "reward_std": 0.9351373203098774, "rewards/cosine_scaled_reward": 0.24522765818983316, "rewards/format_reward": 0.5625000018626451, "step": 169 }, { "advantage_max": 1.4476363211870193, "advantage_mean": 4.346172199909404e-09, "advantage_min": -0.675914853811264, "advantage_std": 0.7612299062311649, "completion_length": 2383.458396911621, "epoch": 0.19428571428571428, "grad_norm": 0.14560545980930328, "kl": 0.02826690673828125, "lambda_div_used": 0.5, "learning_rate": 8.511087728614862e-07, "loss": 0.0487, "reward": -0.05682095978409052, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.05682095978409052, "reward_after_std": 0.7612299248576164, "reward_before_mean": 0.47058921796269715, "reward_before_std": 0.7103168535977602, "reward_change_max": 0.0016023367643356323, "reward_change_mean": -0.5274101868271828, "reward_change_min": -1.002557884901762, "reward_change_std": 0.38785367645323277, "reward_std": 0.761229932308197, "rewards/cosine_scaled_reward": -0.06678873766213655, "rewards/format_reward": 0.6041666846722364, "step": 170 }, { "advantage_max": 1.6320666521787643, "advantage_mean": -3.7252904094842165e-09, "advantage_min": -0.6522354558110237, "advantage_std": 0.8446485474705696, "completion_length": 2998.3333587646484, "epoch": 0.19542857142857142, "grad_norm": 0.1538252979516983, "kl": 0.0330810546875, "lambda_div_used": 0.5, "learning_rate": 8.487667956935087e-07, "loss": 0.0287, "reward": -0.11476925574243069, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.11476925574243069, "reward_after_std": 0.8446485660970211, "reward_before_mean": 0.33872489258646965, "reward_before_std": 0.7975562885403633, "reward_change_max": 0.0007885992527008057, "reward_change_mean": -0.4534941161982715, "reward_change_min": -0.8434424623847008, "reward_change_std": 0.3448071158491075, "reward_std": 0.8446485921740532, "rewards/cosine_scaled_reward": -0.028554232325404882, "rewards/format_reward": 0.39583334140479565, "step": 171 }, { "advantage_max": 1.4919284507632256, "advantage_mean": -1.1175871117430347e-08, "advantage_min": -0.6620048135519028, "advantage_std": 0.7910696156322956, "completion_length": 2916.354217529297, "epoch": 0.19657142857142856, "grad_norm": 0.13567769527435303, "kl": 0.0409088134765625, "lambda_div_used": 0.5, "learning_rate": 8.464102570534061e-07, "loss": 0.0138, "reward": -0.02154737338423729, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.02154737338423729, "reward_after_std": 0.7910696379840374, "reward_before_mean": 0.5326739009469748, "reward_before_std": 0.7421072386205196, "reward_change_max": 0.0, "reward_change_mean": -0.5542212873697281, "reward_change_min": -0.9738561362028122, "reward_change_std": 0.4017046205699444, "reward_std": 0.7910696603357792, "rewards/cosine_scaled_reward": 0.04758694767951965, "rewards/format_reward": 0.43750000558793545, "step": 172 }, { "advantage_max": 1.25401646271348, "advantage_mean": -6.208816794028849e-10, "advantage_min": -0.4338163882493973, "advantage_std": 0.6396212875843048, "completion_length": 2764.0416831970215, "epoch": 0.1977142857142857, "grad_norm": 0.10223139822483063, "kl": 0.049713134765625, "lambda_div_used": 0.5, "learning_rate": 8.440392717955475e-07, "loss": 0.0166, "reward": -0.36996013298630714, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.36996013298630714, "reward_after_std": 0.6396212838590145, "reward_before_mean": -0.07614278933033347, "reward_before_std": 0.5837610512971878, "reward_change_max": 0.0012845918536186218, "reward_change_mean": -0.29381735995411873, "reward_change_min": -0.5957952998578548, "reward_change_std": 0.2213090155273676, "reward_std": 0.6396212950348854, "rewards/cosine_scaled_reward": -0.24640473164618015, "rewards/format_reward": 0.4166666716337204, "step": 173 }, { "advantage_max": 1.5859555639326572, "advantage_mean": 1.676380712023473e-08, "advantage_min": -0.6190120317041874, "advantage_std": 0.8216863200068474, "completion_length": 2392.7291870117188, "epoch": 0.19885714285714284, "grad_norm": 0.14171089231967926, "kl": 0.0450439453125, "lambda_div_used": 0.5, "learning_rate": 8.416539554784089e-07, "loss": 0.0041, "reward": -0.0845435168594122, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.0845435168594122, "reward_after_std": 0.8216863293200731, "reward_before_mean": 0.400400809943676, "reward_before_std": 0.7570674475282431, "reward_change_max": 0.0009220615029335022, "reward_change_mean": -0.484944318421185, "reward_change_min": -0.9629802815616131, "reward_change_std": 0.35314718913286924, "reward_std": 0.8216863460838795, "rewards/cosine_scaled_reward": -0.1018829345703125, "rewards/format_reward": 0.6041666772216558, "step": 174 }, { "advantage_max": 1.261248379945755, "advantage_mean": 9.934108091691485e-09, "advantage_min": -0.6661129705607891, "advantage_std": 0.683073777705431, "completion_length": 2795.50004196167, "epoch": 0.2, "grad_norm": 0.13047651946544647, "kl": 0.0414886474609375, "lambda_div_used": 0.5, "learning_rate": 8.392544243589427e-07, "loss": 0.0349, "reward": -0.044038325548172, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.044038325548172, "reward_after_std": 0.6830737814307213, "reward_before_mean": 0.5270445011556149, "reward_before_std": 0.6684056259691715, "reward_change_max": 0.0006987974047660828, "reward_change_mean": -0.5710828024893999, "reward_change_min": -0.9975682608783245, "reward_change_std": 0.4042843095958233, "reward_std": 0.6830737888813019, "rewards/cosine_scaled_reward": 0.013522235676646233, "rewards/format_reward": 0.5000000111758709, "step": 175 }, { "advantage_max": 1.607403114438057, "advantage_mean": 1.3038516322172455e-08, "advantage_min": -0.6778121329843998, "advantage_std": 0.8579734489321709, "completion_length": 2719.270896911621, "epoch": 0.20114285714285715, "grad_norm": 0.2701166570186615, "kl": 0.0418701171875, "lambda_div_used": 0.5, "learning_rate": 8.368407953869103e-07, "loss": 0.0373, "reward": -0.06047849915921688, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.06047849915921688, "reward_after_std": 0.8579734601080418, "reward_before_mean": 0.4298029188066721, "reward_before_std": 0.8629395943135023, "reward_change_max": 0.0006796494126319885, "reward_change_mean": -0.4902814142405987, "reward_change_min": -1.097361333668232, "reward_change_std": 0.427899737842381, "reward_std": 0.8579734787344933, "rewards/cosine_scaled_reward": -0.0455152140930295, "rewards/format_reward": 0.520833333954215, "step": 176 }, { "advantage_max": 1.4356028512120247, "advantage_mean": 1.8626452047421083e-09, "advantage_min": -0.5858288630843163, "advantage_std": 0.7616856321692467, "completion_length": 3033.062545776367, "epoch": 0.2022857142857143, "grad_norm": 0.22180777788162231, "kl": 0.05340576171875, "lambda_div_used": 0.5, "learning_rate": 8.344131861991828e-07, "loss": 0.0459, "reward": -0.2571666557341814, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.2571666557341814, "reward_after_std": 0.7616856321692467, "reward_before_mean": 0.10084620304405689, "reward_before_std": 0.7724887356162071, "reward_change_max": 0.0013925060629844666, "reward_change_mean": -0.3580128587782383, "reward_change_min": -0.75266994535923, "reward_change_std": 0.3098360765725374, "reward_std": 0.7616856396198273, "rewards/cosine_scaled_reward": -0.19957689847797155, "rewards/format_reward": 0.5000000093132257, "step": 177 }, { "advantage_max": 1.3237878009676933, "advantage_mean": -5.587935614226325e-09, "advantage_min": -0.5824862495064735, "advantage_std": 0.706440394744277, "completion_length": 2700.687530517578, "epoch": 0.20342857142857143, "grad_norm": 0.21339182555675507, "kl": 0.058441162109375, "lambda_div_used": 0.5, "learning_rate": 8.319717151140072e-07, "loss": 0.0295, "reward": -0.28272207267582417, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.28272207267582417, "reward_after_std": 0.7064403779804707, "reward_before_mean": 0.06567367631942034, "reward_before_std": 0.7269596587866545, "reward_change_max": 0.003658100962638855, "reward_change_mean": -0.34839577320963144, "reward_change_min": -0.721770029515028, "reward_change_std": 0.3038365198299289, "reward_std": 0.7064404059201479, "rewards/cosine_scaled_reward": -0.14424649812281132, "rewards/format_reward": 0.35416667349636555, "step": 178 }, { "advantage_max": 1.487779911607504, "advantage_mean": 1.3659398279131096e-08, "advantage_min": -0.4699557423591614, "advantage_std": 0.7494055908173323, "completion_length": 2953.3333740234375, "epoch": 0.20457142857142857, "grad_norm": 0.37815913558006287, "kl": 0.0502777099609375, "lambda_div_used": 0.5, "learning_rate": 8.295165011252396e-07, "loss": 0.0936, "reward": -0.32715226151049137, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.32715226151049137, "reward_after_std": 0.7494055908173323, "reward_before_mean": -0.028430516831576824, "reward_before_std": 0.675062196329236, "reward_change_max": 0.0010721608996391296, "reward_change_mean": -0.2987217428162694, "reward_change_min": -0.5488623008131981, "reward_change_std": 0.22199713252484798, "reward_std": 0.7494056057184935, "rewards/cosine_scaled_reward": -0.17046526400372386, "rewards/format_reward": 0.31250000186264515, "step": 179 }, { "advantage_max": 1.5573591962456703, "advantage_mean": 1.8626452602532595e-09, "advantage_min": -0.6234206072986126, "advantage_std": 0.8115277662873268, "completion_length": 2144.208366394043, "epoch": 0.2057142857142857, "grad_norm": 0.1629016101360321, "kl": 0.0549774169921875, "lambda_div_used": 0.5, "learning_rate": 8.270476638965461e-07, "loss": 0.0502, "reward": 0.01475525926798582, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.01475525926798582, "reward_after_std": 0.8115277662873268, "reward_before_mean": 0.5810065679252148, "reward_before_std": 0.7341162748634815, "reward_change_max": 0.0006464123725891113, "reward_change_mean": -0.5662512928247452, "reward_change_min": -1.0189911015331745, "reward_change_std": 0.38147035613656044, "reward_std": 0.8115277960896492, "rewards/cosine_scaled_reward": -0.04283006116747856, "rewards/format_reward": 0.6666666679084301, "step": 180 }, { "advantage_max": 1.4340339675545692, "advantage_mean": -3.1044086745701804e-09, "advantage_min": -0.578693151473999, "advantage_std": 0.7451658025383949, "completion_length": 2921.541732788086, "epoch": 0.20685714285714285, "grad_norm": 0.20287711918354034, "kl": 0.05859375, "lambda_div_used": 0.5, "learning_rate": 8.245653237555705e-07, "loss": 0.0512, "reward": -0.1003317330032587, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.1003317330032587, "reward_after_std": 0.7451657950878143, "reward_before_mean": 0.39000407606363297, "reward_before_std": 0.6769847068935633, "reward_change_max": 0.0010607540607452393, "reward_change_mean": -0.49033585004508495, "reward_change_min": -0.9029561765491962, "reward_change_std": 0.3469520937651396, "reward_std": 0.7451658211648464, "rewards/cosine_scaled_reward": -0.002914630458690226, "rewards/format_reward": 0.3958333395421505, "step": 181 }, { "advantage_max": 1.394393615424633, "advantage_mean": 1.8626452047421083e-09, "advantage_min": -0.7053156830370426, "advantage_std": 0.7596918232738972, "completion_length": 2541.437545776367, "epoch": 0.208, "grad_norm": 0.16592997312545776, "kl": 0.0494232177734375, "lambda_div_used": 0.5, "learning_rate": 8.220696016880687e-07, "loss": -0.0005, "reward": -0.022533608600497246, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.022533608600497246, "reward_after_std": 0.759691808372736, "reward_before_mean": 0.5406998414546251, "reward_before_std": 0.769719572737813, "reward_change_max": 0.003669723868370056, "reward_change_mean": -0.5632334500551224, "reward_change_min": -1.0908655300736427, "reward_change_std": 0.4336672220379114, "reward_std": 0.7596918120980263, "rewards/cosine_scaled_reward": -0.021316751837730408, "rewards/format_reward": 0.5833333469927311, "step": 182 }, { "advantage_max": 1.5372853577136993, "advantage_mean": -3.7252904094842165e-09, "advantage_min": -0.6862039528787136, "advantage_std": 0.8002040609717369, "completion_length": 2360.8958892822266, "epoch": 0.20914285714285713, "grad_norm": 0.2165931612253189, "kl": 0.0673828125, "lambda_div_used": 0.5, "learning_rate": 8.195606193320136e-07, "loss": 0.0391, "reward": 0.06708026025444269, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.06708026025444269, "reward_after_std": 0.8002040907740593, "reward_before_mean": 0.6775889825075865, "reward_before_std": 0.7066179402172565, "reward_change_max": 0.0, "reward_change_mean": -0.6105087604373693, "reward_change_min": -0.9791091829538345, "reward_change_std": 0.3944264929741621, "reward_std": 0.8002041131258011, "rewards/cosine_scaled_reward": 0.005461166147142649, "rewards/format_reward": 0.666666679084301, "step": 183 }, { "advantage_max": 1.4124026373028755, "advantage_mean": -1.2417630257388623e-09, "advantage_min": -0.5000558458268642, "advantage_std": 0.729760505259037, "completion_length": 2818.354202270508, "epoch": 0.2102857142857143, "grad_norm": 0.25655215978622437, "kl": 0.072998046875, "lambda_div_used": 0.5, "learning_rate": 8.170384989716657e-07, "loss": 0.0362, "reward": -0.35524504724889994, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.35524504724889994, "reward_after_std": 0.7297605089843273, "reward_before_mean": -0.08101257588714361, "reward_before_std": 0.7060116622596979, "reward_change_max": 0.0005974471569061279, "reward_change_mean": -0.2742324732244015, "reward_change_min": -0.57624626532197, "reward_change_std": 0.22511245217174292, "reward_std": 0.7297605387866497, "rewards/cosine_scaled_reward": -0.21758961910381913, "rewards/format_reward": 0.3541666716337204, "step": 184 }, { "advantage_max": 1.020853940397501, "advantage_mean": 2.2351742234860694e-08, "advantage_min": -0.42880232259631157, "advantage_std": 0.54007213935256, "completion_length": 2751.0833740234375, "epoch": 0.21142857142857144, "grad_norm": 0.15834684669971466, "kl": 0.0701141357421875, "lambda_div_used": 0.5, "learning_rate": 8.145033635316128e-07, "loss": 0.0207, "reward": -0.35438499972224236, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.35438499972224236, "reward_after_std": 0.5400721412152052, "reward_before_mean": -0.006038234569132328, "reward_before_std": 0.505929496139288, "reward_change_max": 0.0008821934461593628, "reward_change_mean": -0.3483467437326908, "reward_change_min": -0.6647264584898949, "reward_change_std": 0.2687018224969506, "reward_std": 0.540072163566947, "rewards/cosine_scaled_reward": -0.23218578845262527, "rewards/format_reward": 0.4583333432674408, "step": 185 }, { "advantage_max": 1.0780138969421387, "advantage_mean": 1.6142925773898753e-08, "advantage_min": -0.6679522022604942, "advantage_std": 0.5969401746988297, "completion_length": 3050.812545776367, "epoch": 0.21257142857142858, "grad_norm": 0.16625259816646576, "kl": 0.090362548828125, "lambda_div_used": 0.5, "learning_rate": 8.119553365707802e-07, "loss": 0.0018, "reward": -0.0695158913731575, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.0695158913731575, "reward_after_std": 0.5969401746988297, "reward_before_mean": 0.5104820095002651, "reward_before_std": 0.6082560420036316, "reward_change_max": 0.0033091604709625244, "reward_change_mean": -0.5799979045987129, "reward_change_min": -0.9459269754588604, "reward_change_std": 0.4035164900124073, "reward_std": 0.5969401746988297, "rewards/cosine_scaled_reward": 0.06774100661277771, "rewards/format_reward": 0.375, "step": 186 }, { "advantage_max": 1.2345889136195183, "advantage_mean": 6.208817904251873e-10, "advantage_min": -0.5896378178149462, "advantage_std": 0.6629064604640007, "completion_length": 2472.312545776367, "epoch": 0.21371428571428572, "grad_norm": 0.2719042897224426, "kl": 0.0833282470703125, "lambda_div_used": 0.5, "learning_rate": 8.093945422764069e-07, "loss": 0.0412, "reward": -0.1858983300626278, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.1858983300626278, "reward_after_std": 0.6629064828157425, "reward_before_mean": 0.2663856241852045, "reward_before_std": 0.6563301542773843, "reward_change_max": 0.004294879734516144, "reward_change_mean": -0.4522839467972517, "reward_change_min": -0.8460966721177101, "reward_change_std": 0.35423574782907963, "reward_std": 0.6629064977169037, "rewards/cosine_scaled_reward": -0.12722386233508587, "rewards/format_reward": 0.5208333488553762, "step": 187 }, { "advantage_max": 0.6440011262893677, "advantage_mean": 3.973643181165443e-08, "advantage_min": -0.3368586078286171, "advantage_std": 0.3568432927131653, "completion_length": 3203.229217529297, "epoch": 0.21485714285714286, "grad_norm": 0.16227319836616516, "kl": 0.101318359375, "lambda_div_used": 0.5, "learning_rate": 8.068211054579943e-07, "loss": 0.0267, "reward": -0.6190324202179909, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.6190324202179909, "reward_after_std": 0.3568432927131653, "reward_before_mean": -0.4256970062851906, "reward_before_std": 0.3935478888452053, "reward_change_max": 0.0018765032291412354, "reward_change_mean": -0.1933353953063488, "reward_change_min": -0.425104808062315, "reward_change_std": 0.18632372096180916, "reward_std": 0.3568432964384556, "rewards/cosine_scaled_reward": -0.24409850127995014, "rewards/format_reward": 0.06250000186264515, "step": 188 }, { "advantage_max": 1.5661092102527618, "advantage_mean": -3.7252904094842165e-09, "advantage_min": -0.6449663117527962, "advantage_std": 0.8198821656405926, "completion_length": 2691.125045776367, "epoch": 0.216, "grad_norm": 0.3333768844604492, "kl": 0.105682373046875, "lambda_div_used": 0.5, "learning_rate": 8.04235151541222e-07, "loss": 0.0368, "reward": -0.056773873046040535, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.056773873046040535, "reward_after_std": 0.8198821656405926, "reward_before_mean": 0.45091634057462215, "reward_before_std": 0.7725614793598652, "reward_change_max": 0.0009219944477081299, "reward_change_mean": -0.507690217345953, "reward_change_min": -1.0040309727191925, "reward_change_std": 0.3715070113539696, "reward_std": 0.8198821842670441, "rewards/cosine_scaled_reward": -0.07662516506388783, "rewards/format_reward": 0.6041666734963655, "step": 189 }, { "advantage_max": 1.2479316182434559, "advantage_mean": -1.4280279958533981e-08, "advantage_min": -0.5517519414424896, "advantage_std": 0.6586221437901258, "completion_length": 2229.1458740234375, "epoch": 0.21714285714285714, "grad_norm": 0.15027928352355957, "kl": 0.092498779296875, "lambda_div_used": 0.5, "learning_rate": 8.01636806561836e-07, "loss": 0.0206, "reward": -0.07967419736087322, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.07967419736087322, "reward_after_std": 0.6586221437901258, "reward_before_mean": 0.46196483448147774, "reward_before_std": 0.5788971818983555, "reward_change_max": 0.00156412273645401, "reward_change_mean": -0.5416390751488507, "reward_change_min": -0.9397790506482124, "reward_change_std": 0.37341161631047726, "reward_std": 0.6586221754550934, "rewards/cosine_scaled_reward": -0.05026757996529341, "rewards/format_reward": 0.5625000111758709, "step": 190 }, { "advantage_max": 1.4184372648596764, "advantage_mean": 1.3659398057086491e-08, "advantage_min": -0.6462213322520256, "advantage_std": 0.7566835880279541, "completion_length": 2572.3333740234375, "epoch": 0.21828571428571428, "grad_norm": 0.2690849304199219, "kl": 0.1207275390625, "lambda_div_used": 0.5, "learning_rate": 7.990261971595048e-07, "loss": 0.0306, "reward": -0.1155730914324522, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.1155730914324522, "reward_after_std": 0.7566835880279541, "reward_before_mean": 0.362278588116169, "reward_before_std": 0.7694622874259949, "reward_change_max": 0.002707548439502716, "reward_change_mean": -0.4778516888618469, "reward_change_min": -0.9092966057360172, "reward_change_std": 0.36542451940476894, "reward_std": 0.7566836401820183, "rewards/cosine_scaled_reward": -0.037610700353980064, "rewards/format_reward": 0.43750000558793545, "step": 191 }, { "advantage_max": 1.475701242685318, "advantage_mean": 2.2972624080797033e-08, "advantage_min": -0.7762894034385681, "advantage_std": 0.8053352851420641, "completion_length": 3140.791748046875, "epoch": 0.21942857142857142, "grad_norm": 0.29571542143821716, "kl": 0.12591552734375, "lambda_div_used": 0.5, "learning_rate": 7.964034505716476e-07, "loss": 0.0412, "reward": -0.18194560799747705, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.18194560799747705, "reward_after_std": 0.8053352888673544, "reward_before_mean": 0.23417986929416656, "reward_before_std": 0.8801908064633608, "reward_change_max": 0.004965953528881073, "reward_change_mean": -0.416125463321805, "reward_change_min": -0.905586700886488, "reward_change_std": 0.39055878994986415, "reward_std": 0.8053353205323219, "rewards/cosine_scaled_reward": -0.08082673698663712, "rewards/format_reward": 0.39583334513008595, "step": 192 }, { "advantage_max": 1.4207666739821434, "advantage_mean": 6.208817127095756e-09, "advantage_min": -0.5451913326978683, "advantage_std": 0.7405758053064346, "completion_length": 3119.3333892822266, "epoch": 0.22057142857142858, "grad_norm": 0.21056059002876282, "kl": 0.11077880859375, "lambda_div_used": 0.5, "learning_rate": 7.93768694627233e-07, "loss": 0.0192, "reward": -0.32524373196065426, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.32524373196065426, "reward_after_std": 0.7405758053064346, "reward_before_mean": -0.019589267438277602, "reward_before_std": 0.7454305961728096, "reward_change_max": 0.0016882121562957764, "reward_change_mean": -0.30565447825938463, "reward_change_min": -0.7668191641569138, "reward_change_std": 0.2845957148820162, "reward_std": 0.7405758276581764, "rewards/cosine_scaled_reward": -0.17646130733191967, "rewards/format_reward": 0.33333334140479565, "step": 193 }, { "advantage_max": 1.6386329382658005, "advantage_mean": -1.6653345369377348e-16, "advantage_min": -0.9344875812530518, "advantage_std": 0.9019115082919598, "completion_length": 2661.25008392334, "epoch": 0.22171428571428572, "grad_norm": 0.630655825138092, "kl": 0.100555419921875, "lambda_div_used": 0.5, "learning_rate": 7.911220577405484e-07, "loss": 0.0725, "reward": 0.040607784409075975, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.040607784409075975, "reward_after_std": 0.901911523193121, "reward_before_mean": 0.6173225231468678, "reward_before_std": 0.9771861061453819, "reward_change_max": 0.0, "reward_change_mean": -0.5767147559672594, "reward_change_min": -1.134931169450283, "reward_change_std": 0.48344396241009235, "reward_std": 0.9019115567207336, "rewards/cosine_scaled_reward": 0.11074459226801991, "rewards/format_reward": 0.39583334885537624, "step": 194 }, { "advantage_max": 1.5321245640516281, "advantage_mean": 1.1796752963366686e-08, "advantage_min": -0.6784209460020065, "advantage_std": 0.8166707828640938, "completion_length": 2987.3333740234375, "epoch": 0.22285714285714286, "grad_norm": 0.33836349844932556, "kl": 0.1348876953125, "lambda_div_used": 0.5, "learning_rate": 7.884636689049422e-07, "loss": 0.0389, "reward": -0.22936188150197268, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.22936188150197268, "reward_after_std": 0.8166707642376423, "reward_before_mean": 0.14030634611845016, "reward_before_std": 0.8599527664482594, "reward_change_max": 0.0, "reward_change_mean": -0.36966823507100344, "reward_change_min": -0.8392911180853844, "reward_change_std": 0.3513228427618742, "reward_std": 0.8166707865893841, "rewards/cosine_scaled_reward": -0.06526349484920502, "rewards/format_reward": 0.2708333432674408, "step": 195 }, { "advantage_max": 1.4621029123663902, "advantage_mean": 3.104408619059029e-09, "advantage_min": -0.6472999192774296, "advantage_std": 0.7688349261879921, "completion_length": 3229.4583740234375, "epoch": 0.224, "grad_norm": 0.25638365745544434, "kl": 0.16192626953125, "lambda_div_used": 0.5, "learning_rate": 7.857936576865356e-07, "loss": 0.0169, "reward": -0.1755674695596099, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.1755674695596099, "reward_after_std": 0.768834937363863, "reward_before_mean": 0.25635097920894623, "reward_before_std": 0.7566685378551483, "reward_change_max": 0.0, "reward_change_mean": -0.4319184827618301, "reward_change_min": -0.8243024758994579, "reward_change_std": 0.3426487520337105, "reward_std": 0.768834937363863, "rewards/cosine_scaled_reward": -0.02807450108230114, "rewards/format_reward": 0.3125000037252903, "step": 196 }, { "advantage_max": 1.9627781957387924, "advantage_mean": -7.450580818968433e-09, "advantage_min": -0.8632603287696838, "advantage_std": 1.062688797712326, "completion_length": 2410.500057220459, "epoch": 0.22514285714285714, "grad_norm": 0.4595416188240051, "kl": 0.1593017578125, "lambda_div_used": 0.5, "learning_rate": 7.831121542179086e-07, "loss": 0.026, "reward": 0.029495095717720687, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.029495095717720687, "reward_after_std": 1.062688760459423, "reward_before_mean": 0.5363987050950527, "reward_before_std": 1.1507313326001167, "reward_change_max": 0.009369902312755585, "reward_change_mean": -0.5069035869091749, "reward_change_min": -1.2301556020975113, "reward_change_std": 0.5150115732103586, "reward_std": 1.0626887753605843, "rewards/cosine_scaled_reward": 0.07028267765417695, "rewards/format_reward": 0.3958333358168602, "step": 197 }, { "advantage_max": 1.6594363003969193, "advantage_mean": -1.1486311984887365e-08, "advantage_min": -0.7000625804066658, "advantage_std": 0.8589765839278698, "completion_length": 2688.1250610351562, "epoch": 0.22628571428571428, "grad_norm": 0.4216662645339966, "kl": 0.1824951171875, "lambda_div_used": 0.5, "learning_rate": 7.804192891917571e-07, "loss": 0.0484, "reward": -0.014411035925149918, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.014411035925149918, "reward_after_std": 0.8589765951037407, "reward_before_mean": 0.5093722362071276, "reward_before_std": 0.8033815808594227, "reward_change_max": 0.0009694769978523254, "reward_change_mean": -0.5237833168357611, "reward_change_min": -0.876764677464962, "reward_change_std": 0.3557278939988464, "reward_std": 0.8589766137301922, "rewards/cosine_scaled_reward": 0.004686110652983189, "rewards/format_reward": 0.500000013038516, "step": 198 }, { "advantage_max": 1.248784601688385, "advantage_mean": -1.8626449826975033e-09, "advantage_min": -0.5464016161859035, "advantage_std": 0.6605355255305767, "completion_length": 2801.3333740234375, "epoch": 0.22742857142857142, "grad_norm": 0.2286648154258728, "kl": 0.186309814453125, "lambda_div_used": 0.5, "learning_rate": 7.777151938545235e-07, "loss": 0.0171, "reward": -0.35405838675796986, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.35405838675796986, "reward_after_std": 0.6605355255305767, "reward_before_mean": -0.04154867585748434, "reward_before_std": 0.6669703125953674, "reward_change_max": 0.0020766928791999817, "reward_change_mean": -0.31250972487032413, "reward_change_min": -0.7029464244842529, "reward_change_std": 0.28745912201702595, "reward_std": 0.6605355553328991, "rewards/cosine_scaled_reward": -0.1770243365317583, "rewards/format_reward": 0.31250001303851604, "step": 199 }, { "advantage_max": 1.615936852991581, "advantage_mean": -1.2417634809303024e-08, "advantage_min": -0.5708847790956497, "advantage_std": 0.8185114078223705, "completion_length": 2348.6042251586914, "epoch": 0.22857142857142856, "grad_norm": 0.318811297416687, "kl": 0.1622314453125, "lambda_div_used": 0.5, "learning_rate": 7.75e-07, "loss": 0.029, "reward": 0.001345137134194374, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.001345137134194374, "reward_after_std": 0.8185114376246929, "reward_before_mean": 0.5466999299824238, "reward_before_std": 0.6768320240080357, "reward_change_max": 0.006076157093048096, "reward_change_mean": -0.5453548207879066, "reward_change_min": -0.9213692545890808, "reward_change_std": 0.35560460947453976, "reward_std": 0.8185114562511444, "rewards/cosine_scaled_reward": 0.012933290250657592, "rewards/format_reward": 0.5208333358168602, "step": 200 }, { "advantage_max": 1.8653700202703476, "advantage_mean": -2.23517424569053e-08, "advantage_min": -0.987945843487978, "advantage_std": 1.0161167159676552, "completion_length": 2228.375068664551, "epoch": 0.2297142857142857, "grad_norm": 0.2828601896762848, "kl": 0.148040771484375, "lambda_div_used": 0.5, "learning_rate": 7.72273839962904e-07, "loss": 0.0173, "reward": 0.3448567260056734, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.3448567260056734, "reward_after_std": 1.0161167457699776, "reward_before_mean": 1.136616634670645, "reward_before_std": 1.0092586781829596, "reward_change_max": 0.002497762441635132, "reward_change_mean": -0.7917599156498909, "reward_change_min": -1.446341522037983, "reward_change_std": 0.5879297144711018, "reward_std": 1.0161167681217194, "rewards/cosine_scaled_reward": 0.25580831430852413, "rewards/format_reward": 0.6250000093132257, "step": 201 }, { "advantage_max": 1.617574080824852, "advantage_mean": -8.692344288796505e-09, "advantage_min": -0.6772880628705025, "advantage_std": 0.8404804095625877, "completion_length": 2325.7083587646484, "epoch": 0.23085714285714284, "grad_norm": 0.2522527277469635, "kl": 0.1925048828125, "lambda_div_used": 0.5, "learning_rate": 7.695368466124296e-07, "loss": 0.0241, "reward": 0.24290845077484846, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.24290845077484846, "reward_after_std": 0.840480413287878, "reward_before_mean": 0.9944449234753847, "reward_before_std": 0.6817901022732258, "reward_change_max": 0.0, "reward_change_mean": -0.7515364792197943, "reward_change_min": -1.2624378241598606, "reward_change_std": 0.4821481630206108, "reward_std": 0.8404804356396198, "rewards/cosine_scaled_reward": 0.21597244683653116, "rewards/format_reward": 0.5625000018626451, "step": 202 }, { "advantage_max": 1.8188269883394241, "advantage_mean": 1.1175870895385742e-08, "advantage_min": -0.6880287379026413, "advantage_std": 0.9322909750044346, "completion_length": 2903.1041870117188, "epoch": 0.232, "grad_norm": 0.4888186752796173, "kl": 0.216949462890625, "lambda_div_used": 0.5, "learning_rate": 7.667891533457718e-07, "loss": 0.054, "reward": -0.16240698844194412, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.16240698844194412, "reward_after_std": 0.9322909824550152, "reward_before_mean": 0.21877515967935324, "reward_before_std": 0.9038714617490768, "reward_change_max": 0.0020834803581237793, "reward_change_mean": -0.3811821388080716, "reward_change_min": -0.7407228797674179, "reward_change_std": 0.312047659419477, "reward_std": 0.9322910197079182, "rewards/cosine_scaled_reward": -0.05727909505367279, "rewards/format_reward": 0.33333334140479565, "step": 203 }, { "advantage_max": 1.2213104590773582, "advantage_mean": -4.346171922353648e-09, "advantage_min": -0.5388765670359135, "advantage_std": 0.6483286060392857, "completion_length": 2404.8959197998047, "epoch": 0.23314285714285715, "grad_norm": 0.3916052579879761, "kl": 0.20050048828125, "lambda_div_used": 0.5, "learning_rate": 7.640308940816239e-07, "loss": 0.0321, "reward": -0.05752340517938137, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.05752340517938137, "reward_after_std": 0.6483286060392857, "reward_before_mean": 0.49461894296109676, "reward_before_std": 0.5494524594396353, "reward_change_max": 0.0004562288522720337, "reward_change_mean": -0.552142359316349, "reward_change_min": -0.9264970943331718, "reward_change_std": 0.3694954924285412, "reward_std": 0.6483286134898663, "rewards/cosine_scaled_reward": -0.09644054435193539, "rewards/format_reward": 0.6875000074505806, "step": 204 }, { "advantage_max": 1.9968280270695686, "advantage_mean": -1.5522043428362053e-08, "advantage_min": -0.82924984395504, "advantage_std": 1.0424656011164188, "completion_length": 2681.1875915527344, "epoch": 0.2342857142857143, "grad_norm": 0.5587193369865417, "kl": 0.202392578125, "lambda_div_used": 0.5, "learning_rate": 7.612622032536507e-07, "loss": 0.0405, "reward": 0.03933172253891826, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.03933172253891826, "reward_after_std": 1.0424656197428703, "reward_before_mean": 0.5593788512051105, "reward_before_std": 1.0272934138774872, "reward_change_max": 0.0, "reward_change_mean": -0.5200471375137568, "reward_change_min": -1.0572655908763409, "reward_change_std": 0.4100263640284538, "reward_std": 1.042465664446354, "rewards/cosine_scaled_reward": 0.04010608239332214, "rewards/format_reward": 0.479166679084301, "step": 205 }, { "advantage_max": 1.7176525816321373, "advantage_mean": 4.967053990334591e-09, "advantage_min": -0.6258624605834484, "advantage_std": 0.8893463686108589, "completion_length": 3179.979217529297, "epoch": 0.23542857142857143, "grad_norm": 0.5262120962142944, "kl": 0.217041015625, "lambda_div_used": 0.5, "learning_rate": 7.584832158039378e-07, "loss": 0.0088, "reward": -0.18126259616110474, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.18126259616110474, "reward_after_std": 0.8893463760614395, "reward_before_mean": 0.1952662207186222, "reward_before_std": 0.8674014620482922, "reward_change_max": 0.002380073070526123, "reward_change_mean": -0.3765288144350052, "reward_change_min": -0.8658742196857929, "reward_change_std": 0.3221265822649002, "reward_std": 0.889346394687891, "rewards/cosine_scaled_reward": -0.10028356406837702, "rewards/format_reward": 0.3958333395421505, "step": 206 }, { "advantage_max": 1.5335121899843216, "advantage_mean": 2.359350592673337e-08, "advantage_min": -0.5418790131807327, "advantage_std": 0.7710942663252354, "completion_length": 2872.229263305664, "epoch": 0.23657142857142857, "grad_norm": 0.4609461724758148, "kl": 0.242431640625, "lambda_div_used": 0.5, "learning_rate": 7.556940671764124e-07, "loss": 0.0595, "reward": -0.2480186834000051, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.2480186834000051, "reward_after_std": 0.7710942775011063, "reward_before_mean": 0.09909067209810019, "reward_before_std": 0.6730157136917114, "reward_change_max": 0.0010094791650772095, "reward_change_mean": -0.34710934944450855, "reward_change_min": -0.6180264130234718, "reward_change_std": 0.24715251475572586, "reward_std": 0.7710942812263966, "rewards/cosine_scaled_reward": -0.2212880039587617, "rewards/format_reward": 0.5416666753590107, "step": 207 }, { "advantage_max": 1.6003206372261047, "advantage_mean": 1.2417634698280722e-09, "advantage_min": -0.631665613502264, "advantage_std": 0.8309444338083267, "completion_length": 2605.604202270508, "epoch": 0.2377142857142857, "grad_norm": 0.4859123229980469, "kl": 0.222259521484375, "lambda_div_used": 0.5, "learning_rate": 7.528948933102438e-07, "loss": 0.0133, "reward": -0.15951420087367296, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.15951420087367296, "reward_after_std": 0.8309444487094879, "reward_before_mean": 0.25432649441063404, "reward_before_std": 0.7960464023053646, "reward_change_max": 0.0005858913064002991, "reward_change_mean": -0.41384069016203284, "reward_change_min": -0.7964263036847115, "reward_change_std": 0.3161437623202801, "reward_std": 0.8309444785118103, "rewards/cosine_scaled_reward": -0.1124201025813818, "rewards/format_reward": 0.4791666753590107, "step": 208 }, { "advantage_max": 1.5546401739120483, "advantage_mean": -5.587935947293232e-09, "advantage_min": -0.7405244670808315, "advantage_std": 0.8403762653470039, "completion_length": 2657.208396911621, "epoch": 0.23885714285714285, "grad_norm": 0.29709237813949585, "kl": 0.2550048828125, "lambda_div_used": 0.5, "learning_rate": 7.500858306332172e-07, "loss": 0.0441, "reward": 0.06591091491281986, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.06591091491281986, "reward_after_std": 0.8403762653470039, "reward_before_mean": 0.6802605744451284, "reward_before_std": 0.8452886939048767, "reward_change_max": 0.0011561810970306396, "reward_change_mean": -0.614349715411663, "reward_change_min": -1.1940862014889717, "reward_change_std": 0.46805957332253456, "reward_std": 0.8403762951493263, "rewards/cosine_scaled_reward": -0.0036197155714035034, "rewards/format_reward": 0.6875000037252903, "step": 209 }, { "advantage_max": 1.196301095187664, "advantage_mean": -1.2417632477834672e-09, "advantage_min": -0.5151055417954922, "advantage_std": 0.6200075708329678, "completion_length": 2680.166717529297, "epoch": 0.24, "grad_norm": 0.3059697151184082, "kl": 0.2135009765625, "lambda_div_used": 0.5, "learning_rate": 7.472670160550848e-07, "loss": 0.0306, "reward": -0.061095981509424746, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.061095981509424746, "reward_after_std": 0.6200075745582581, "reward_before_mean": 0.5075158753897995, "reward_before_std": 0.50667629763484, "reward_change_max": 0.00011307001113891602, "reward_change_mean": -0.5686118816956878, "reward_change_min": -0.9131542295217514, "reward_change_std": 0.34807233698666096, "reward_std": 0.620007585734129, "rewards/cosine_scaled_reward": 0.014174612239003181, "rewards/format_reward": 0.47916667349636555, "step": 210 }, { "advantage_max": 1.583703152835369, "advantage_mean": 1.4901161637936866e-08, "advantage_min": -0.7270540446043015, "advantage_std": 0.837357334792614, "completion_length": 2362.187530517578, "epoch": 0.24114285714285713, "grad_norm": 0.5771478414535522, "kl": 0.2373046875, "lambda_div_used": 0.5, "learning_rate": 7.444385869608921e-07, "loss": -0.0055, "reward": -0.05148214101791382, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.05148214101791382, "reward_after_std": 0.8373573496937752, "reward_before_mean": 0.45062290877103806, "reward_before_std": 0.8202326558530331, "reward_change_max": 0.0023512914776802063, "reward_change_mean": -0.502105032093823, "reward_change_min": -0.9305754974484444, "reward_change_std": 0.37797669507563114, "reward_std": 0.8373573534190655, "rewards/cosine_scaled_reward": -0.014271877706050873, "rewards/format_reward": 0.47916668094694614, "step": 211 }, { "advantage_max": 1.3947479017078876, "advantage_mean": 1.490116185998147e-08, "advantage_min": -0.6110854707658291, "advantage_std": 0.7231886181980371, "completion_length": 2476.9375610351562, "epoch": 0.2422857142857143, "grad_norm": 0.6602047681808472, "kl": 0.28125, "lambda_div_used": 0.5, "learning_rate": 7.416006812042827e-07, "loss": -0.0038, "reward": -0.12894845008850098, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.12894845008850098, "reward_after_std": 0.7231886237859726, "reward_before_mean": 0.3463945370167494, "reward_before_std": 0.65158936008811, "reward_change_max": 0.000197581946849823, "reward_change_mean": -0.4753429926931858, "reward_change_min": -0.8183762915432453, "reward_change_std": 0.320674704387784, "reward_std": 0.7231886312365532, "rewards/cosine_scaled_reward": -0.08721940265968442, "rewards/format_reward": 0.520833345130086, "step": 212 }, { "advantage_max": 1.5491738989949226, "advantage_mean": -1.2417633588057697e-09, "advantage_min": -0.6663748882710934, "advantage_std": 0.8115720618516207, "completion_length": 2468.4792251586914, "epoch": 0.24342857142857144, "grad_norm": 0.38668930530548096, "kl": 0.28509521484375, "lambda_div_used": 0.5, "learning_rate": 7.387534371007797e-07, "loss": 0.0502, "reward": -0.054109593853354454, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.054109593853354454, "reward_after_std": 0.8115720748901367, "reward_before_mean": 0.45440296456217766, "reward_before_std": 0.7600329406559467, "reward_change_max": 0.0024536699056625366, "reward_change_mean": -0.5085125498007983, "reward_change_min": -0.983875211328268, "reward_change_std": 0.37908973544836044, "reward_std": 0.8115721084177494, "rewards/cosine_scaled_reward": -0.07488186378031969, "rewards/format_reward": 0.6041666753590107, "step": 213 }, { "advantage_max": 1.6328135281801224, "advantage_mean": -4.346171922353648e-09, "advantage_min": -0.7386883497238159, "advantage_std": 0.8717365637421608, "completion_length": 2732.2500610351562, "epoch": 0.24457142857142858, "grad_norm": 0.7322311997413635, "kl": 0.23431396484375, "lambda_div_used": 0.5, "learning_rate": 7.358969934210438e-07, "loss": 0.0818, "reward": -0.05645147990435362, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.05645147990435362, "reward_after_std": 0.8717365935444832, "reward_before_mean": 0.442460672929883, "reward_before_std": 0.8829019628465176, "reward_change_max": 0.0, "reward_change_mean": -0.49891215190291405, "reward_change_min": -1.0390591099858284, "reward_change_std": 0.4195642340928316, "reward_std": 0.8717366270720959, "rewards/cosine_scaled_reward": -0.049603000516071916, "rewards/format_reward": 0.5416666734963655, "step": 214 }, { "advantage_max": 1.150866337120533, "advantage_mean": 4.967053879312289e-09, "advantage_min": -0.5373395159840584, "advantage_std": 0.612909123301506, "completion_length": 2315.812515258789, "epoch": 0.24571428571428572, "grad_norm": 0.26189979910850525, "kl": 0.209259033203125, "lambda_div_used": 0.5, "learning_rate": 7.330314893841101e-07, "loss": 0.0336, "reward": -0.1552600208669901, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.1552600208669901, "reward_after_std": 0.6129091084003448, "reward_before_mean": 0.33281449880450964, "reward_before_std": 0.574024710804224, "reward_change_max": 0.00011177361011505127, "reward_change_mean": -0.48807452619075775, "reward_change_min": -0.8746481277048588, "reward_change_std": 0.33652111142873764, "reward_std": 0.6129091084003448, "rewards/cosine_scaled_reward": -0.16692609898746014, "rewards/format_reward": 0.6666666772216558, "step": 215 }, { "advantage_max": 1.7821224480867386, "advantage_mean": -1.1175871117430347e-08, "advantage_min": -0.7689832858741283, "advantage_std": 0.9388571158051491, "completion_length": 2343.37508392334, "epoch": 0.24685714285714286, "grad_norm": 0.5186208486557007, "kl": 0.25201416015625, "lambda_div_used": 0.5, "learning_rate": 7.301570646506027e-07, "loss": 0.0608, "reward": 0.16585935093462467, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.16585935093462467, "reward_after_std": 0.9388571009039879, "reward_before_mean": 0.8218697644770145, "reward_before_std": 0.8792456723749638, "reward_change_max": 0.0, "reward_change_mean": -0.6560104191303253, "reward_change_min": -1.2858059257268906, "reward_change_std": 0.4733603745698929, "reward_std": 0.9388571158051491, "rewards/cosine_scaled_reward": 0.004684882238507271, "rewards/format_reward": 0.8125000223517418, "step": 216 }, { "advantage_max": 1.7403410822153091, "advantage_mean": -5.5879355587151736e-09, "advantage_min": -0.6725828759372234, "advantage_std": 0.9263965599238873, "completion_length": 2844.8750762939453, "epoch": 0.248, "grad_norm": 0.47382375597953796, "kl": 0.3302001953125, "lambda_div_used": 0.5, "learning_rate": 7.27273859315928e-07, "loss": 0.0358, "reward": -0.06096999440342188, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.06096999440342188, "reward_after_std": 0.9263965785503387, "reward_before_mean": 0.4143219366669655, "reward_before_std": 0.9496859908103943, "reward_change_max": 0.001133456826210022, "reward_change_mean": -0.47529190964996815, "reward_change_min": -1.12648393958807, "reward_change_std": 0.4290233626961708, "reward_std": 0.9263966307044029, "rewards/cosine_scaled_reward": -0.03242238308303058, "rewards/format_reward": 0.47916667349636555, "step": 217 }, { "advantage_max": 1.569477766752243, "advantage_mean": -2.220446049250313e-16, "advantage_min": -0.5702780596911907, "advantage_std": 0.8123864158987999, "completion_length": 2503.3333587646484, "epoch": 0.24914285714285714, "grad_norm": 0.25727400183677673, "kl": 0.233642578125, "lambda_div_used": 0.5, "learning_rate": 7.243820139034464e-07, "loss": 0.0315, "reward": -0.2071865415200591, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.2071865415200591, "reward_after_std": 0.8123864233493805, "reward_before_mean": 0.1725800707936287, "reward_before_std": 0.785517480224371, "reward_change_max": 0.002965576946735382, "reward_change_mean": -0.3797666160389781, "reward_change_min": -0.8266499191522598, "reward_change_std": 0.30547447595745325, "reward_std": 0.8123864307999611, "rewards/cosine_scaled_reward": -0.11162663483992219, "rewards/format_reward": 0.3958333358168602, "step": 218 }, { "advantage_max": 1.4346892908215523, "advantage_mean": 1.6142925107764938e-08, "advantage_min": -0.6690775826573372, "advantage_std": 0.7566139325499535, "completion_length": 2354.4792404174805, "epoch": 0.2502857142857143, "grad_norm": 0.38650956749916077, "kl": 0.282867431640625, "lambda_div_used": 0.5, "learning_rate": 7.214816693576234e-07, "loss": 0.0173, "reward": 0.017466269433498383, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.017466269433498383, "reward_after_std": 0.7566139176487923, "reward_before_mean": 0.6082219113595784, "reward_before_std": 0.6809872947633266, "reward_change_max": 0.00339333713054657, "reward_change_mean": -0.5907555720768869, "reward_change_min": -1.0609471648931503, "reward_change_std": 0.4097218685783446, "reward_std": 0.7566139437258244, "rewards/cosine_scaled_reward": -0.029222410172224045, "rewards/format_reward": 0.6666666753590107, "step": 219 }, { "advantage_max": 0.955601155757904, "advantage_mean": 1.0554989549049765e-08, "advantage_min": -0.40321509912610054, "advantage_std": 0.50310243293643, "completion_length": 2792.3333740234375, "epoch": 0.25142857142857145, "grad_norm": 0.41683459281921387, "kl": 0.364990234375, "lambda_div_used": 0.5, "learning_rate": 7.185729670371604e-07, "loss": 0.031, "reward": -0.43255934678018093, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.43255934678018093, "reward_after_std": 0.5031024254858494, "reward_before_mean": -0.139401210937649, "reward_before_std": 0.47662991285324097, "reward_change_max": 0.0028152763843536377, "reward_change_mean": -0.2931581400334835, "reward_change_min": -0.542736854404211, "reward_change_std": 0.22856017015874386, "reward_std": 0.5031024366617203, "rewards/cosine_scaled_reward": -0.26761728897690773, "rewards/format_reward": 0.39583333767950535, "step": 220 }, { "advantage_max": 1.7276756018400192, "advantage_mean": -4.967053768289986e-09, "advantage_min": -0.8304614424705505, "advantage_std": 0.9121339544653893, "completion_length": 2324.187545776367, "epoch": 0.25257142857142856, "grad_norm": 0.6268060207366943, "kl": 0.3243408203125, "lambda_div_used": 0.5, "learning_rate": 7.156560487081051e-07, "loss": 0.0542, "reward": 0.11553898081183434, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.11553898081183434, "reward_after_std": 0.9121339917182922, "reward_before_mean": 0.7398845301941037, "reward_before_std": 0.8802719376981258, "reward_change_max": 0.0006899982690811157, "reward_change_mean": -0.62434555683285, "reward_change_min": -1.1505605317652225, "reward_change_std": 0.4582773372530937, "reward_std": 0.9121340066194534, "rewards/cosine_scaled_reward": 0.04702560231089592, "rewards/format_reward": 0.6458333469927311, "step": 221 }, { "advantage_max": 1.26497058942914, "advantage_mean": -1.1796752907855534e-08, "advantage_min": -0.5657484494149685, "advantage_std": 0.6593565940856934, "completion_length": 2305.333366394043, "epoch": 0.2537142857142857, "grad_norm": 0.3454340398311615, "kl": 0.3505859375, "lambda_div_used": 0.5, "learning_rate": 7.127310565369415e-07, "loss": 0.0358, "reward": 0.044890944845974445, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.044890944845974445, "reward_after_std": 0.6593566089868546, "reward_before_mean": 0.6905238628387451, "reward_before_std": 0.5432622786611319, "reward_change_max": 0.00029180198907852173, "reward_change_mean": -0.645632941275835, "reward_change_min": -1.049218151718378, "reward_change_std": 0.39333911798894405, "reward_std": 0.6593566127121449, "rewards/cosine_scaled_reward": 0.022345258854329586, "rewards/format_reward": 0.645833333954215, "step": 222 }, { "advantage_max": 1.798129253089428, "advantage_mean": 4.967053768289986e-09, "advantage_min": -0.8689031004905701, "advantage_std": 0.950947355479002, "completion_length": 2518.166732788086, "epoch": 0.25485714285714284, "grad_norm": 0.621035635471344, "kl": 0.40380859375, "lambda_div_used": 0.5, "learning_rate": 7.097981330836616e-07, "loss": 0.0301, "reward": 0.011060059070587158, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.011060059070587158, "reward_after_std": 0.9509473778307438, "reward_before_mean": 0.5413750989828259, "reward_before_std": 0.9515966884791851, "reward_change_max": 0.0010984092950820923, "reward_change_mean": -0.5303150387480855, "reward_change_min": -0.9885171167552471, "reward_change_std": 0.4150323858484626, "reward_std": 0.9509474150836468, "rewards/cosine_scaled_reward": -0.02097912272438407, "rewards/format_reward": 0.5833333488553762, "step": 223 }, { "advantage_max": 1.782409906387329, "advantage_mean": 2.4835267176115394e-09, "advantage_min": -0.6734658181667328, "advantage_std": 0.9314799420535564, "completion_length": 2908.7084045410156, "epoch": 0.256, "grad_norm": 0.5879162549972534, "kl": 0.42041015625, "lambda_div_used": 0.5, "learning_rate": 7.068574212948169e-07, "loss": 0.0696, "reward": -0.10128612630069256, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.10128612630069256, "reward_after_std": 0.9314799383282661, "reward_before_mean": 0.3314627211075276, "reward_before_std": 0.9180979616940022, "reward_change_max": 0.0, "reward_change_mean": -0.4327488373965025, "reward_change_min": -0.9870041385293007, "reward_change_std": 0.37257966212928295, "reward_std": 0.9314799644052982, "rewards/cosine_scaled_reward": -0.09468531236052513, "rewards/format_reward": 0.520833345130086, "step": 224 }, { "advantage_max": 0.9428818374872208, "advantage_mean": 1.2417638028949796e-09, "advantage_min": -0.5349617823958397, "advantage_std": 0.5179965812712908, "completion_length": 3046.437545776367, "epoch": 0.2571428571428571, "grad_norm": 0.8561449646949768, "kl": 0.4515380859375, "lambda_div_used": 0.5, "learning_rate": 7.039090644965509e-07, "loss": 0.0212, "reward": -0.3602433856576681, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.3602433856576681, "reward_after_std": 0.5179965812712908, "reward_before_mean": -0.0025209784507751465, "reward_before_std": 0.5427715508267283, "reward_change_max": 0.0024244189262390137, "reward_change_mean": -0.35772242583334446, "reward_change_min": -0.6620992906391621, "reward_change_std": 0.28696852759458125, "reward_std": 0.5179965924471617, "rewards/cosine_scaled_reward": -0.18876048736274242, "rewards/format_reward": 0.37500000931322575, "step": 225 }, { "advantage_max": 1.444564439356327, "advantage_mean": -4.967053546245381e-09, "advantage_min": -0.5781576111912727, "advantage_std": 0.7491228319704533, "completion_length": 2677.2083740234375, "epoch": 0.2582857142857143, "grad_norm": 0.5025551915168762, "kl": 0.4019775390625, "lambda_div_used": 0.5, "learning_rate": 7.009532063876148e-07, "loss": 0.0154, "reward": 0.07985361525788903, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.07985361525788903, "reward_after_std": 0.7491228580474854, "reward_before_mean": 0.725711066275835, "reward_before_std": 0.6195245683193207, "reward_change_max": 8.346140384674072e-05, "reward_change_mean": -0.6458574496209621, "reward_change_min": -1.0521031320095062, "reward_change_std": 0.40322335436940193, "reward_std": 0.7491228729486465, "rewards/cosine_scaled_reward": 0.07118885964155197, "rewards/format_reward": 0.583333333954215, "step": 226 }, { "advantage_max": 1.6421142667531967, "advantage_mean": -4.346171977864799e-09, "advantage_min": -0.7257067114114761, "advantage_std": 0.858584251254797, "completion_length": 2422.666702270508, "epoch": 0.25942857142857145, "grad_norm": 0.6199707388877869, "kl": 0.35296630859375, "lambda_div_used": 0.5, "learning_rate": 6.979899910323624e-07, "loss": 0.008, "reward": -0.09044338576495647, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.09044338576495647, "reward_after_std": 0.858584251254797, "reward_before_mean": 0.3791947956196964, "reward_before_std": 0.8184922076761723, "reward_change_max": 0.0029807984828948975, "reward_change_mean": -0.4696381874382496, "reward_change_min": -0.9254156686365604, "reward_change_std": 0.3720104694366455, "reward_std": 0.8585842587053776, "rewards/cosine_scaled_reward": -0.13331927731633186, "rewards/format_reward": 0.6458333507180214, "step": 227 }, { "advantage_max": 1.3848458677530289, "advantage_mean": -1.862645149230957e-09, "advantage_min": -0.7087922766804695, "advantage_std": 0.7344786264002323, "completion_length": 2385.395866394043, "epoch": 0.26057142857142856, "grad_norm": 0.30610209703445435, "kl": 0.3045654296875, "lambda_div_used": 0.5, "learning_rate": 6.950195628537299e-07, "loss": 0.0351, "reward": -0.0021043140441179276, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.0021043140441179276, "reward_after_std": 0.7344786264002323, "reward_before_mean": 0.5830968823283911, "reward_before_std": 0.6933562718331814, "reward_change_max": 0.0014819428324699402, "reward_change_mean": -0.585201189853251, "reward_change_min": -1.0121977366507053, "reward_change_std": 0.4014543369412422, "reward_std": 0.7344786338508129, "rewards/cosine_scaled_reward": 0.02071509137749672, "rewards/format_reward": 0.5416666734963655, "step": 228 }, { "advantage_max": 1.376700833439827, "advantage_mean": 1.3038516877283968e-08, "advantage_min": -0.6517984047532082, "advantage_std": 0.731338307261467, "completion_length": 2883.5209350585938, "epoch": 0.26171428571428573, "grad_norm": 0.6094762682914734, "kl": 0.408447265625, "lambda_div_used": 0.5, "learning_rate": 6.920420666261961e-07, "loss": 0.0149, "reward": -0.10475949943065643, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.10475949943065643, "reward_after_std": 0.7313383035361767, "reward_before_mean": 0.39478362910449505, "reward_before_std": 0.7087099254131317, "reward_change_max": 3.308802843093872e-05, "reward_change_mean": -0.49954311922192574, "reward_change_min": -0.9209417179226875, "reward_change_std": 0.38150350376963615, "reward_std": 0.7313383109867573, "rewards/cosine_scaled_reward": -0.031774863600730896, "rewards/format_reward": 0.4583333469927311, "step": 229 }, { "advantage_max": 1.37582515925169, "advantage_mean": -3.725290520506519e-09, "advantage_min": -0.5348953269422054, "advantage_std": 0.7181179635226727, "completion_length": 3124.541717529297, "epoch": 0.26285714285714284, "grad_norm": 0.5313109755516052, "kl": 0.381103515625, "lambda_div_used": 0.5, "learning_rate": 6.890576474687263e-07, "loss": 0.0481, "reward": -0.29289132729172707, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.29289132729172707, "reward_after_std": 0.7181179486215115, "reward_before_mean": 0.049733877182006836, "reward_before_std": 0.7073988057672977, "reward_change_max": 0.0, "reward_change_mean": -0.342625193297863, "reward_change_min": -0.7525230571627617, "reward_change_std": 0.287032725289464, "reward_std": 0.718117967247963, "rewards/cosine_scaled_reward": -0.15221640653908253, "rewards/format_reward": 0.3541666679084301, "step": 230 }, { "advantage_max": 1.051845483481884, "advantage_mean": 2.3593505704688766e-08, "advantage_min": -0.5571013279259205, "advantage_std": 0.5643906779587269, "completion_length": 2762.500045776367, "epoch": 0.264, "grad_norm": 0.33200061321258545, "kl": 0.290496826171875, "lambda_div_used": 0.5, "learning_rate": 6.860664508377001e-07, "loss": 0.0513, "reward": -0.24098680447787046, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.24098680447787046, "reward_after_std": 0.5643906742334366, "reward_before_mean": 0.20136728300713003, "reward_before_std": 0.5432005859911442, "reward_change_max": 0.0009815171360969543, "reward_change_mean": -0.4423540476709604, "reward_change_min": -0.7840287238359451, "reward_change_std": 0.31915647722780704, "reward_std": 0.5643906779587269, "rewards/cosine_scaled_reward": -0.11806636117398739, "rewards/format_reward": 0.43750000558793545, "step": 231 }, { "advantage_max": 1.15317015722394, "advantage_mean": 1.3038516488705909e-08, "advantage_min": -0.444794662296772, "advantage_std": 0.598000954836607, "completion_length": 2860.708450317383, "epoch": 0.2651428571428571, "grad_norm": 0.2935395836830139, "kl": 0.30963134765625, "lambda_div_used": 0.5, "learning_rate": 6.83068622519821e-07, "loss": 0.0267, "reward": -0.4222787544131279, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.4222787544131279, "reward_after_std": 0.5980009585618973, "reward_before_mean": -0.1471138414926827, "reward_before_std": 0.5847893413156271, "reward_change_max": 0.0004475414752960205, "reward_change_mean": -0.27516489988192916, "reward_change_min": -0.580917950719595, "reward_change_std": 0.23027911875396967, "reward_std": 0.5980009716004133, "rewards/cosine_scaled_reward": -0.2402235958725214, "rewards/format_reward": 0.3333333395421505, "step": 232 }, { "advantage_max": 1.4307596236467361, "advantage_mean": 7.45058065243498e-09, "advantage_min": -0.5155288130044937, "advantage_std": 0.7287798225879669, "completion_length": 2711.333427429199, "epoch": 0.2662857142857143, "grad_norm": 0.37518948316574097, "kl": 0.260467529296875, "lambda_div_used": 0.5, "learning_rate": 6.800643086250121e-07, "loss": 0.0252, "reward": -0.16911163227632642, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.16911163227632642, "reward_after_std": 0.7287798337638378, "reward_before_mean": 0.271101389080286, "reward_before_std": 0.6243734955787659, "reward_change_max": 0.0, "reward_change_mean": -0.4402130162343383, "reward_change_min": -0.7976448684930801, "reward_change_std": 0.3009780514985323, "reward_std": 0.7287798523902893, "rewards/cosine_scaled_reward": -0.17694931849837303, "rewards/format_reward": 0.6250000149011612, "step": 233 }, { "advantage_max": 1.38912433385849, "advantage_mean": 1.2417633588057697e-09, "advantage_min": -0.5794681049883366, "advantage_std": 0.7250793315470219, "completion_length": 2647.0208854675293, "epoch": 0.2674285714285714, "grad_norm": 0.3574603796005249, "kl": 0.2249755859375, "lambda_div_used": 0.5, "learning_rate": 6.770536555792944e-07, "loss": 0.0235, "reward": -0.2084459774196148, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.2084459774196148, "reward_after_std": 0.7250793501734734, "reward_before_mean": 0.20548728946596384, "reward_before_std": 0.6992978528141975, "reward_change_max": 0.0009614154696464539, "reward_change_mean": -0.4139332454651594, "reward_change_min": -0.8338779956102371, "reward_change_std": 0.32731484808027744, "reward_std": 0.7250793538987637, "rewards/cosine_scaled_reward": -0.09517304040491581, "rewards/format_reward": 0.39583333767950535, "step": 234 }, { "advantage_max": 1.5219962149858475, "advantage_mean": -1.117587211663107e-08, "advantage_min": -0.5729375965893269, "advantage_std": 0.7900757826864719, "completion_length": 2313.7083892822266, "epoch": 0.26857142857142857, "grad_norm": 0.30398738384246826, "kl": 0.222442626953125, "lambda_div_used": 0.5, "learning_rate": 6.740368101176495e-07, "loss": 0.0332, "reward": -0.08736011805012822, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.08736011805012822, "reward_after_std": 0.7900757752358913, "reward_before_mean": 0.403134074062109, "reward_before_std": 0.7198442071676254, "reward_change_max": 0.0, "reward_change_mean": -0.4904941339045763, "reward_change_min": -0.928714144974947, "reward_change_std": 0.34090583585202694, "reward_std": 0.7900757789611816, "rewards/cosine_scaled_reward": -0.07968299090862274, "rewards/format_reward": 0.5625000074505806, "step": 235 }, { "advantage_max": 2.1206346452236176, "advantage_mean": -1.428027990302283e-08, "advantage_min": -0.9444840997457504, "advantage_std": 1.131901353597641, "completion_length": 2942.8959045410156, "epoch": 0.26971428571428574, "grad_norm": 1.6650360822677612, "kl": 0.2109375, "lambda_div_used": 0.5, "learning_rate": 6.710139192768694e-07, "loss": 0.0798, "reward": 0.06566341919824481, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.06566341919824481, "reward_after_std": 1.1319013722240925, "reward_before_mean": 0.585906186606735, "reward_before_std": 1.19473173096776, "reward_change_max": 0.00039448589086532593, "reward_change_mean": -0.520242765545845, "reward_change_min": -1.2649845704436302, "reward_change_std": 0.5010114163160324, "reward_std": 1.1319014057517052, "rewards/cosine_scaled_reward": 0.022119746543467045, "rewards/format_reward": 0.541666679084301, "step": 236 }, { "advantage_max": 1.702420450747013, "advantage_mean": 1.2417634476236117e-08, "advantage_min": -0.7322128117084503, "advantage_std": 0.8832337111234665, "completion_length": 2699.979217529297, "epoch": 0.27085714285714285, "grad_norm": 0.5813829302787781, "kl": 0.23004150390625, "lambda_div_used": 0.5, "learning_rate": 6.679851303883891e-07, "loss": -0.0006, "reward": 0.06363435182720423, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.06363435182720423, "reward_after_std": 0.8832337185740471, "reward_before_mean": 0.651249460875988, "reward_before_std": 0.80680762976408, "reward_change_max": 0.0011894777417182922, "reward_change_mean": -0.5876150969415903, "reward_change_min": -1.0081750489771366, "reward_change_std": 0.4123306255787611, "reward_std": 0.8832337334752083, "rewards/cosine_scaled_reward": 0.06520804762840271, "rewards/format_reward": 0.5208333507180214, "step": 237 }, { "advantage_max": 1.7350734397768974, "advantage_mean": -1.614292466367573e-08, "advantage_min": -0.8302115723490715, "advantage_std": 0.9283706545829773, "completion_length": 2385.9166870117188, "epoch": 0.272, "grad_norm": 0.5285000205039978, "kl": 0.22991943359375, "lambda_div_used": 0.5, "learning_rate": 6.649505910711058e-07, "loss": -0.0073, "reward": 0.16460101958364248, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.16460101958364248, "reward_after_std": 0.9283706694841385, "reward_before_mean": 0.8287704335525632, "reward_before_std": 0.8974343203008175, "reward_change_max": 0.003822311758995056, "reward_change_mean": -0.6641694214195013, "reward_change_min": -1.260919313877821, "reward_change_std": 0.5075275972485542, "reward_std": 0.9283707290887833, "rewards/cosine_scaled_reward": 0.13313521444797516, "rewards/format_reward": 0.5625000149011612, "step": 238 }, { "advantage_max": 1.426647413522005, "advantage_mean": 3.7252898543727042e-09, "advantage_min": -0.5672443434596062, "advantage_std": 0.7399463746696711, "completion_length": 2061.166706085205, "epoch": 0.27314285714285713, "grad_norm": 0.21509422361850739, "kl": 0.182098388671875, "lambda_div_used": 0.5, "learning_rate": 6.619104492241847e-07, "loss": 0.0163, "reward": 0.21323653869330883, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.21323653869330883, "reward_after_std": 0.7399463709443808, "reward_before_mean": 0.9677451644092798, "reward_before_std": 0.576780516654253, "reward_change_max": 0.0005485713481903076, "reward_change_mean": -0.7545085661113262, "reward_change_min": -1.1895010620355606, "reward_change_std": 0.44879256654530764, "reward_std": 0.7399463932961226, "rewards/cosine_scaled_reward": 0.16095588821917772, "rewards/format_reward": 0.6458333414047956, "step": 239 }, { "advantage_max": 1.2942400351166725, "advantage_mean": 3.104408563547878e-09, "advantage_min": -0.5551019683480263, "advantage_std": 0.6765760257840157, "completion_length": 2909.104232788086, "epoch": 0.2742857142857143, "grad_norm": 0.4627504348754883, "kl": 0.38671875, "lambda_div_used": 0.5, "learning_rate": 6.588648530198504e-07, "loss": 0.0268, "reward": -0.2009472165373154, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.2009472165373154, "reward_after_std": 0.6765760257840157, "reward_before_mean": 0.23628009483218193, "reward_before_std": 0.6305925287306309, "reward_change_max": 0.0005473867058753967, "reward_change_mean": -0.437227301299572, "reward_change_min": -0.8425621017813683, "reward_change_std": 0.3167913742363453, "reward_std": 0.676576055586338, "rewards/cosine_scaled_reward": -0.18394330446608365, "rewards/format_reward": 0.604166679084301, "step": 240 }, { "advantage_max": 0.9308176077902317, "advantage_mean": 1.8626452213954536e-08, "advantage_min": -0.3792998418211937, "advantage_std": 0.4861900471150875, "completion_length": 2913.6875, "epoch": 0.2754285714285714, "grad_norm": 0.4878361225128174, "kl": 0.33270263671875, "lambda_div_used": 0.5, "learning_rate": 6.558139508961654e-07, "loss": 0.0172, "reward": -0.40733741596341133, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.40733741596341133, "reward_after_std": 0.4861900359392166, "reward_before_mean": -0.08301575854420662, "reward_before_std": 0.4412674307823181, "reward_change_max": 0.0004919767379760742, "reward_change_mean": -0.32432164903730154, "reward_change_min": -0.6432442888617516, "reward_change_std": 0.23452804517000914, "reward_std": 0.4861900471150875, "rewards/cosine_scaled_reward": -0.2602578904479742, "rewards/format_reward": 0.4375000037252903, "step": 241 }, { "advantage_max": 1.4320058524608612, "advantage_mean": -2.4835267731226907e-09, "advantage_min": -0.6539147347211838, "advantage_std": 0.7636258415877819, "completion_length": 2382.5417404174805, "epoch": 0.2765714285714286, "grad_norm": 0.5571370124816895, "kl": 0.2674560546875, "lambda_div_used": 0.5, "learning_rate": 6.527578915497951e-07, "loss": 0.0559, "reward": 0.047114765271544456, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.047114765271544456, "reward_after_std": 0.7636258564889431, "reward_before_mean": 0.6656512469053268, "reward_before_std": 0.7144070193171501, "reward_change_max": 0.0007030218839645386, "reward_change_mean": -0.6185364685952663, "reward_change_min": -1.1091222614049911, "reward_change_std": 0.42813634127378464, "reward_std": 0.7636258639395237, "rewards/cosine_scaled_reward": -0.0630077242385596, "rewards/format_reward": 0.7916666753590107, "step": 242 }, { "advantage_max": 1.5952362790703773, "advantage_mean": 1.7384688466570708e-08, "advantage_min": -0.58317955955863, "advantage_std": 0.8279558680951595, "completion_length": 2815.541702270508, "epoch": 0.2777142857142857, "grad_norm": 0.612875759601593, "kl": 0.308380126953125, "lambda_div_used": 0.5, "learning_rate": 6.496968239287603e-07, "loss": 0.0432, "reward": -0.06789615005254745, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.06789615005254745, "reward_after_std": 0.827955886721611, "reward_before_mean": 0.42719752667471766, "reward_before_std": 0.7438078261911869, "reward_change_max": 0.0016501322388648987, "reward_change_mean": -0.49509366787970066, "reward_change_min": -0.9931257180869579, "reward_change_std": 0.38069539703428745, "reward_std": 0.8279559127986431, "rewards/cosine_scaled_reward": -0.025984576670452952, "rewards/format_reward": 0.47916666977107525, "step": 243 }, { "advantage_max": 1.8111069053411484, "advantage_mean": -3.7252901874396116e-09, "advantage_min": -0.6616870537400246, "advantage_std": 0.9255795367062092, "completion_length": 2855.2083892822266, "epoch": 0.27885714285714286, "grad_norm": 0.47464779019355774, "kl": 0.323638916015625, "lambda_div_used": 0.5, "learning_rate": 6.466308972251785e-07, "loss": 0.0105, "reward": 0.07657577097415924, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.07657577097415924, "reward_after_std": 0.9255795031785965, "reward_before_mean": 0.6568863705615513, "reward_before_std": 0.802655566483736, "reward_change_max": 0.00221802294254303, "reward_change_mean": -0.5803106501698494, "reward_change_min": -1.0821171216666698, "reward_change_std": 0.39801392890512943, "reward_std": 0.9255795180797577, "rewards/cosine_scaled_reward": 0.09927653288468719, "rewards/format_reward": 0.45833334140479565, "step": 244 }, { "advantage_max": 1.9345757067203522, "advantage_mean": 1.117587122845265e-08, "advantage_min": -0.8029041439294815, "advantage_std": 1.0185250714421272, "completion_length": 2893.0209350585938, "epoch": 0.28, "grad_norm": 1.1499474048614502, "kl": 0.329345703125, "lambda_div_used": 0.5, "learning_rate": 6.435602608679916e-07, "loss": 0.0769, "reward": -0.11789725301787257, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.11789725301787257, "reward_after_std": 1.0185250639915466, "reward_before_mean": 0.27830402879044414, "reward_before_std": 1.0572170242667198, "reward_change_max": 0.00021164864301681519, "reward_change_mean": -0.3962012715637684, "reward_change_min": -1.0164557546377182, "reward_change_std": 0.3989331964403391, "reward_std": 1.0185250863432884, "rewards/cosine_scaled_reward": -0.03793133102590218, "rewards/format_reward": 0.35416667722165585, "step": 245 }, { "advantage_max": 1.797962486743927, "advantage_mean": -1.6763806787167823e-08, "advantage_min": -0.8499506339430809, "advantage_std": 0.9726065471768379, "completion_length": 2728.291763305664, "epoch": 0.28114285714285714, "grad_norm": 0.7932919263839722, "kl": 0.32568359375, "lambda_div_used": 0.5, "learning_rate": 6.404850645156841e-07, "loss": 0.0096, "reward": 0.08054419793188572, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.08054419793188572, "reward_after_std": 0.9726065471768379, "reward_before_mean": 0.6667746864259243, "reward_before_std": 1.0122821666300297, "reward_change_max": 0.0, "reward_change_mean": -0.5862304829061031, "reward_change_min": -1.2132366746664047, "reward_change_std": 0.4926592092961073, "reward_std": 0.9726065844297409, "rewards/cosine_scaled_reward": 0.041720665991306305, "rewards/format_reward": 0.5833333525806665, "step": 246 }, { "advantage_max": 1.1503918841481209, "advantage_mean": 9.313226134732844e-09, "advantage_min": -0.5107267498970032, "advantage_std": 0.624145220965147, "completion_length": 2959.5625610351562, "epoch": 0.2822857142857143, "grad_norm": 0.2704883813858032, "kl": 0.33203125, "lambda_div_used": 0.5, "learning_rate": 6.374054580489873e-07, "loss": 0.027, "reward": -0.28593634255230427, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.28593634255230427, "reward_after_std": 0.6241452284157276, "reward_before_mean": 0.09957591397687793, "reward_before_std": 0.6440289784222841, "reward_change_max": 0.0004436373710632324, "reward_change_mean": -0.3855122644454241, "reward_change_min": -0.8384397625923157, "reward_change_std": 0.3347723223268986, "reward_std": 0.6241452470421791, "rewards/cosine_scaled_reward": -0.22104538418352604, "rewards/format_reward": 0.5416666772216558, "step": 247 }, { "advantage_max": 1.7827886566519737, "advantage_mean": 1.2417632477834672e-09, "advantage_min": -0.7948522940278053, "advantage_std": 0.9226235747337341, "completion_length": 2637.9375762939453, "epoch": 0.2834285714285714, "grad_norm": 0.6064723134040833, "kl": 0.323699951171875, "lambda_div_used": 0.5, "learning_rate": 6.343215915635761e-07, "loss": 0.0602, "reward": 0.15817245468497276, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.15817245468497276, "reward_after_std": 0.9226235896348953, "reward_before_mean": 0.8102723509073257, "reward_before_std": 0.8119069188833237, "reward_change_max": 0.0, "reward_change_mean": -0.652099872007966, "reward_change_min": -1.090793576091528, "reward_change_std": 0.43665625154972076, "reward_std": 0.9226236119866371, "rewards/cosine_scaled_reward": 0.08221947122365236, "rewards/format_reward": 0.6458333432674408, "step": 248 }, { "advantage_max": 1.5615429654717445, "advantage_mean": -9.934107314535368e-09, "advantage_min": -0.7385703772306442, "advantage_std": 0.8267139345407486, "completion_length": 2314.9584045410156, "epoch": 0.2845714285714286, "grad_norm": 0.5855349898338318, "kl": 0.285888671875, "lambda_div_used": 0.5, "learning_rate": 6.31233615362752e-07, "loss": 0.0494, "reward": 0.21203571744263172, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.21203571744263172, "reward_after_std": 0.8267139345407486, "reward_before_mean": 0.9464347688481212, "reward_before_std": 0.7446025982499123, "reward_change_max": 0.0, "reward_change_mean": -0.7343990430235863, "reward_change_min": -1.168241087347269, "reward_change_std": 0.47521305456757545, "reward_std": 0.8267139419913292, "rewards/cosine_scaled_reward": 0.15030069323256612, "rewards/format_reward": 0.6458333432674408, "step": 249 }, { "advantage_max": 1.6400514990091324, "advantage_mean": 6.208815128694312e-10, "advantage_min": -0.6048454195261002, "advantage_std": 0.8398489728569984, "completion_length": 2437.979248046875, "epoch": 0.2857142857142857, "grad_norm": 0.5643945336341858, "kl": 0.34326171875, "lambda_div_used": 0.5, "learning_rate": 6.281416799501187e-07, "loss": 0.0226, "reward": -0.026893689762800932, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.026893689762800932, "reward_after_std": 0.8398489952087402, "reward_before_mean": 0.49545107781887054, "reward_before_std": 0.7367004603147507, "reward_change_max": 0.0, "reward_change_mean": -0.5223447624593973, "reward_change_min": -0.9968926385045052, "reward_change_std": 0.3599364850670099, "reward_std": 0.8398490101099014, "rewards/cosine_scaled_reward": -0.13769113458693027, "rewards/format_reward": 0.770833345130086, "step": 250 }, { "advantage_max": 1.7156466022133827, "advantage_mean": -8.692344455329959e-09, "advantage_min": -0.6233803145587444, "advantage_std": 0.8830557502806187, "completion_length": 2032.9167175292969, "epoch": 0.28685714285714287, "grad_norm": 0.5100935101509094, "kl": 0.2845458984375, "lambda_div_used": 0.5, "learning_rate": 6.25045936022246e-07, "loss": 0.0108, "reward": 0.15881825191900134, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.15881825191900134, "reward_after_std": 0.8830557428300381, "reward_before_mean": 0.8191157560795546, "reward_before_std": 0.7504131086170673, "reward_change_max": 0.0, "reward_change_mean": -0.6602975130081177, "reward_change_min": -1.1482919603586197, "reward_change_std": 0.4236573148518801, "reward_std": 0.8830557651817799, "rewards/cosine_scaled_reward": 0.013724527321755886, "rewards/format_reward": 0.7916666772216558, "step": 251 }, { "advantage_max": 1.3824920132756233, "advantage_mean": -7.450580263856921e-09, "advantage_min": -0.5751038901507854, "advantage_std": 0.7163071185350418, "completion_length": 2569.6042098999023, "epoch": 0.288, "grad_norm": 0.44910934567451477, "kl": 0.32745361328125, "lambda_div_used": 0.5, "learning_rate": 6.219465344613258e-07, "loss": 0.0476, "reward": -0.045434391126036644, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.045434391126036644, "reward_after_std": 0.7163071371614933, "reward_before_mean": 0.5019231028854847, "reward_before_std": 0.6015528850257397, "reward_change_max": 0.0, "reward_change_mean": -0.5473575107753277, "reward_change_min": -0.9850481189787388, "reward_change_std": 0.3646436370909214, "reward_std": 0.7163071446120739, "rewards/cosine_scaled_reward": -0.0719551183283329, "rewards/format_reward": 0.6458333544433117, "step": 252 }, { "advantage_max": 1.5739115923643112, "advantage_mean": 1.7074247515846963e-08, "advantage_min": -0.7448620498180389, "advantage_std": 0.8350856080651283, "completion_length": 2185.479217529297, "epoch": 0.28914285714285715, "grad_norm": 0.6235557794570923, "kl": 0.2657470703125, "lambda_div_used": 0.5, "learning_rate": 6.188436263278172e-07, "loss": 0.0477, "reward": 0.13030250370502472, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.13030250370502472, "reward_after_std": 0.8350856006145477, "reward_before_mean": 0.7909778542816639, "reward_before_std": 0.7733752019703388, "reward_change_max": 0.0005941390991210938, "reward_change_mean": -0.660675348713994, "reward_change_min": -1.1640889719128609, "reward_change_std": 0.4511568062007427, "reward_std": 0.8350856229662895, "rewards/cosine_scaled_reward": 0.0100722536444664, "rewards/format_reward": 0.770833345130086, "step": 253 }, { "advantage_max": 1.4640011079609394, "advantage_mean": 1.0554989549049765e-08, "advantage_min": -0.575132817029953, "advantage_std": 0.7637783885002136, "completion_length": 2904.3125610351562, "epoch": 0.29028571428571426, "grad_norm": 0.6790186166763306, "kl": 0.49212646484375, "lambda_div_used": 0.5, "learning_rate": 6.157373628530852e-07, "loss": 0.044, "reward": -0.17827866226434708, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.17827866226434708, "reward_after_std": 0.7637783773243427, "reward_before_mean": 0.24730434804223478, "reward_before_std": 0.7225446216762066, "reward_change_max": 0.0017682760953903198, "reward_change_mean": -0.42558303009718657, "reward_change_min": -0.908573143184185, "reward_change_std": 0.33383440785109997, "reward_std": 0.763778381049633, "rewards/cosine_scaled_reward": -0.1367644937708974, "rewards/format_reward": 0.5208333395421505, "step": 254 }, { "advantage_max": 1.595671109855175, "advantage_mean": 1.2417634698280722e-08, "advantage_min": -0.6261331886053085, "advantage_std": 0.820063129067421, "completion_length": 2718.9375610351562, "epoch": 0.2914285714285714, "grad_norm": 0.6525692939758301, "kl": 0.441162109375, "lambda_div_used": 0.5, "learning_rate": 6.126278954320294e-07, "loss": 0.0181, "reward": -0.21421317756175995, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.21421317756175995, "reward_after_std": 0.8200631737709045, "reward_before_mean": 0.16194157907739282, "reward_before_std": 0.7840095274150372, "reward_change_max": 0.001371636986732483, "reward_change_mean": -0.3761547487229109, "reward_change_min": -0.768225908279419, "reward_change_std": 0.30263841338455677, "reward_std": 0.8200631812214851, "rewards/cosine_scaled_reward": -0.16902921721339226, "rewards/format_reward": 0.5000000093132257, "step": 255 }, { "advantage_max": 1.0734341964125633, "advantage_mean": -4.346171700309043e-09, "advantage_min": -0.56340616568923, "advantage_std": 0.5797542482614517, "completion_length": 2705.5625762939453, "epoch": 0.2925714285714286, "grad_norm": 0.5698035359382629, "kl": 0.4195556640625, "lambda_div_used": 0.5, "learning_rate": 6.095153756157051e-07, "loss": 0.0522, "reward": -0.21560884034261107, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.21560884034261107, "reward_after_std": 0.5797542706131935, "reward_before_mean": 0.24173666816204786, "reward_before_std": 0.5742171257734299, "reward_change_max": 4.9874186515808105e-05, "reward_change_mean": -0.457345524802804, "reward_change_min": -0.8107205480337143, "reward_change_std": 0.33175988495349884, "reward_std": 0.579754289239645, "rewards/cosine_scaled_reward": -0.12913167104125023, "rewards/format_reward": 0.5000000167638063, "step": 256 }, { "advantage_max": 1.8975205719470978, "advantage_mean": -2.483526828633842e-09, "advantage_min": -0.8254300951957703, "advantage_std": 1.0045362412929535, "completion_length": 2797.666748046875, "epoch": 0.2937142857142857, "grad_norm": 1.2031116485595703, "kl": 0.36602783203125, "lambda_div_used": 0.5, "learning_rate": 6.06399955103937e-07, "loss": 0.0641, "reward": 0.21093771699815989, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.21093771699815989, "reward_after_std": 1.004536233842373, "reward_before_mean": 0.8886510282754898, "reward_before_std": 0.9414378330111504, "reward_change_max": 0.0012116432189941406, "reward_change_mean": -0.6777133084833622, "reward_change_min": -1.307423584163189, "reward_change_std": 0.5130571741610765, "reward_std": 1.0045362561941147, "rewards/cosine_scaled_reward": 0.07974217180162668, "rewards/format_reward": 0.729166679084301, "step": 257 }, { "advantage_max": 1.6636093333363533, "advantage_mean": 4.967053435223079e-09, "advantage_min": -0.8262095041573048, "advantage_std": 0.8977803438901901, "completion_length": 2957.854248046875, "epoch": 0.2948571428571429, "grad_norm": 0.6931731700897217, "kl": 0.4793701171875, "lambda_div_used": 0.5, "learning_rate": 6.032817857379256e-07, "loss": 0.0403, "reward": -0.015886036679148674, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.015886036679148674, "reward_after_std": 0.8977803215384483, "reward_before_mean": 0.5135537143796682, "reward_before_std": 0.9260082244873047, "reward_change_max": 0.0007776841521263123, "reward_change_mean": -0.5294397762045264, "reward_change_min": -1.0711536519229412, "reward_change_std": 0.4524089526385069, "reward_std": 0.8977803774178028, "rewards/cosine_scaled_reward": -0.014056478627026081, "rewards/format_reward": 0.5416666846722364, "step": 258 }, { "advantage_max": 1.886551707983017, "advantage_mean": -2.2351742234860694e-08, "advantage_min": -0.8040063604712486, "advantage_std": 0.991726316511631, "completion_length": 2334.0625610351562, "epoch": 0.296, "grad_norm": 0.7005351185798645, "kl": 0.38140869140625, "lambda_div_used": 0.5, "learning_rate": 6.001610194928464e-07, "loss": 0.0496, "reward": 0.23102650791406631, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.23102650791406631, "reward_after_std": 0.9917263463139534, "reward_before_mean": 0.9282313352450728, "reward_before_std": 0.9095796346664429, "reward_change_max": 0.002127528190612793, "reward_change_mean": -0.6972048059105873, "reward_change_min": -1.2518923357129097, "reward_change_std": 0.4892759174108505, "reward_std": 0.9917263612151146, "rewards/cosine_scaled_reward": 0.07869898644275963, "rewards/format_reward": 0.7708333432674408, "step": 259 }, { "advantage_max": 1.7901442348957062, "advantage_mean": -2.6077033421501028e-08, "advantage_min": -0.8769202791154385, "advantage_std": 0.9542439803481102, "completion_length": 1963.6667022705078, "epoch": 0.29714285714285715, "grad_norm": 0.6481761932373047, "kl": 0.27838134765625, "lambda_div_used": 0.5, "learning_rate": 5.97037808470444e-07, "loss": 0.0426, "reward": 0.2789585944265127, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.2789585944265127, "reward_after_std": 0.9542439877986908, "reward_before_mean": 1.0259063299745321, "reward_before_std": 0.9065537489950657, "reward_change_max": 0.0016413480043411255, "reward_change_mean": -0.7469477131962776, "reward_change_min": -1.3164174929261208, "reward_change_std": 0.5313061438500881, "reward_std": 0.9542440101504326, "rewards/cosine_scaled_reward": 0.11711981240659952, "rewards/format_reward": 0.7916666716337204, "step": 260 }, { "advantage_max": 1.4177104532718658, "advantage_mean": -1.1102230246251565e-16, "advantage_min": -0.6767967715859413, "advantage_std": 0.7486027590930462, "completion_length": 2660.0834197998047, "epoch": 0.29828571428571427, "grad_norm": 0.4558544158935547, "kl": 0.396484375, "lambda_div_used": 0.5, "learning_rate": 5.939123048916173e-07, "loss": 0.0347, "reward": -0.09252565540373325, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.09252565540373325, "reward_after_std": 0.7486027553677559, "reward_before_mean": 0.41056731529533863, "reward_before_std": 0.7132025845348835, "reward_change_max": 0.0010763555765151978, "reward_change_mean": -0.5030929781496525, "reward_change_min": -0.8874324820935726, "reward_change_std": 0.3634376022964716, "reward_std": 0.7486027553677559, "rewards/cosine_scaled_reward": -0.09679968375712633, "rewards/format_reward": 0.6041666809469461, "step": 261 }, { "advantage_max": 1.3604392558336258, "advantage_mean": 7.1401400625337175e-09, "advantage_min": -0.610820833593607, "advantage_std": 0.7167367935180664, "completion_length": 2684.2709350585938, "epoch": 0.29942857142857143, "grad_norm": 0.5589156150817871, "kl": 0.40673828125, "lambda_div_used": 0.5, "learning_rate": 5.907846610890011e-07, "loss": 0.0415, "reward": -0.1675438095408026, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.1675438095408026, "reward_after_std": 0.716736800968647, "reward_before_mean": 0.28486710973083973, "reward_before_std": 0.6902591176331043, "reward_change_max": 0.0013748407363891602, "reward_change_mean": -0.45241091772913933, "reward_change_min": -0.8601580671966076, "reward_change_std": 0.3375726994127035, "reward_std": 0.7167368233203888, "rewards/cosine_scaled_reward": -0.2117331251502037, "rewards/format_reward": 0.7083333469927311, "step": 262 }, { "advantage_max": 1.264581359922886, "advantage_mean": 3.7252904094842165e-09, "advantage_min": -0.5539588704705238, "advantage_std": 0.6596848592162132, "completion_length": 2776.000030517578, "epoch": 0.30057142857142854, "grad_norm": 0.8647629618644714, "kl": 0.4727783203125, "lambda_div_used": 0.5, "learning_rate": 5.87655029499542e-07, "loss": 0.0256, "reward": -0.13164973491802812, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.13164973491802812, "reward_after_std": 0.6596848666667938, "reward_before_mean": 0.3626173473894596, "reward_before_std": 0.580600518733263, "reward_change_max": 0.0, "reward_change_mean": -0.4942670837044716, "reward_change_min": -0.8870192393660545, "reward_change_std": 0.32861434295773506, "reward_std": 0.6596848852932453, "rewards/cosine_scaled_reward": -0.1936913337558508, "rewards/format_reward": 0.750000013038516, "step": 263 }, { "advantage_max": 1.5834196358919144, "advantage_mean": -3.725290298461914e-09, "advantage_min": -0.6787533052265644, "advantage_std": 0.8481276035308838, "completion_length": 2627.8959350585938, "epoch": 0.3017142857142857, "grad_norm": 0.8860915303230286, "kl": 0.45947265625, "lambda_div_used": 0.5, "learning_rate": 5.845235626570683e-07, "loss": 0.0302, "reward": -0.08708051778376102, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.08708051778376102, "reward_after_std": 0.8481276035308838, "reward_before_mean": 0.3910123445093632, "reward_before_std": 0.864212442189455, "reward_change_max": 0.0015831664204597473, "reward_change_mean": -0.47809283807873726, "reward_change_min": -1.0792249664664268, "reward_change_std": 0.40922513976693153, "reward_std": 0.8481276258826256, "rewards/cosine_scaled_reward": -0.10657717660069466, "rewards/format_reward": 0.6041666753590107, "step": 264 }, { "advantage_max": 1.3798726946115494, "advantage_mean": 1.6142925107764938e-08, "advantage_min": -0.7368720173835754, "advantage_std": 0.7367407977581024, "completion_length": 2043.7292022705078, "epoch": 0.3028571428571429, "grad_norm": 0.7481672763824463, "kl": 0.21441650390625, "lambda_div_used": 0.5, "learning_rate": 5.813904131848564e-07, "loss": -0.0053, "reward": 0.1203412376344204, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.1203412376344204, "reward_after_std": 0.7367407903075218, "reward_before_mean": 0.8063299190253019, "reward_before_std": 0.6718975119292736, "reward_change_max": 0.0, "reward_change_mean": -0.6859886385500431, "reward_change_min": -1.1170197911560535, "reward_change_std": 0.44301604852080345, "reward_std": 0.7367408089339733, "rewards/cosine_scaled_reward": -0.003085056319832802, "rewards/format_reward": 0.812500013038516, "step": 265 }, { "advantage_max": 1.5312513262033463, "advantage_mean": 6.208816794028849e-10, "advantage_min": -0.7652738317847252, "advantage_std": 0.807264044880867, "completion_length": 2435.9375610351562, "epoch": 0.304, "grad_norm": 0.956333339214325, "kl": 0.31561279296875, "lambda_div_used": 0.5, "learning_rate": 5.78255733788191e-07, "loss": 0.0178, "reward": 0.039600093849003315, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.039600093849003315, "reward_after_std": 0.807264044880867, "reward_before_mean": 0.6330936271697283, "reward_before_std": 0.7511084713041782, "reward_change_max": 0.0, "reward_change_mean": -0.5934935286641121, "reward_change_min": -1.0697472617030144, "reward_change_std": 0.4119662679731846, "reward_std": 0.8072640635073185, "rewards/cosine_scaled_reward": -0.04803652781993151, "rewards/format_reward": 0.7291666865348816, "step": 266 }, { "advantage_max": 1.6614954099059105, "advantage_mean": -4.967053768289986e-09, "advantage_min": -0.6934118308126926, "advantage_std": 0.8682034313678741, "completion_length": 2963.6458892822266, "epoch": 0.30514285714285716, "grad_norm": 1.267866611480713, "kl": 0.4324951171875, "lambda_div_used": 0.5, "learning_rate": 5.751196772469237e-07, "loss": 0.0592, "reward": -0.19175553228706121, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.19175553228706121, "reward_after_std": 0.868203416466713, "reward_before_mean": 0.19106070883572102, "reward_before_std": 0.8646641299128532, "reward_change_max": 0.0, "reward_change_mean": -0.3828162420541048, "reward_change_min": -0.8564209714531898, "reward_change_std": 0.33900113217532635, "reward_std": 0.8682034537196159, "rewards/cosine_scaled_reward": -0.1753029921092093, "rewards/format_reward": 0.5416666734963655, "step": 267 }, { "advantage_max": 1.3772685006260872, "advantage_mean": 1.0554989382516311e-08, "advantage_min": -0.6027562022209167, "advantage_std": 0.7217709645628929, "completion_length": 2267.0208740234375, "epoch": 0.3062857142857143, "grad_norm": 0.48167508840560913, "kl": 0.257080078125, "lambda_div_used": 0.5, "learning_rate": 5.71982396408026e-07, "loss": 0.0281, "reward": -0.1674497053027153, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.1674497053027153, "reward_after_std": 0.7217709608376026, "reward_before_mean": 0.2781257377937436, "reward_before_std": 0.6824806816875935, "reward_change_max": 7.049739360809326e-05, "reward_change_mean": -0.44557544589042664, "reward_change_min": -0.9256918206810951, "reward_change_std": 0.33537369780242443, "reward_std": 0.7217709757387638, "rewards/cosine_scaled_reward": -0.13177046133205295, "rewards/format_reward": 0.5416666734963655, "step": 268 }, { "advantage_max": 1.4905111193656921, "advantage_mean": 5.587935503204022e-09, "advantage_min": -0.5878796353936195, "advantage_std": 0.7800829708576202, "completion_length": 2680.1876068115234, "epoch": 0.30742857142857144, "grad_norm": 0.42294082045555115, "kl": 0.29730224609375, "lambda_div_used": 0.5, "learning_rate": 5.688440441781398e-07, "loss": 0.0065, "reward": -0.014324287883937359, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.014324287883937359, "reward_after_std": 0.7800829447805882, "reward_before_mean": 0.5407853480428457, "reward_before_std": 0.7013232558965683, "reward_change_max": 0.0007768869400024414, "reward_change_mean": -0.5551096498966217, "reward_change_min": -1.0575359836220741, "reward_change_std": 0.38940995931625366, "reward_std": 0.780082993209362, "rewards/cosine_scaled_reward": -0.12544066738337278, "rewards/format_reward": 0.7916666753590107, "step": 269 }, { "advantage_max": 1.7074644267559052, "advantage_mean": -7.45058115203534e-09, "advantage_min": -0.7014975696802139, "advantage_std": 0.886136669665575, "completion_length": 2692.854217529297, "epoch": 0.30857142857142855, "grad_norm": 0.725864589214325, "kl": 0.239715576171875, "lambda_div_used": 0.5, "learning_rate": 5.657047735161255e-07, "loss": 0.0403, "reward": 0.07836535479873419, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.07836535479873419, "reward_after_std": 0.8861366622149944, "reward_before_mean": 0.6759741138666868, "reward_before_std": 0.7980044074356556, "reward_change_max": 0.0, "reward_change_mean": -0.5976087264716625, "reward_change_min": -1.1347657963633537, "reward_change_std": 0.40823423117399216, "reward_std": 0.8861366920173168, "rewards/cosine_scaled_reward": -0.037012950982898474, "rewards/format_reward": 0.7500000260770321, "step": 270 }, { "advantage_max": 1.5989461839199066, "advantage_mean": -9.31322552411018e-09, "advantage_min": -0.8416948765516281, "advantage_std": 0.858425922691822, "completion_length": 2536.104232788086, "epoch": 0.3097142857142857, "grad_norm": 0.32519644498825073, "kl": 0.22412109375, "lambda_div_used": 0.5, "learning_rate": 5.625647374256061e-07, "loss": 0.008, "reward": 0.2996886605396867, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.2996886605396867, "reward_after_std": 0.8584259375929832, "reward_before_mean": 1.0987447872757912, "reward_before_std": 0.8029434680938721, "reward_change_max": 0.0, "reward_change_mean": -0.7990561313927174, "reward_change_min": -1.3279625624418259, "reward_change_std": 0.5225053429603577, "reward_std": 0.8584259562194347, "rewards/cosine_scaled_reward": 0.13270571175962687, "rewards/format_reward": 0.8333333414047956, "step": 271 }, { "advantage_max": 1.3123992159962654, "advantage_mean": 1.117587122845265e-08, "advantage_min": -0.5966584831476212, "advantage_std": 0.6955144740641117, "completion_length": 2838.416717529297, "epoch": 0.31085714285714283, "grad_norm": 0.5349856615066528, "kl": 0.3720703125, "lambda_div_used": 0.5, "learning_rate": 5.594240889475106e-07, "loss": 0.0444, "reward": -0.0673510073684156, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.0673510073684156, "reward_after_std": 0.6955144926905632, "reward_before_mean": 0.47645336762070656, "reward_before_std": 0.6381653603166342, "reward_change_max": 6.621330976486206e-05, "reward_change_mean": -0.5438043996691704, "reward_change_min": -1.0006888955831528, "reward_change_std": 0.38272993825376034, "reward_std": 0.6955145299434662, "rewards/cosine_scaled_reward": -0.03260663757100701, "rewards/format_reward": 0.5416666772216558, "step": 272 }, { "advantage_max": 1.5722772255539894, "advantage_mean": 1.1796752574788627e-08, "advantage_min": -0.6985844299197197, "advantage_std": 0.8248922377824783, "completion_length": 2693.041717529297, "epoch": 0.312, "grad_norm": 0.406802773475647, "kl": 0.27484130859375, "lambda_div_used": 0.5, "learning_rate": 5.562829811526154e-07, "loss": 0.0212, "reward": 0.11140474304556847, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.11140474304556847, "reward_after_std": 0.8248922675848007, "reward_before_mean": 0.7608576994389296, "reward_before_std": 0.7347896918654442, "reward_change_max": 0.0008430108428001404, "reward_change_mean": -0.6494529545307159, "reward_change_min": -1.1342052109539509, "reward_change_std": 0.4459607619792223, "reward_std": 0.8248922750353813, "rewards/cosine_scaled_reward": 0.04709549807012081, "rewards/format_reward": 0.6666666734963655, "step": 273 }, { "advantage_max": 1.8088025748729706, "advantage_mean": -7.450581041013038e-09, "advantage_min": -0.7626455649733543, "advantage_std": 0.942629911005497, "completion_length": 1852.8542098999023, "epoch": 0.31314285714285717, "grad_norm": 0.9049416780471802, "kl": 0.31414794921875, "lambda_div_used": 0.5, "learning_rate": 5.531415671340826e-07, "loss": 0.0055, "reward": 0.33919756673276424, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.33919756673276424, "reward_after_std": 0.9426299259066582, "reward_before_mean": 1.1372214332222939, "reward_before_std": 0.8105506710708141, "reward_change_max": 0.0016675367951393127, "reward_change_mean": -0.7980238739401102, "reward_change_min": -1.3720234483480453, "reward_change_std": 0.5126860048621893, "reward_std": 0.9426299408078194, "rewards/cosine_scaled_reward": 0.15194405056536198, "rewards/format_reward": 0.833333333954215, "step": 274 }, { "advantage_max": 1.7259643226861954, "advantage_mean": -2.4835267176115394e-09, "advantage_min": -0.7082581743597984, "advantage_std": 0.8969070762395859, "completion_length": 2284.500068664551, "epoch": 0.3142857142857143, "grad_norm": 0.42548516392707825, "kl": 0.208038330078125, "lambda_div_used": 0.5, "learning_rate": 5.5e-07, "loss": 0.0016, "reward": 0.1408673170953989, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.1408673170953989, "reward_after_std": 0.8969070613384247, "reward_before_mean": 0.7898597102612257, "reward_before_std": 0.8003634139895439, "reward_change_max": 0.0011668428778648376, "reward_change_mean": -0.6489923857152462, "reward_change_min": -1.132670484483242, "reward_change_std": 0.4332315605133772, "reward_std": 0.8969071060419083, "rewards/cosine_scaled_reward": 0.040763177908957005, "rewards/format_reward": 0.7083333395421505, "step": 275 }, { "advantage_max": 1.8853215798735619, "advantage_mean": -1.241763691872677e-09, "advantage_min": -0.7883929088711739, "advantage_std": 0.9830573312938213, "completion_length": 2414.3750534057617, "epoch": 0.31542857142857145, "grad_norm": 0.8729665279388428, "kl": 0.23760986328125, "lambda_div_used": 0.5, "learning_rate": 5.468584328659172e-07, "loss": 0.0557, "reward": 0.24620786309242249, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.24620786309242249, "reward_after_std": 0.9830573238432407, "reward_before_mean": 0.9521079548285343, "reward_before_std": 0.8866949342191219, "reward_change_max": 0.0, "reward_change_mean": -0.7059001140296459, "reward_change_min": -1.2638509795069695, "reward_change_std": 0.4728654455393553, "reward_std": 0.9830573461949825, "rewards/cosine_scaled_reward": 0.05938731785863638, "rewards/format_reward": 0.833333358168602, "step": 276 }, { "advantage_max": 1.5835720784962177, "advantage_mean": -8.071462442860167e-09, "advantage_min": -0.6679697595536709, "advantage_std": 0.8301307894289494, "completion_length": 2175.2500762939453, "epoch": 0.31657142857142856, "grad_norm": 0.24413251876831055, "kl": 0.170562744140625, "lambda_div_used": 0.5, "learning_rate": 5.437170188473847e-07, "loss": 0.0264, "reward": -0.008413793984800577, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.008413793984800577, "reward_after_std": 0.8301308266818523, "reward_before_mean": 0.5342761669307947, "reward_before_std": 0.77891606092453, "reward_change_max": 0.0025727152824401855, "reward_change_mean": -0.5426899380981922, "reward_change_min": -0.9942139759659767, "reward_change_std": 0.38135170191526413, "reward_std": 0.8301308341324329, "rewards/cosine_scaled_reward": -0.09744528587907553, "rewards/format_reward": 0.7291666753590107, "step": 277 }, { "advantage_max": 1.3841341733932495, "advantage_mean": -1.3038516211150153e-08, "advantage_min": -0.4801356568932533, "advantage_std": 0.7003735899925232, "completion_length": 2088.479202270508, "epoch": 0.3177142857142857, "grad_norm": 0.29018351435661316, "kl": 0.23699951171875, "lambda_div_used": 0.5, "learning_rate": 5.405759110524894e-07, "loss": 0.0285, "reward": 0.23467270750552416, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.23467270750552416, "reward_after_std": 0.7003735899925232, "reward_before_mean": 1.0165191926062107, "reward_before_std": 0.43884219601750374, "reward_change_max": 0.0, "reward_change_mean": -0.7818465009331703, "reward_change_min": -1.1315721720457077, "reward_change_std": 0.42579494789242744, "reward_std": 0.7003736048936844, "rewards/cosine_scaled_reward": 0.09159291861578822, "rewards/format_reward": 0.8333333414047956, "step": 278 }, { "advantage_max": 1.7110272645950317, "advantage_mean": -1.2417631367611648e-09, "advantage_min": -0.801856491714716, "advantage_std": 0.9113363847136497, "completion_length": 2521.791732788086, "epoch": 0.31885714285714284, "grad_norm": 0.9733167886734009, "kl": 0.2313232421875, "lambda_div_used": 0.5, "learning_rate": 5.37435262574394e-07, "loss": -0.027, "reward": -0.05211097002029419, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.05211097002029419, "reward_after_std": 0.9113363474607468, "reward_before_mean": 0.43769562989473343, "reward_before_std": 0.9335248246788979, "reward_change_max": 0.004060961306095123, "reward_change_mean": -0.48980659805238247, "reward_change_min": -1.0869178883731365, "reward_change_std": 0.42547522112727165, "reward_std": 0.9113363847136497, "rewards/cosine_scaled_reward": -0.0728188632056117, "rewards/format_reward": 0.5833333469927311, "step": 279 }, { "advantage_max": 1.8734301775693893, "advantage_mean": -1.490116141589226e-08, "advantage_min": -0.8812666833400726, "advantage_std": 0.9982732012867928, "completion_length": 2256.062602996826, "epoch": 0.32, "grad_norm": 0.7435464262962341, "kl": 0.198822021484375, "lambda_div_used": 0.5, "learning_rate": 5.342952264838747e-07, "loss": -0.0071, "reward": 0.510563270188868, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.510563270188868, "reward_after_std": 0.998273216187954, "reward_before_mean": 1.439866580069065, "reward_before_std": 0.889154952019453, "reward_change_max": 0.0004360675811767578, "reward_change_mean": -0.9293033089488745, "reward_change_min": -1.5980991758406162, "reward_change_std": 0.6186284609138966, "reward_std": 0.9982732459902763, "rewards/cosine_scaled_reward": 0.32409994560293853, "rewards/format_reward": 0.7916666753590107, "step": 280 }, { "advantage_max": 1.3946446254849434, "advantage_mean": 2.1730860666480112e-08, "advantage_min": -0.6471364013850689, "advantage_std": 0.7381687723100185, "completion_length": 3138.1875915527344, "epoch": 0.3211428571428571, "grad_norm": 0.4560682773590088, "kl": 0.2747802734375, "lambda_div_used": 0.5, "learning_rate": 5.311559558218603e-07, "loss": 0.0362, "reward": -0.31014756578952074, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.31014756578952074, "reward_after_std": 0.7381687834858894, "reward_before_mean": 0.01806825865060091, "reward_before_std": 0.7624465189874172, "reward_change_max": 0.0003070831298828125, "reward_change_mean": -0.3282158114016056, "reward_change_min": -0.7260516695678234, "reward_change_std": 0.30591568164527416, "reward_std": 0.7381688207387924, "rewards/cosine_scaled_reward": -0.20971587905660272, "rewards/format_reward": 0.43750001303851604, "step": 281 }, { "advantage_max": 1.5682580173015594, "advantage_mean": -9.313225690643634e-09, "advantage_min": -0.8357209786772728, "advantage_std": 0.83661337941885, "completion_length": 2523.7500610351562, "epoch": 0.3222857142857143, "grad_norm": 0.4174196720123291, "kl": 0.204254150390625, "lambda_div_used": 0.5, "learning_rate": 5.28017603591974e-07, "loss": 0.028, "reward": 0.19610136304982007, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.19610136304982007, "reward_after_std": 0.8366133980453014, "reward_before_mean": 0.918760965578258, "reward_before_std": 0.7789534255862236, "reward_change_max": 0.0, "reward_change_mean": -0.7226595841348171, "reward_change_min": -1.191369317471981, "reward_change_std": 0.48115156777203083, "reward_std": 0.8366134092211723, "rewards/cosine_scaled_reward": 0.05313047394156456, "rewards/format_reward": 0.8125000074505806, "step": 282 }, { "advantage_max": 1.661266028881073, "advantage_mean": -7.450580485901526e-09, "advantage_min": -0.7974419593811035, "advantage_std": 0.876350536942482, "completion_length": 2636.0833740234375, "epoch": 0.32342857142857145, "grad_norm": 1.408146858215332, "kl": 0.2386474609375, "lambda_div_used": 0.5, "learning_rate": 5.248803227530763e-07, "loss": 0.0824, "reward": 0.21048655919730663, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.21048655919730663, "reward_after_std": 0.8763505443930626, "reward_before_mean": 0.9238283336162567, "reward_before_std": 0.7904699593782425, "reward_change_max": 0.00020716339349746704, "reward_change_mean": -0.7133418209850788, "reward_change_min": -1.2066475562751293, "reward_change_std": 0.48531679809093475, "reward_std": 0.8763505443930626, "rewards/cosine_scaled_reward": 0.1494141835719347, "rewards/format_reward": 0.6250000093132257, "step": 283 }, { "advantage_max": 1.7019705697894096, "advantage_mean": -9.313227133933566e-10, "advantage_min": -0.6638279147446156, "advantage_std": 0.8716025203466415, "completion_length": 2428.5625762939453, "epoch": 0.32457142857142857, "grad_norm": 0.5000388622283936, "kl": 0.23974609375, "lambda_div_used": 0.5, "learning_rate": 5.21744266211809e-07, "loss": 0.0384, "reward": 0.09703357797116041, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.09703357797116041, "reward_after_std": 0.8716025464236736, "reward_before_mean": 0.7079252786934376, "reward_before_std": 0.7274694666266441, "reward_change_max": 0.0, "reward_change_mean": -0.6108916997909546, "reward_change_min": -1.0118683576583862, "reward_change_std": 0.37852455861866474, "reward_std": 0.8716025911271572, "rewards/cosine_scaled_reward": -0.05228736763820052, "rewards/format_reward": 0.8125000111758709, "step": 284 }, { "advantage_max": 1.5750038623809814, "advantage_mean": -9.934107703113426e-09, "advantage_min": -0.7462300956249237, "advantage_std": 0.8351321816444397, "completion_length": 2142.250015258789, "epoch": 0.32571428571428573, "grad_norm": 0.9030725359916687, "kl": 0.224151611328125, "lambda_div_used": 0.5, "learning_rate": 5.186095868151436e-07, "loss": 0.0476, "reward": 0.09167552087455988, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.09167552087455988, "reward_after_std": 0.8351321667432785, "reward_before_mean": 0.7201039753854275, "reward_before_std": 0.7814724817872047, "reward_change_max": 0.001250341534614563, "reward_change_mean": -0.6284284666180611, "reward_change_min": -1.201073795557022, "reward_change_std": 0.4616069979965687, "reward_std": 0.8351322039961815, "rewards/cosine_scaled_reward": -0.02536469604820013, "rewards/format_reward": 0.7708333544433117, "step": 285 }, { "advantage_max": 1.4558663815259933, "advantage_mean": 1.0554989632316492e-08, "advantage_min": -0.6718570664525032, "advantage_std": 0.7702252045273781, "completion_length": 2493.041732788086, "epoch": 0.32685714285714285, "grad_norm": 0.7895695567131042, "kl": 0.38580322265625, "lambda_div_used": 0.5, "learning_rate": 5.154764373429315e-07, "loss": 0.0287, "reward": 0.006931816227734089, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.006931816227734089, "reward_after_std": 0.7702252045273781, "reward_before_mean": 0.5846692929044366, "reward_before_std": 0.7240047045052052, "reward_change_max": 5.987286567687988e-05, "reward_change_mean": -0.5777374971657991, "reward_change_min": -1.0274383313953876, "reward_change_std": 0.4023998789489269, "reward_std": 0.7702252194285393, "rewards/cosine_scaled_reward": -0.07224868983030319, "rewards/format_reward": 0.7291666846722364, "step": 286 }, { "advantage_max": 1.680905520915985, "advantage_mean": -4.346172144398253e-09, "advantage_min": -0.6960576623678207, "advantage_std": 0.8782125003635883, "completion_length": 1915.2291870117188, "epoch": 0.328, "grad_norm": 0.5730679035186768, "kl": 0.2940673828125, "lambda_div_used": 0.5, "learning_rate": 5.123449705004581e-07, "loss": 0.023, "reward": 0.05262707732617855, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.05262707732617855, "reward_after_std": 0.8782124929130077, "reward_before_mean": 0.6271408479660749, "reward_before_std": 0.7979765832424164, "reward_change_max": 0.0, "reward_change_mean": -0.5745137967169285, "reward_change_min": -1.0377584993839264, "reward_change_std": 0.40095450915396214, "reward_std": 0.8782125115394592, "rewards/cosine_scaled_reward": -0.09267958626151085, "rewards/format_reward": 0.8125000204890966, "step": 287 }, { "advantage_max": 1.4794694632291794, "advantage_mean": -3.725290742551124e-09, "advantage_min": -0.5603340975940228, "advantage_std": 0.754909448325634, "completion_length": 2463.041748046875, "epoch": 0.3291428571428571, "grad_norm": 1.0325500965118408, "kl": 0.26513671875, "lambda_div_used": 0.5, "learning_rate": 5.09215338910999e-07, "loss": -0.0066, "reward": 0.044862196780741215, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.044862196780741215, "reward_after_std": 0.7549094632267952, "reward_before_mean": 0.6544077503494918, "reward_before_std": 0.5984827503561974, "reward_change_max": 0.0005664229393005371, "reward_change_mean": -0.6095455586910248, "reward_change_min": -0.9598565027117729, "reward_change_std": 0.36767209880053997, "reward_std": 0.7549095004796982, "rewards/cosine_scaled_reward": -0.037379464134573936, "rewards/format_reward": 0.7291666846722364, "step": 288 }, { "advantage_max": 1.2859587520360947, "advantage_mean": -1.1175871061919196e-08, "advantage_min": -0.48796913772821426, "advantage_std": 0.6554525531828403, "completion_length": 1999.9792137145996, "epoch": 0.3302857142857143, "grad_norm": 0.9590998291969299, "kl": 0.24828338623046875, "lambda_div_used": 0.5, "learning_rate": 5.060876951083828e-07, "loss": 0.0059, "reward": 0.0213075689971447, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.0213075689971447, "reward_after_std": 0.6554525531828403, "reward_before_mean": 0.6397494054399431, "reward_before_std": 0.4752440471202135, "reward_change_max": 0.0, "reward_change_mean": -0.6184418424963951, "reward_change_min": -0.952071838080883, "reward_change_std": 0.35505982115864754, "reward_std": 0.6554525941610336, "rewards/cosine_scaled_reward": -0.09679198311641812, "rewards/format_reward": 0.8333333395421505, "step": 289 }, { "advantage_max": 1.5296735242009163, "advantage_mean": 0.0, "advantage_min": -0.6690849587321281, "advantage_std": 0.7993335947394371, "completion_length": 2463.6459045410156, "epoch": 0.3314285714285714, "grad_norm": 0.539526641368866, "kl": 0.4915771484375, "lambda_div_used": 0.5, "learning_rate": 5.02962191529556e-07, "loss": 0.0596, "reward": -0.0028451760299503803, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.0028451760299503803, "reward_after_std": 0.7993335947394371, "reward_before_mean": 0.5566588691435754, "reward_before_std": 0.7177799604833126, "reward_change_max": 0.0003264695405960083, "reward_change_mean": -0.5595040284097195, "reward_change_min": -0.9658835083246231, "reward_change_std": 0.3854859843850136, "reward_std": 0.7993336021900177, "rewards/cosine_scaled_reward": -0.12792058615013957, "rewards/format_reward": 0.8125000149011612, "step": 290 }, { "advantage_max": 1.5921382904052734, "advantage_mean": -4.346172144398253e-09, "advantage_min": -0.7556582726538181, "advantage_std": 0.8591411933302879, "completion_length": 2490.354232788086, "epoch": 0.3325714285714286, "grad_norm": 0.6908680200576782, "kl": 0.317626953125, "lambda_div_used": 0.5, "learning_rate": 4.998389805071536e-07, "loss": 0.0146, "reward": 0.07940021844115108, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.07940021844115108, "reward_after_std": 0.8591412007808685, "reward_before_mean": 0.6981133483350277, "reward_before_std": 0.8550258204340935, "reward_change_max": 0.0007056146860122681, "reward_change_mean": -0.6187131479382515, "reward_change_min": -1.133220985531807, "reward_change_std": 0.4543332364410162, "reward_std": 0.8591412231326103, "rewards/cosine_scaled_reward": -0.046776650473475456, "rewards/format_reward": 0.7916666828095913, "step": 291 }, { "advantage_max": 1.5652444809675217, "advantage_mean": 4.346172199909404e-09, "advantage_min": -0.5440915711224079, "advantage_std": 0.7995546236634254, "completion_length": 2738.916732788086, "epoch": 0.33371428571428574, "grad_norm": 0.6306151151657104, "kl": 0.31866455078125, "lambda_div_used": 0.5, "learning_rate": 4.967182142620745e-07, "loss": 0.0131, "reward": -0.05529059190303087, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.05529059190303087, "reward_after_std": 0.799554631114006, "reward_before_mean": 0.45519457198679447, "reward_before_std": 0.6706855967640877, "reward_change_max": 0.0017582103610038757, "reward_change_mean": -0.5104851890355349, "reward_change_min": -0.9267382770776749, "reward_change_std": 0.3522724714130163, "reward_std": 0.799554668366909, "rewards/cosine_scaled_reward": -0.10573606146499515, "rewards/format_reward": 0.6666666772216558, "step": 292 }, { "advantage_max": 1.276260830461979, "advantage_mean": 6.829698917520943e-09, "advantage_min": -0.570405226200819, "advantage_std": 0.6685235388576984, "completion_length": 2273.875030517578, "epoch": 0.33485714285714285, "grad_norm": 0.9914068579673767, "kl": 0.29083251953125, "lambda_div_used": 0.5, "learning_rate": 4.93600044896063e-07, "loss": 0.0091, "reward": 0.055066212080419064, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.055066212080419064, "reward_after_std": 0.6685235537588596, "reward_before_mean": 0.7027740270714276, "reward_before_std": 0.5311368498951197, "reward_change_max": 0.001424834132194519, "reward_change_mean": -0.6477077975869179, "reward_change_min": -1.0224635303020477, "reward_change_std": 0.40102437511086464, "reward_std": 0.6685235574841499, "rewards/cosine_scaled_reward": -0.08611301146447659, "rewards/format_reward": 0.8750000074505806, "step": 293 }, { "advantage_max": 1.177469477057457, "advantage_mean": -3.1044085080367267e-09, "advantage_min": -0.5789675116539001, "advantage_std": 0.6299779340624809, "completion_length": 3101.604278564453, "epoch": 0.336, "grad_norm": 1.2353851795196533, "kl": 0.46026611328125, "lambda_div_used": 0.5, "learning_rate": 4.904846243842949e-07, "loss": 0.0339, "reward": -0.12046112306416035, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.12046112306416035, "reward_after_std": 0.6299779377877712, "reward_before_mean": 0.40089414454996586, "reward_before_std": 0.588392173871398, "reward_change_max": 0.000742591917514801, "reward_change_mean": -0.5213552713394165, "reward_change_min": -0.9144763983786106, "reward_change_std": 0.35968529619276524, "reward_std": 0.6299779377877712, "rewards/cosine_scaled_reward": -0.08080292865633965, "rewards/format_reward": 0.5625000111758709, "step": 294 }, { "advantage_max": 1.599842369556427, "advantage_mean": 1.2417633588057697e-09, "advantage_min": -0.678341705352068, "advantage_std": 0.8257317095994949, "completion_length": 2566.4583587646484, "epoch": 0.33714285714285713, "grad_norm": 1.0961662530899048, "kl": 0.3182373046875, "lambda_div_used": 0.5, "learning_rate": 4.873721045679706e-07, "loss": -0.0227, "reward": 0.08157170051708817, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.08157170051708817, "reward_after_std": 0.8257317095994949, "reward_before_mean": 0.7002139764372259, "reward_before_std": 0.697816614061594, "reward_change_max": 0.000387534499168396, "reward_change_mean": -0.6186422556638718, "reward_change_min": -1.0723202005028725, "reward_change_std": 0.3978155329823494, "reward_std": 0.8257317095994949, "rewards/cosine_scaled_reward": 0.05844030901789665, "rewards/format_reward": 0.5833333395421505, "step": 295 }, { "advantage_max": 1.5283400043845177, "advantage_mean": -2.1730860721991263e-09, "advantage_min": -0.7080317139625549, "advantage_std": 0.8136551566421986, "completion_length": 3081.4166870117188, "epoch": 0.3382857142857143, "grad_norm": 0.6585099697113037, "kl": 0.266845703125, "lambda_div_used": 0.5, "learning_rate": 4.842626371469149e-07, "loss": 0.0465, "reward": -0.008985697524622083, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.008985697524622083, "reward_after_std": 0.81365517526865, "reward_before_mean": 0.5441303681582212, "reward_before_std": 0.784828394651413, "reward_change_max": 0.000543445348739624, "reward_change_mean": -0.5531160607933998, "reward_change_min": -1.0541326105594635, "reward_change_std": 0.4212801605463028, "reward_std": 0.8136552199721336, "rewards/cosine_scaled_reward": -0.07168482430279255, "rewards/format_reward": 0.6875000111758709, "step": 296 }, { "advantage_max": 1.088131882250309, "advantage_mean": 1.4901161637936866e-08, "advantage_min": -0.4907407984137535, "advantage_std": 0.5779940336942673, "completion_length": 3145.0625610351562, "epoch": 0.3394285714285714, "grad_norm": 0.2617074251174927, "kl": 0.2529296875, "lambda_div_used": 0.5, "learning_rate": 4.811563736721829e-07, "loss": 0.0091, "reward": -0.3666674308478832, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.3666674308478832, "reward_after_std": 0.5779940262436867, "reward_before_mean": -0.03471547598019242, "reward_before_std": 0.5734328739345074, "reward_change_max": 0.0015069320797920227, "reward_change_mean": -0.33195194229483604, "reward_change_min": -0.6547215916216373, "reward_change_std": 0.2700279410928488, "reward_std": 0.5779940336942673, "rewards/cosine_scaled_reward": -0.1527744084596634, "rewards/format_reward": 0.27083333767950535, "step": 297 }, { "advantage_max": 1.5531343445181847, "advantage_mean": 7.450580929990736e-09, "advantage_min": -0.6577443964779377, "advantage_std": 0.8128904439508915, "completion_length": 2198.916748046875, "epoch": 0.3405714285714286, "grad_norm": 0.2890186011791229, "kl": 0.135406494140625, "lambda_div_used": 0.5, "learning_rate": 4.780534655386743e-07, "loss": 0.0164, "reward": 0.01897238101810217, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.01897238101810217, "reward_after_std": 0.8128904215991497, "reward_before_mean": 0.5919715128839016, "reward_before_std": 0.7551921270787716, "reward_change_max": 0.005190655589103699, "reward_change_mean": -0.5729991532862186, "reward_change_min": -1.0509915724396706, "reward_change_std": 0.4005443025380373, "reward_std": 0.8128904551267624, "rewards/cosine_scaled_reward": -0.05818091053515673, "rewards/format_reward": 0.7083333525806665, "step": 298 }, { "advantage_max": 1.967075452208519, "advantage_mean": 1.1175871117430347e-08, "advantage_min": -0.8121235743165016, "advantage_std": 1.0208548679947853, "completion_length": 2885.604217529297, "epoch": 0.3417142857142857, "grad_norm": 0.44406768679618835, "kl": 0.1641845703125, "lambda_div_used": 0.5, "learning_rate": 4.749540639777539e-07, "loss": -0.0042, "reward": 0.03758762776851654, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.03758762776851654, "reward_after_std": 1.0208548456430435, "reward_before_mean": 0.5578274028375745, "reward_before_std": 0.9705353602766991, "reward_change_max": 0.0003610551357269287, "reward_change_mean": -0.5202397517859936, "reward_change_min": -1.0511676035821438, "reward_change_std": 0.4258074313402176, "reward_std": 1.0208548977971077, "rewards/cosine_scaled_reward": 0.02891370188444853, "rewards/format_reward": 0.5000000167638063, "step": 299 }, { "advantage_max": 1.2897720709443092, "advantage_mean": -3.7252904094842165e-09, "advantage_min": -0.5727378912270069, "advantage_std": 0.678062092512846, "completion_length": 2925.1875610351562, "epoch": 0.34285714285714286, "grad_norm": 0.2789536714553833, "kl": 0.170654296875, "lambda_div_used": 0.5, "learning_rate": 4.7185832004988133e-07, "loss": 0.0069, "reward": -0.25556246004998684, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.25556246004998684, "reward_after_std": 0.6780621074140072, "reward_before_mean": 0.1323627531528473, "reward_before_std": 0.6543583087623119, "reward_change_max": 0.0016498491168022156, "reward_change_mean": -0.3879252327606082, "reward_change_min": -0.8045506216585636, "reward_change_std": 0.31338599789887667, "reward_std": 0.6780621185898781, "rewards/cosine_scaled_reward": -0.18381863087415695, "rewards/format_reward": 0.5000000074505806, "step": 300 }, { "advantage_max": 1.3899911418557167, "advantage_mean": 2.483526884144993e-09, "advantage_min": -0.5897494703531265, "advantage_std": 0.742620075121522, "completion_length": 2620.125030517578, "epoch": 0.344, "grad_norm": 0.49501293897628784, "kl": 0.1964111328125, "lambda_div_used": 0.5, "learning_rate": 4.68766384637248e-07, "loss": 0.0368, "reward": -0.05963777285069227, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.05963777285069227, "reward_after_std": 0.7426200993359089, "reward_before_mean": 0.4776543521657004, "reward_before_std": 0.7033826969563961, "reward_change_max": 0.0021210387349128723, "reward_change_mean": -0.5372921079397202, "reward_change_min": -1.0695495195686817, "reward_change_std": 0.414053525775671, "reward_std": 0.7426201142370701, "rewards/cosine_scaled_reward": -0.04242282547056675, "rewards/format_reward": 0.5625000037252903, "step": 301 }, { "advantage_max": 1.5067075043916702, "advantage_mean": 1.1175871339474952e-08, "advantage_min": -0.6437755972146988, "advantage_std": 0.7938609048724174, "completion_length": 2604.479248046875, "epoch": 0.34514285714285714, "grad_norm": 0.6377979516983032, "kl": 0.15997314453125, "lambda_div_used": 0.5, "learning_rate": 4.656784084364238e-07, "loss": -0.0173, "reward": -0.1051476038992405, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.1051476038992405, "reward_after_std": 0.7938608899712563, "reward_before_mean": 0.3738048989325762, "reward_before_std": 0.768853772431612, "reward_change_max": 0.002476900815963745, "reward_change_mean": -0.4789524972438812, "reward_change_min": -0.9946075230836868, "reward_change_std": 0.3719355911016464, "reward_std": 0.7938609048724174, "rewards/cosine_scaled_reward": -0.08393089659512043, "rewards/format_reward": 0.5416666753590107, "step": 302 }, { "advantage_max": 1.6140480786561966, "advantage_mean": -1.924733383784627e-08, "advantage_min": -0.6462118700146675, "advantage_std": 0.8338347300887108, "completion_length": 2591.8542098999023, "epoch": 0.3462857142857143, "grad_norm": 0.371255487203598, "kl": 0.1746826171875, "lambda_div_used": 0.5, "learning_rate": 4.6259454195101267e-07, "loss": 0.0434, "reward": -0.06317769235465676, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.06317769235465676, "reward_after_std": 0.8338347561657429, "reward_before_mean": 0.42486920207738876, "reward_before_std": 0.7611415684223175, "reward_change_max": 0.0010538250207901, "reward_change_mean": -0.4880469013005495, "reward_change_min": -0.8825994059443474, "reward_change_std": 0.3486558496952057, "reward_std": 0.8338347896933556, "rewards/cosine_scaled_reward": -0.12089874129742384, "rewards/format_reward": 0.6666666734963655, "step": 303 }, { "advantage_max": 1.1657679006457329, "advantage_mean": 5.587935225648266e-09, "advantage_min": -0.6645041145384312, "advantage_std": 0.6425657123327255, "completion_length": 2899.854232788086, "epoch": 0.3474285714285714, "grad_norm": 0.6334242820739746, "kl": 0.21673583984375, "lambda_div_used": 0.5, "learning_rate": 4.59514935484316e-07, "loss": 0.0462, "reward": -0.1580784060060978, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.1580784060060978, "reward_after_std": 0.6425657048821449, "reward_before_mean": 0.3347341902554035, "reward_before_std": 0.6689680777490139, "reward_change_max": 0.0011926591396331787, "reward_change_mean": -0.4928125822916627, "reward_change_min": -0.8947502039372921, "reward_change_std": 0.3764376938343048, "reward_std": 0.6425657123327255, "rewards/cosine_scaled_reward": -0.12429956905543804, "rewards/format_reward": 0.5833333525806665, "step": 304 }, { "advantage_max": 1.5461205169558525, "advantage_mean": 1.4280279847511679e-08, "advantage_min": -0.6990028731524944, "advantage_std": 0.8203661367297173, "completion_length": 3038.5625762939453, "epoch": 0.3485714285714286, "grad_norm": 0.5466167330741882, "kl": 0.18798828125, "lambda_div_used": 0.5, "learning_rate": 4.5643973913200837e-07, "loss": 0.0264, "reward": -0.13418531976640224, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.13418531976640224, "reward_after_std": 0.820366133004427, "reward_before_mean": 0.31502748280763626, "reward_before_std": 0.8251273334026337, "reward_change_max": 0.0004555061459541321, "reward_change_mean": -0.44921278581023216, "reward_change_min": -0.9437504261732101, "reward_change_std": 0.3788600452244282, "reward_std": 0.8203661553561687, "rewards/cosine_scaled_reward": -0.10290294280275702, "rewards/format_reward": 0.5208333469927311, "step": 305 }, { "advantage_max": 1.7352852076292038, "advantage_mean": -1.8626450382086546e-09, "advantage_min": -0.6884033642709255, "advantage_std": 0.8893226571381092, "completion_length": 2894.916717529297, "epoch": 0.3497142857142857, "grad_norm": 0.5606780052185059, "kl": 0.16204833984375, "lambda_div_used": 0.5, "learning_rate": 4.5336910277482155e-07, "loss": 0.0152, "reward": 0.17971285339444876, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.17971285339444876, "reward_after_std": 0.8893226645886898, "reward_before_mean": 0.8592891084699659, "reward_before_std": 0.7582590952515602, "reward_change_max": 0.0, "reward_change_mean": -0.6795762628316879, "reward_change_min": -1.081557810306549, "reward_change_std": 0.4216056726872921, "reward_std": 0.8893226906657219, "rewards/cosine_scaled_reward": 0.08589455112814903, "rewards/format_reward": 0.6875000074505806, "step": 306 }, { "advantage_max": 1.7301098704338074, "advantage_mean": 1.2417633588057697e-09, "advantage_min": -0.7147096432745457, "advantage_std": 0.8947756588459015, "completion_length": 2657.187530517578, "epoch": 0.35085714285714287, "grad_norm": 0.34765297174453735, "kl": 0.2095947265625, "lambda_div_used": 0.5, "learning_rate": 4.503031760712397e-07, "loss": 0.0495, "reward": 0.019154822453856468, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.019154822453856468, "reward_after_std": 0.8947756588459015, "reward_before_mean": 0.5600891445064917, "reward_before_std": 0.8151229023933411, "reward_change_max": 0.0, "reward_change_mean": -0.5409342758357525, "reward_change_min": -1.0163558684289455, "reward_change_std": 0.38923518545925617, "reward_std": 0.8947756960988045, "rewards/cosine_scaled_reward": -0.022038788767531514, "rewards/format_reward": 0.6041666679084301, "step": 307 }, { "advantage_max": 1.384334035217762, "advantage_mean": 4.9670538238011375e-09, "advantage_min": -0.6601945385336876, "advantage_std": 0.7446734458208084, "completion_length": 3214.0834045410156, "epoch": 0.352, "grad_norm": 0.5936756134033203, "kl": 0.29541015625, "lambda_div_used": 0.5, "learning_rate": 4.4724210845020494e-07, "loss": 0.0293, "reward": -0.14771428517997265, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.14771428517997265, "reward_after_std": 0.7446734383702278, "reward_before_mean": 0.31862270552664995, "reward_before_std": 0.7598531804978848, "reward_change_max": 0.0, "reward_change_mean": -0.46633700653910637, "reward_change_min": -0.9204640761017799, "reward_change_std": 0.36674703285098076, "reward_std": 0.744673453271389, "rewards/cosine_scaled_reward": -0.14277198538184166, "rewards/format_reward": 0.6041666809469461, "step": 308 }, { "advantage_max": 1.5429324805736542, "advantage_mean": -2.483527050678447e-09, "advantage_min": -0.635457769036293, "advantage_std": 0.808469258248806, "completion_length": 2883.6875610351562, "epoch": 0.35314285714285715, "grad_norm": 0.34315183758735657, "kl": 0.21173095703125, "lambda_div_used": 0.5, "learning_rate": 4.441860491038345e-07, "loss": 0.0044, "reward": 0.039851417765021324, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.039851417765021324, "reward_after_std": 0.8084692545235157, "reward_before_mean": 0.6314571984112263, "reward_before_std": 0.7366010136902332, "reward_change_max": 0.0005460754036903381, "reward_change_mean": -0.5916057825088501, "reward_change_min": -1.1304726675152779, "reward_change_std": 0.40859665535390377, "reward_std": 0.8084692656993866, "rewards/cosine_scaled_reward": -0.05927141313441098, "rewards/format_reward": 0.7500000111758709, "step": 309 }, { "advantage_max": 1.740215465426445, "advantage_mean": -3.7252904094842165e-09, "advantage_min": -0.6650184690952301, "advantage_std": 0.8944349363446236, "completion_length": 2405.479217529297, "epoch": 0.35428571428571426, "grad_norm": 0.8915708661079407, "kl": 0.242462158203125, "lambda_div_used": 0.5, "learning_rate": 4.4113514698014953e-07, "loss": 0.0609, "reward": -0.012727348133921623, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.012727348133921623, "reward_after_std": 0.8944349437952042, "reward_before_mean": 0.4980372078716755, "reward_before_std": 0.8056224025785923, "reward_change_max": 0.0, "reward_change_mean": -0.5107645392417908, "reward_change_min": -0.9406066909432411, "reward_change_std": 0.35088925808668137, "reward_std": 0.894434966146946, "rewards/cosine_scaled_reward": -0.11556475143879652, "rewards/format_reward": 0.7291666734963655, "step": 310 }, { "advantage_max": 1.7507240772247314, "advantage_mean": -1.8626452047421083e-08, "advantage_min": -0.8782271668314934, "advantage_std": 0.9409815222024918, "completion_length": 2372.916717529297, "epoch": 0.3554285714285714, "grad_norm": 0.933577835559845, "kl": 0.16912841796875, "lambda_div_used": 0.5, "learning_rate": 4.3808955077581546e-07, "loss": 0.017, "reward": 0.17776218801736832, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.17776218801736832, "reward_after_std": 0.9409815445542336, "reward_before_mean": 0.8487966532702558, "reward_before_std": 0.9439231902360916, "reward_change_max": 0.0004132986068725586, "reward_change_mean": -0.6710344441235065, "reward_change_min": -1.344845950603485, "reward_change_std": 0.5198050625622272, "reward_std": 0.9409816116094589, "rewards/cosine_scaled_reward": 0.08064829930663109, "rewards/format_reward": 0.6875000186264515, "step": 311 }, { "advantage_max": 1.6873877942562103, "advantage_mean": -3.725290076417309e-09, "advantage_min": -0.6372330226004124, "advantage_std": 0.8659485727548599, "completion_length": 2212.4792098999023, "epoch": 0.3565714285714286, "grad_norm": 0.9566351771354675, "kl": 0.2457275390625, "lambda_div_used": 0.5, "learning_rate": 4.350494089288943e-07, "loss": -0.0267, "reward": 0.2527286000549793, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.2527286000549793, "reward_after_std": 0.8659485355019569, "reward_before_mean": 0.9952520411461592, "reward_before_std": 0.6744127124547958, "reward_change_max": 0.0006193891167640686, "reward_change_mean": -0.7425234168767929, "reward_change_min": -1.1924034729599953, "reward_change_std": 0.4576085638254881, "reward_std": 0.8659485578536987, "rewards/cosine_scaled_reward": 0.18512600846588612, "rewards/format_reward": 0.6250000018626451, "step": 312 }, { "advantage_max": 1.463294543325901, "advantage_mean": -1.4901161526914564e-08, "advantage_min": -0.5900123342871666, "advantage_std": 0.7641986832022667, "completion_length": 2953.7083587646484, "epoch": 0.3577142857142857, "grad_norm": 89.38374328613281, "kl": 2.8055419921875, "lambda_div_used": 0.5, "learning_rate": 4.3201486961161093e-07, "loss": 0.0373, "reward": 0.10185758583247662, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.10185758583247662, "reward_after_std": 0.7641987055540085, "reward_before_mean": 0.7619487410411239, "reward_before_std": 0.6133890030905604, "reward_change_max": 0.0006252899765968323, "reward_change_mean": -0.6600911617279053, "reward_change_min": -1.149497613310814, "reward_change_std": 0.4455935023725033, "reward_std": 0.7641987279057503, "rewards/cosine_scaled_reward": 0.05805770156439394, "rewards/format_reward": 0.645833345130086, "step": 313 }, { "advantage_max": 1.4923174902796745, "advantage_mean": -4.34617203337595e-09, "advantage_min": -0.5729423686861992, "advantage_std": 0.7636104431003332, "completion_length": 2375.666717529297, "epoch": 0.3588571428571429, "grad_norm": 0.924056351184845, "kl": 0.2772216796875, "lambda_div_used": 0.5, "learning_rate": 4.2898608072313045e-07, "loss": 0.0017, "reward": 0.1338311405852437, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.1338311405852437, "reward_after_std": 0.7636104375123978, "reward_before_mean": 0.8161500915884972, "reward_before_std": 0.5966301336884499, "reward_change_max": 0.0, "reward_change_mean": -0.682318925857544, "reward_change_min": -1.1118975020945072, "reward_change_std": 0.397566681727767, "reward_std": 0.7636104635894299, "rewards/cosine_scaled_reward": 0.022658362751826644, "rewards/format_reward": 0.7708333358168602, "step": 314 }, { "advantage_max": 1.401157207787037, "advantage_mean": 1.3659398390153399e-08, "advantage_min": -0.5785900503396988, "advantage_std": 0.7340905368328094, "completion_length": 2862.1666870117188, "epoch": 0.36, "grad_norm": 0.5284225940704346, "kl": 0.3608856201171875, "lambda_div_used": 0.5, "learning_rate": 4.2596318988235037e-07, "loss": 0.0091, "reward": -0.15376823209226131, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.15376823209226131, "reward_after_std": 0.7340905666351318, "reward_before_mean": 0.2963081202469766, "reward_before_std": 0.6793590821325779, "reward_change_max": 0.001107342541217804, "reward_change_mean": -0.4500763714313507, "reward_change_min": -0.8281183242797852, "reward_change_std": 0.3382910368964076, "reward_std": 0.7340905852615833, "rewards/cosine_scaled_reward": -0.09142925776541233, "rewards/format_reward": 0.47916667722165585, "step": 315 }, { "advantage_max": 1.1306168586015701, "advantage_mean": 1.1796752796833232e-08, "advantage_min": -0.48234958946704865, "advantage_std": 0.5896905846893787, "completion_length": 3272.604217529297, "epoch": 0.36114285714285715, "grad_norm": 0.6392146348953247, "kl": 0.358642578125, "lambda_div_used": 0.5, "learning_rate": 4.2294634442070553e-07, "loss": 0.0484, "reward": -0.3114443914964795, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.3114443914964795, "reward_after_std": 0.589690588414669, "reward_before_mean": 0.05901812016963959, "reward_before_std": 0.5436771884560585, "reward_change_max": 0.0008131638169288635, "reward_change_mean": -0.37046249210834503, "reward_change_min": -0.6907815337181091, "reward_change_std": 0.2691768379881978, "reward_std": 0.5896905958652496, "rewards/cosine_scaled_reward": -0.23090762086212635, "rewards/format_reward": 0.5208333414047956, "step": 316 }, { "advantage_max": 1.5040301159024239, "advantage_mean": -1.241763458725842e-08, "advantage_min": -0.5454112328588963, "advantage_std": 0.7728907950222492, "completion_length": 2862.3959197998047, "epoch": 0.36228571428571427, "grad_norm": 0.7288563251495361, "kl": 0.33740234375, "lambda_div_used": 0.5, "learning_rate": 4.1993569137498776e-07, "loss": 0.0474, "reward": 0.04084273800253868, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.04084273800253868, "reward_after_std": 0.7728907912969589, "reward_before_mean": 0.6436011524274363, "reward_before_std": 0.6424034424126148, "reward_change_max": 0.0, "reward_change_mean": -0.6027584280818701, "reward_change_min": -1.0649547278881073, "reward_change_std": 0.381022609770298, "reward_std": 0.7728908061981201, "rewards/cosine_scaled_reward": -0.03236610069870949, "rewards/format_reward": 0.7083333488553762, "step": 317 }, { "advantage_max": 1.589605301618576, "advantage_mean": 1.2728075593493315e-08, "advantage_min": -0.6930773742496967, "advantage_std": 0.8379422500729561, "completion_length": 2228.4375610351562, "epoch": 0.36342857142857143, "grad_norm": 0.26594236493110657, "kl": 0.27484130859375, "lambda_div_used": 0.5, "learning_rate": 4.1693137748017915e-07, "loss": 0.0243, "reward": 0.030790013261139393, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.030790013261139393, "reward_after_std": 0.8379422500729561, "reward_before_mean": 0.6122659891843796, "reward_before_std": 0.7788247428834438, "reward_change_max": 0.0004334002733230591, "reward_change_mean": -0.5814759768545628, "reward_change_min": -1.061459630727768, "reward_change_std": 0.4175482243299484, "reward_std": 0.8379422500729561, "rewards/cosine_scaled_reward": -0.13136701984331012, "rewards/format_reward": 0.8750000149011612, "step": 318 }, { "advantage_max": 1.1976237669587135, "advantage_mean": 6.674478525425798e-09, "advantage_min": -0.5388987213373184, "advantage_std": 0.6269382648169994, "completion_length": 3055.416702270508, "epoch": 0.36457142857142855, "grad_norm": 0.41507601737976074, "kl": 0.32904052734375, "lambda_div_used": 0.5, "learning_rate": 4.1393354916230005e-07, "loss": 0.0276, "reward": -0.17632382595911622, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.17632382595911622, "reward_after_std": 0.6269382573664188, "reward_before_mean": 0.29730080626904964, "reward_before_std": 0.5578728318214417, "reward_change_max": 0.0006338581442832947, "reward_change_mean": -0.4736246280372143, "reward_change_min": -0.7681119628250599, "reward_change_std": 0.3120098374783993, "reward_std": 0.6269382685422897, "rewards/cosine_scaled_reward": -0.1638496033847332, "rewards/format_reward": 0.6250000167638063, "step": 319 }, { "advantage_max": 1.8235188126564026, "advantage_mean": 6.208817349140361e-09, "advantage_min": -0.7356794327497482, "advantage_std": 0.9437028914690018, "completion_length": 2396.0626220703125, "epoch": 0.3657142857142857, "grad_norm": 0.5142975449562073, "kl": 0.2520751953125, "lambda_div_used": 0.5, "learning_rate": 4.1094235253127374e-07, "loss": 0.0213, "reward": 0.07883075065910816, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.07883075065910816, "reward_after_std": 0.9437028840184212, "reward_before_mean": 0.6556731648743153, "reward_before_std": 0.8697749823331833, "reward_change_max": 0.0, "reward_change_mean": -0.5768423937261105, "reward_change_min": -1.047103874385357, "reward_change_std": 0.40687838755548, "reward_std": 0.9437028951942921, "rewards/cosine_scaled_reward": -0.08883010782301426, "rewards/format_reward": 0.8333333395421505, "step": 320 }, { "advantage_max": 1.9276015385985374, "advantage_mean": -1.3038516599728212e-08, "advantage_min": -0.7980079278349876, "advantage_std": 0.99143286049366, "completion_length": 2184.812545776367, "epoch": 0.3668571428571429, "grad_norm": 0.793869137763977, "kl": 0.189727783203125, "lambda_div_used": 0.5, "learning_rate": 4.079579333738039e-07, "loss": 0.0287, "reward": 0.42022357787936926, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.42022357787936926, "reward_after_std": 0.9914328753948212, "reward_before_mean": 1.2628098390996456, "reward_before_std": 0.8046529665589333, "reward_change_max": 0.001957610249519348, "reward_change_mean": -0.8425862416625023, "reward_change_min": -1.369489625096321, "reward_change_std": 0.5107485167682171, "reward_std": 0.9914328753948212, "rewards/cosine_scaled_reward": 0.18348823045380414, "rewards/format_reward": 0.8958333507180214, "step": 321 }, { "advantage_max": 1.6895935460925102, "advantage_mean": -5.587935503204022e-09, "advantage_min": -0.6205887608230114, "advantage_std": 0.8657153844833374, "completion_length": 2706.1250915527344, "epoch": 0.368, "grad_norm": 0.3828519582748413, "kl": 0.29534912109375, "lambda_div_used": 0.5, "learning_rate": 4.0498043714627006e-07, "loss": 0.0165, "reward": -0.05292603746056557, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.05292603746056557, "reward_after_std": 0.8657153844833374, "reward_before_mean": 0.4366830997169018, "reward_before_std": 0.773483332246542, "reward_change_max": 0.0, "reward_change_mean": -0.48960915207862854, "reward_change_min": -0.926708921790123, "reward_change_std": 0.3389029707759619, "reward_std": 0.8657154068350792, "rewards/cosine_scaled_reward": -0.11499179247766733, "rewards/format_reward": 0.6666666902601719, "step": 322 }, { "advantage_max": 1.400377780199051, "advantage_mean": -2.4835271617007493e-09, "advantage_min": -0.6002854071557522, "advantage_std": 0.7208580374717712, "completion_length": 2751.1458892822266, "epoch": 0.36914285714285716, "grad_norm": 0.3763584792613983, "kl": 0.2479248046875, "lambda_div_used": 0.5, "learning_rate": 4.020100089676376e-07, "loss": 0.0295, "reward": -0.013417241163551807, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.013417241163551807, "reward_after_std": 0.7208580300211906, "reward_before_mean": 0.5566643313504755, "reward_before_std": 0.5935582704842091, "reward_change_max": 0.0009166598320007324, "reward_change_mean": -0.5700815692543983, "reward_change_min": -0.9349125400185585, "reward_change_std": 0.3692823648452759, "reward_std": 0.7208580449223518, "rewards/cosine_scaled_reward": -0.09666784037835896, "rewards/format_reward": 0.7500000204890966, "step": 323 }, { "advantage_max": 1.7073202952742577, "advantage_mean": 1.2417634476236117e-08, "advantage_min": -0.6523040719330311, "advantage_std": 0.8818818256258965, "completion_length": 3288.0000915527344, "epoch": 0.3702857142857143, "grad_norm": 0.9621612429618835, "kl": 0.3841552734375, "lambda_div_used": 0.5, "learning_rate": 3.9904679361238526e-07, "loss": 0.07, "reward": -0.23006662633270025, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.23006662633270025, "reward_after_std": 0.881881844252348, "reward_before_mean": 0.11532109789550304, "reward_before_std": 0.8703635148704052, "reward_change_max": 0.0007134005427360535, "reward_change_mean": -0.3453877214342356, "reward_change_min": -0.7478644847869873, "reward_change_std": 0.2995779123157263, "reward_std": 0.881881844252348, "rewards/cosine_scaled_reward": -0.20275612798286602, "rewards/format_reward": 0.5208333488553762, "step": 324 }, { "advantage_max": 1.7375487461686134, "advantage_mean": 1.1796752963366686e-08, "advantage_min": -0.6599301993846893, "advantage_std": 0.9019149504601955, "completion_length": 2948.1250915527344, "epoch": 0.37142857142857144, "grad_norm": 0.44344088435173035, "kl": 0.3438720703125, "lambda_div_used": 0.5, "learning_rate": 3.9609093550344907e-07, "loss": 0.0385, "reward": -0.017678143922239542, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.017678143922239542, "reward_after_std": 0.9019149504601955, "reward_before_mean": 0.4983112784102559, "reward_before_std": 0.8421217501163483, "reward_change_max": 0.0005606338381767273, "reward_change_mean": -0.5159894041717052, "reward_change_min": -1.0590049587190151, "reward_change_std": 0.38723311573266983, "reward_std": 0.9019149765372276, "rewards/cosine_scaled_reward": -0.0945943733677268, "rewards/format_reward": 0.6875000074505806, "step": 325 }, { "advantage_max": 1.5786890238523483, "advantage_mean": 2.2972624136308184e-08, "advantage_min": -0.6869267821311951, "advantage_std": 0.8400480523705482, "completion_length": 2644.041748046875, "epoch": 0.37257142857142855, "grad_norm": 0.5172160863876343, "kl": 0.26190185546875, "lambda_div_used": 0.5, "learning_rate": 3.931425787051832e-07, "loss": 0.0359, "reward": 0.14027714263647795, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.14027714263647795, "reward_after_std": 0.8400480523705482, "reward_before_mean": 0.813402040861547, "reward_before_std": 0.748632000759244, "reward_change_max": 0.0004016384482383728, "reward_change_mean": -0.6731249168515205, "reward_change_min": -1.1637303456664085, "reward_change_std": 0.4690163619816303, "reward_std": 0.8400481045246124, "rewards/cosine_scaled_reward": -0.009965650620870292, "rewards/format_reward": 0.8333333432674408, "step": 326 }, { "advantage_max": 2.1002472937107086, "advantage_mean": -1.8626451825376478e-08, "advantage_min": -0.9241488240659237, "advantage_std": 1.1009439006447792, "completion_length": 2588.14591217041, "epoch": 0.3737142857142857, "grad_norm": 0.4935661256313324, "kl": 0.310272216796875, "lambda_div_used": 0.5, "learning_rate": 3.902018669163384e-07, "loss": 0.0476, "reward": 0.3117635138332844, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.3117635138332844, "reward_after_std": 1.100943885743618, "reward_before_mean": 1.0368181890808046, "reward_before_std": 1.0324792116880417, "reward_change_max": 0.0, "reward_change_mean": -0.7250546813011169, "reward_change_min": -1.3601718544960022, "reward_change_std": 0.5242071263492107, "reward_std": 1.1009439453482628, "rewards/cosine_scaled_reward": 0.09132576221600175, "rewards/format_reward": 0.8541666939854622, "step": 327 }, { "advantage_max": 1.6814225018024445, "advantage_mean": 7.450580929990736e-09, "advantage_min": -0.6006612703204155, "advantage_std": 0.8554463051259518, "completion_length": 3272.916748046875, "epoch": 0.37485714285714283, "grad_norm": 0.5845516920089722, "kl": 0.413330078125, "lambda_div_used": 0.5, "learning_rate": 3.872689434630585e-07, "loss": 0.0414, "reward": -0.14344895351678133, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.14344895351678133, "reward_after_std": 0.8554462976753712, "reward_before_mean": 0.27615911699831486, "reward_before_std": 0.7600311264395714, "reward_change_max": 0.0007801279425621033, "reward_change_mean": -0.41960807144641876, "reward_change_min": -0.7604356594383717, "reward_change_std": 0.3008067738264799, "reward_std": 0.8554463237524033, "rewards/cosine_scaled_reward": -0.12233711747103371, "rewards/format_reward": 0.5208333488553762, "step": 328 }, { "advantage_max": 1.6130209863185883, "advantage_mean": -6.829699028543246e-09, "advantage_min": -0.7729388028383255, "advantage_std": 0.8566270098090172, "completion_length": 2137.8750762939453, "epoch": 0.376, "grad_norm": 1.0926064252853394, "kl": 0.299530029296875, "lambda_div_used": 0.5, "learning_rate": 3.843439512918949e-07, "loss": -0.002, "reward": 0.22257465310394764, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.22257465310394764, "reward_after_std": 0.8566270098090172, "reward_before_mean": 0.9557426117826253, "reward_before_std": 0.7563247159123421, "reward_change_max": 0.0, "reward_change_mean": -0.7331679500639439, "reward_change_min": -1.2459847666323185, "reward_change_std": 0.49254793860018253, "reward_std": 0.8566270247101784, "rewards/cosine_scaled_reward": 0.10287128575146198, "rewards/format_reward": 0.7500000186264515, "step": 329 }, { "advantage_max": 1.5837075859308243, "advantage_mean": 1.4901161526914564e-08, "advantage_min": -0.5107849761843681, "advantage_std": 0.7936838679015636, "completion_length": 2223.2083740234375, "epoch": 0.37714285714285717, "grad_norm": 0.39597129821777344, "kl": 0.3514556884765625, "lambda_div_used": 0.5, "learning_rate": 3.8142703296283953e-07, "loss": 0.0206, "reward": -0.12900587869808078, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.12900587869808078, "reward_after_std": 0.7936838828027248, "reward_before_mean": 0.3146856687963009, "reward_before_std": 0.6586159784346819, "reward_change_max": 0.0, "reward_change_mean": -0.44369154796004295, "reward_change_min": -0.7083565294742584, "reward_change_std": 0.2738402709364891, "reward_std": 0.7936839014291763, "rewards/cosine_scaled_reward": -0.24890717677772045, "rewards/format_reward": 0.8125000074505806, "step": 330 }, { "advantage_max": 1.1777547150850296, "advantage_mean": 1.179675318541129e-08, "advantage_min": -0.5273271761834621, "advantage_std": 0.6165788248181343, "completion_length": 2718.604217529297, "epoch": 0.3782857142857143, "grad_norm": 0.4538213908672333, "kl": 0.3648681640625, "lambda_div_used": 0.5, "learning_rate": 3.785183306423767e-07, "loss": 0.0276, "reward": -0.22839653585106134, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.22839653585106134, "reward_after_std": 0.6165788136422634, "reward_before_mean": 0.20412603858858347, "reward_before_std": 0.5550496280193329, "reward_change_max": 0.0007744207978248596, "reward_change_mean": -0.43252256885170937, "reward_change_min": -0.7282052636146545, "reward_change_std": 0.3019270282238722, "reward_std": 0.6165788173675537, "rewards/cosine_scaled_reward": -0.1896036472171545, "rewards/format_reward": 0.5833333432674408, "step": 331 }, { "advantage_max": 1.5473309606313705, "advantage_mean": 2.4835262735223296e-09, "advantage_min": -0.61005724593997, "advantage_std": 0.7968885004520416, "completion_length": 2720.729232788086, "epoch": 0.37942857142857145, "grad_norm": 0.47542330622673035, "kl": 0.347320556640625, "lambda_div_used": 0.5, "learning_rate": 3.7561798609655373e-07, "loss": 0.0237, "reward": 0.0141659677028656, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.0141659677028656, "reward_after_std": 0.796888493001461, "reward_before_mean": 0.5891684554517269, "reward_before_std": 0.6871049627661705, "reward_change_max": 0.003196209669113159, "reward_change_mean": -0.5750024765729904, "reward_change_min": -1.0247270502150059, "reward_change_std": 0.3719689790159464, "reward_std": 0.7968885004520416, "rewards/cosine_scaled_reward": -0.10124912392348051, "rewards/format_reward": 0.7916666772216558, "step": 332 }, { "advantage_max": 1.4445695504546165, "advantage_mean": -1.528921261817473e-08, "advantage_min": -0.6396168023347855, "advantage_std": 0.7692903093993664, "completion_length": 2287.0833892822266, "epoch": 0.38057142857142856, "grad_norm": 0.7400115728378296, "kl": 0.20025634765625, "lambda_div_used": 0.5, "learning_rate": 3.72726140684072e-07, "loss": 0.0036, "reward": 0.1459917591419071, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.1459917591419071, "reward_after_std": 0.7692903243005276, "reward_before_mean": 0.8459194973111153, "reward_before_std": 0.6958840638399124, "reward_change_max": 0.0, "reward_change_mean": -0.6999277397990227, "reward_change_min": -1.225555181503296, "reward_change_std": 0.4582208953797817, "reward_std": 0.7692903392016888, "rewards/cosine_scaled_reward": -0.06662360485643148, "rewards/format_reward": 0.9791666716337204, "step": 333 }, { "advantage_max": 1.1501679047942162, "advantage_mean": 2.483526884144993e-09, "advantage_min": -0.5715098641812801, "advantage_std": 0.6181285083293915, "completion_length": 3047.8334045410156, "epoch": 0.38171428571428573, "grad_norm": 0.9689967036247253, "kl": 0.4241943359375, "lambda_div_used": 0.5, "learning_rate": 3.6984293534939737e-07, "loss": 0.0217, "reward": -0.1921203788369894, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.1921203788369894, "reward_after_std": 0.6181284599006176, "reward_before_mean": 0.27629107190296054, "reward_before_std": 0.5983157828450203, "reward_change_max": 0.0005147382616996765, "reward_change_mean": -0.4684114558622241, "reward_change_min": -0.8724917247891426, "reward_change_std": 0.3435290567576885, "reward_std": 0.6181284710764885, "rewards/cosine_scaled_reward": -0.20560447499155998, "rewards/format_reward": 0.6875000223517418, "step": 334 }, { "advantage_max": 1.7195413634181023, "advantage_mean": -7.450580818968433e-09, "advantage_min": -0.7100030183792114, "advantage_std": 0.9040074683725834, "completion_length": 2427.020866394043, "epoch": 0.38285714285714284, "grad_norm": 2.7406511306762695, "kl": 0.7989501953125, "lambda_div_used": 0.5, "learning_rate": 3.6696851061588994e-07, "loss": 0.0033, "reward": 0.16293947119265795, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.16293947119265795, "reward_after_std": 0.9040074795484543, "reward_before_mean": 0.8333401568233967, "reward_before_std": 0.8052664678543806, "reward_change_max": 7.398426532745361e-05, "reward_change_mean": -0.6704006977379322, "reward_change_min": -1.190424356609583, "reward_change_std": 0.46814507246017456, "reward_std": 0.9040075056254864, "rewards/cosine_scaled_reward": 0.031253403052687645, "rewards/format_reward": 0.7708333432674408, "step": 335 }, { "advantage_max": 1.7609595283865929, "advantage_mean": -3.7252904094842165e-09, "advantage_min": -0.7816978171467781, "advantage_std": 0.9407963380217552, "completion_length": 2901.854202270508, "epoch": 0.384, "grad_norm": 0.7952771186828613, "kl": 0.3408203125, "lambda_div_used": 0.5, "learning_rate": 3.641030065789562e-07, "loss": 0.0545, "reward": 0.08905918773962185, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.08905918773962185, "reward_after_std": 0.9407963454723358, "reward_before_mean": 0.6842526560649276, "reward_before_std": 0.9304242916405201, "reward_change_max": 0.0, "reward_change_mean": -0.5951934605836868, "reward_change_min": -1.1929472386837006, "reward_change_std": 0.4584904685616493, "reward_std": 0.9407963864505291, "rewards/cosine_scaled_reward": -0.0016236957162618637, "rewards/format_reward": 0.6875000298023224, "step": 336 }, { "advantage_max": 1.602539524435997, "advantage_mean": -8.692344399818808e-09, "advantage_min": -0.6270337104797363, "advantage_std": 0.8279041424393654, "completion_length": 2545.7084045410156, "epoch": 0.3851428571428571, "grad_norm": 1.1055015325546265, "kl": 0.306396484375, "lambda_div_used": 0.5, "learning_rate": 3.612465628992203e-07, "loss": 0.0811, "reward": -0.06436791177839041, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.06436791177839041, "reward_after_std": 0.8279041349887848, "reward_before_mean": 0.43131764233112335, "reward_before_std": 0.756643932312727, "reward_change_max": 0.0, "reward_change_mean": -0.4956855494529009, "reward_change_min": -0.8966120667755604, "reward_change_std": 0.3453631680458784, "reward_std": 0.8279041722416878, "rewards/cosine_scaled_reward": -0.16975786164402962, "rewards/format_reward": 0.7708333488553762, "step": 337 }, { "advantage_max": 1.4630458503961563, "advantage_mean": 6.829699084054397e-09, "advantage_min": -0.7494680806994438, "advantage_std": 0.7806403860449791, "completion_length": 2262.729232788086, "epoch": 0.3862857142857143, "grad_norm": 0.7859777808189392, "kl": 0.29241943359375, "lambda_div_used": 0.5, "learning_rate": 3.5839931879571725e-07, "loss": 0.0578, "reward": 0.06579644477460533, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.06579644477460533, "reward_after_std": 0.7806403860449791, "reward_before_mean": 0.6918848976492882, "reward_before_std": 0.73706915974617, "reward_change_max": 0.0, "reward_change_mean": -0.6260884515941143, "reward_change_min": -1.0630837492644787, "reward_change_std": 0.42817062325775623, "reward_std": 0.7806403934955597, "rewards/cosine_scaled_reward": -0.049890896305441856, "rewards/format_reward": 0.7916666772216558, "step": 338 }, { "advantage_max": 1.36690903455019, "advantage_mean": 6.829698806498641e-09, "advantage_min": -0.5567984506487846, "advantage_std": 0.7089659981429577, "completion_length": 3122.5000915527344, "epoch": 0.38742857142857146, "grad_norm": 0.6798500418663025, "kl": 0.3626708984375, "lambda_div_used": 0.5, "learning_rate": 3.555614130391079e-07, "loss": 0.0192, "reward": -0.16638006269931793, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.16638006269931793, "reward_after_std": 0.7089660130441189, "reward_before_mean": 0.2861324343830347, "reward_before_std": 0.6491104103624821, "reward_change_max": 0.00011374801397323608, "reward_change_mean": -0.45251248590648174, "reward_change_min": -0.8402752205729485, "reward_change_std": 0.31217301823198795, "reward_std": 0.7089660204946995, "rewards/cosine_scaled_reward": -0.1277671225834638, "rewards/format_reward": 0.5416666772216558, "step": 339 }, { "advantage_max": 1.7433002442121506, "advantage_mean": -4.967053990334591e-09, "advantage_min": -0.7869556918740273, "advantage_std": 0.9177567921578884, "completion_length": 2737.6458892822266, "epoch": 0.38857142857142857, "grad_norm": 0.716688871383667, "kl": 0.2950439453125, "lambda_div_used": 0.5, "learning_rate": 3.5273298394491515e-07, "loss": 0.0255, "reward": 0.09202966094017029, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.09202966094017029, "reward_after_std": 0.9177567958831787, "reward_before_mean": 0.6951882378198206, "reward_before_std": 0.8705759271979332, "reward_change_max": 0.0, "reward_change_mean": -0.6031586118042469, "reward_change_min": -1.0886082351207733, "reward_change_std": 0.4399815835058689, "reward_std": 0.917756836861372, "rewards/cosine_scaled_reward": -0.048239219933748245, "rewards/format_reward": 0.7916666865348816, "step": 340 }, { "advantage_max": 1.7033798545598984, "advantage_mean": -2.5456151409031236e-08, "advantage_min": -0.7195212692022324, "advantage_std": 0.8980820775032043, "completion_length": 2603.354217529297, "epoch": 0.38971428571428574, "grad_norm": 0.9158607721328735, "kl": 0.24871826171875, "lambda_div_used": 0.5, "learning_rate": 3.4991416936678276e-07, "loss": 0.0565, "reward": 0.345290195196867, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.345290195196867, "reward_after_std": 0.8980820700526237, "reward_before_mean": 1.1627929043024778, "reward_before_std": 0.7433993555605412, "reward_change_max": 0.0, "reward_change_mean": -0.8175027351826429, "reward_change_min": -1.348432257771492, "reward_change_std": 0.5476555228233337, "reward_std": 0.8980820924043655, "rewards/cosine_scaled_reward": 0.23764644749462605, "rewards/format_reward": 0.6875000055879354, "step": 341 }, { "advantage_max": 1.687784269452095, "advantage_mean": -1.4901161637936866e-08, "advantage_min": -0.7531973719596863, "advantage_std": 0.8736945390701294, "completion_length": 2862.9375915527344, "epoch": 0.39085714285714285, "grad_norm": 0.9657043218612671, "kl": 0.439453125, "lambda_div_used": 0.5, "learning_rate": 3.471051066897562e-07, "loss": 0.0755, "reward": 0.04618118858593334, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.04618118858593334, "reward_after_std": 0.8736945353448391, "reward_before_mean": 0.6218621619045734, "reward_before_std": 0.7942062020301819, "reward_change_max": 0.0, "reward_change_mean": -0.5756809897720814, "reward_change_min": -1.013890691101551, "reward_change_std": 0.397940494120121, "reward_std": 0.8736945502460003, "rewards/cosine_scaled_reward": -0.07448559207841754, "rewards/format_reward": 0.770833358168602, "step": 342 }, { "advantage_max": 1.596488393843174, "advantage_mean": 1.1796752963366686e-08, "advantage_min": -0.7924398183822632, "advantage_std": 0.8561305105686188, "completion_length": 2894.2083740234375, "epoch": 0.392, "grad_norm": 0.913608968257904, "kl": 0.294189453125, "lambda_div_used": 0.5, "learning_rate": 3.4430593282358777e-07, "loss": 0.0409, "reward": 0.14228842593729496, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.14228842593729496, "reward_after_std": 0.8561305180191994, "reward_before_mean": 0.8087989874184132, "reward_before_std": 0.8190572299063206, "reward_change_max": 0.0003446340560913086, "reward_change_mean": -0.6665105260908604, "reward_change_min": -1.2105020619928837, "reward_change_std": 0.48955480568110943, "reward_std": 0.8561305701732635, "rewards/cosine_scaled_reward": 0.07106614392250776, "rewards/format_reward": 0.666666679084301, "step": 343 }, { "advantage_max": 1.4609468877315521, "advantage_mean": -2.297262396977473e-08, "advantage_min": -0.6554704532027245, "advantage_std": 0.7672437131404877, "completion_length": 2265.104232788086, "epoch": 0.3931428571428571, "grad_norm": 0.38663792610168457, "kl": 0.217529296875, "lambda_div_used": 0.5, "learning_rate": 3.4151678419606233e-07, "loss": 0.0095, "reward": 0.32306694984436035, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.32306694984436035, "reward_after_std": 0.7672437205910683, "reward_before_mean": 1.1657235862221569, "reward_before_std": 0.5834860354661942, "reward_change_max": 0.0020352303981781006, "reward_change_mean": -0.8426566086709499, "reward_change_min": -1.2558316215872765, "reward_change_std": 0.5019741114228964, "reward_std": 0.7672437354922295, "rewards/cosine_scaled_reward": 0.1661950871348381, "rewards/format_reward": 0.8333333395421505, "step": 344 }, { "advantage_max": 1.7725291848182678, "advantage_mean": -1.4280279792000528e-08, "advantage_min": -0.8514701277017593, "advantage_std": 0.9321935474872589, "completion_length": 2641.916778564453, "epoch": 0.3942857142857143, "grad_norm": 0.6002000570297241, "kl": 0.28643798828125, "lambda_div_used": 0.5, "learning_rate": 3.387377967463493e-07, "loss": 0.0045, "reward": 0.20617245603352785, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.20617245603352785, "reward_after_std": 0.9321935623884201, "reward_before_mean": 0.9006627351045609, "reward_before_std": 0.8598495684564114, "reward_change_max": 0.0, "reward_change_mean": -0.6944902688264847, "reward_change_min": -1.2238090112805367, "reward_change_std": 0.4703991822898388, "reward_std": 0.9321935623884201, "rewards/cosine_scaled_reward": 0.07533134613186121, "rewards/format_reward": 0.7500000223517418, "step": 345 }, { "advantage_max": 1.2814199030399323, "advantage_mean": 3.7252904094842165e-09, "advantage_min": -0.6667215526103973, "advantage_std": 0.6843202896416187, "completion_length": 2701.104232788086, "epoch": 0.3954285714285714, "grad_norm": 0.2635735273361206, "kl": 0.3524169921875, "lambda_div_used": 0.5, "learning_rate": 3.359691059183761e-07, "loss": 0.0427, "reward": -0.021729059517383575, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.021729059517383575, "reward_after_std": 0.6843202896416187, "reward_before_mean": 0.5648492462933064, "reward_before_std": 0.6315851099789143, "reward_change_max": 0.0006144046783447266, "reward_change_mean": -0.5865782834589481, "reward_change_min": -1.0161477029323578, "reward_change_std": 0.39074820280075073, "reward_std": 0.6843203119933605, "rewards/cosine_scaled_reward": -0.10299206525087357, "rewards/format_reward": 0.7708333507180214, "step": 346 }, { "advantage_max": 1.352771744132042, "advantage_mean": -3.104408563547878e-09, "advantage_min": -0.5322981774806976, "advantage_std": 0.6943789683282375, "completion_length": 2622.1875610351562, "epoch": 0.3965714285714286, "grad_norm": 1.1750766038894653, "kl": 0.292724609375, "lambda_div_used": 0.5, "learning_rate": 3.3321084665422803e-07, "loss": -0.0206, "reward": 0.014769105706363916, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.014769105706363916, "reward_after_std": 0.6943789832293987, "reward_before_mean": 0.6197219397872686, "reward_before_std": 0.5595865100622177, "reward_change_max": 0.0, "reward_change_mean": -0.6049528494477272, "reward_change_min": -1.0069943517446518, "reward_change_std": 0.3644682914018631, "reward_std": 0.6943789906799793, "rewards/cosine_scaled_reward": -0.12763904221355915, "rewards/format_reward": 0.8750000149011612, "step": 347 }, { "advantage_max": 1.5380387529730797, "advantage_mean": -1.490116224855953e-08, "advantage_min": -0.6307278983294964, "advantage_std": 0.7947516813874245, "completion_length": 2490.7500610351562, "epoch": 0.3977142857142857, "grad_norm": 0.3507033586502075, "kl": 0.315765380859375, "lambda_div_used": 0.5, "learning_rate": 3.3046315338757026e-07, "loss": 0.0396, "reward": 0.14277693210169673, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.14277693210169673, "reward_after_std": 0.7947516664862633, "reward_before_mean": 0.8233021963387728, "reward_before_std": 0.6358652543276548, "reward_change_max": 0.0, "reward_change_mean": -0.6805252507328987, "reward_change_min": -1.0959210619330406, "reward_change_std": 0.4228264205157757, "reward_std": 0.7947516813874245, "rewards/cosine_scaled_reward": -0.015432262793183327, "rewards/format_reward": 0.8541666716337204, "step": 348 }, { "advantage_max": 1.4082676097750664, "advantage_mean": 1.490116141589226e-08, "advantage_min": -0.6084829457104206, "advantage_std": 0.7429373823106289, "completion_length": 2923.291717529297, "epoch": 0.39885714285714285, "grad_norm": 0.7672387361526489, "kl": 0.4090576171875, "lambda_div_used": 0.5, "learning_rate": 3.2772616003709616e-07, "loss": 0.0578, "reward": -0.14835473091807216, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.14835473091807216, "reward_after_std": 0.7429374102503061, "reward_before_mean": 0.3137938645668328, "reward_before_std": 0.7114957068115473, "reward_change_max": 0.00020164251327514648, "reward_change_mean": -0.46214855602011085, "reward_change_min": -0.9401774033904076, "reward_change_std": 0.35716398153454065, "reward_std": 0.7429374195635319, "rewards/cosine_scaled_reward": -0.14518641866743565, "rewards/format_reward": 0.6041666828095913, "step": 349 }, { "advantage_max": 1.752659372985363, "advantage_mean": 1.4280279847511679e-08, "advantage_min": -0.6503000631928444, "advantage_std": 0.8873385712504387, "completion_length": 2393.0000915527344, "epoch": 0.4, "grad_norm": 0.501510500907898, "kl": 0.3831787109375, "lambda_div_used": 0.5, "learning_rate": 3.250000000000001e-07, "loss": 0.028, "reward": 0.06549177691340446, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.06549177691340446, "reward_after_std": 0.8873385488986969, "reward_before_mean": 0.6462145633995533, "reward_before_std": 0.724578857421875, "reward_change_max": 0.0005541294813156128, "reward_change_mean": -0.5807227715849876, "reward_change_min": -0.9543578177690506, "reward_change_std": 0.3548112027347088, "reward_std": 0.8873385824263096, "rewards/cosine_scaled_reward": -0.08314273924042936, "rewards/format_reward": 0.8125000111758709, "step": 350 }, { "advantage_max": 1.7768183425068855, "advantage_mean": -1.1796752963366686e-08, "advantage_min": -0.7786560505628586, "advantage_std": 0.944592297077179, "completion_length": 2501.250030517578, "epoch": 0.40114285714285713, "grad_norm": 0.3541778028011322, "kl": 0.29620361328125, "lambda_div_used": 0.5, "learning_rate": 3.222848061454764e-07, "loss": 0.0245, "reward": 0.1836626399308443, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.1836626399308443, "reward_after_std": 0.944592297077179, "reward_before_mean": 0.8567006252706051, "reward_before_std": 0.8989718146622181, "reward_change_max": 0.0011807605624198914, "reward_change_mean": -0.6730380021035671, "reward_change_min": -1.1930915638804436, "reward_change_std": 0.481119092553854, "reward_std": 0.944592297077179, "rewards/cosine_scaled_reward": 0.011683644726872444, "rewards/format_reward": 0.8333333414047956, "step": 351 }, { "advantage_max": 1.2776615843176842, "advantage_mean": -9.313225579621331e-09, "advantage_min": -0.6007811687886715, "advantage_std": 0.678571205586195, "completion_length": 2365.375045776367, "epoch": 0.4022857142857143, "grad_norm": 0.9657369256019592, "kl": 0.3677520751953125, "lambda_div_used": 0.5, "learning_rate": 3.195807108082429e-07, "loss": 0.0034, "reward": -0.07690786942839622, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.07690786942839622, "reward_after_std": 0.6785712130367756, "reward_before_mean": 0.4603999052196741, "reward_before_std": 0.6251944825053215, "reward_change_max": 0.00337374210357666, "reward_change_mean": -0.5373077914118767, "reward_change_min": -0.9584220610558987, "reward_change_std": 0.3818345069885254, "reward_std": 0.6785712391138077, "rewards/cosine_scaled_reward": -0.05105004645884037, "rewards/format_reward": 0.5625000093132257, "step": 352 }, { "advantage_max": 1.450112447142601, "advantage_mean": 1.8626452047421083e-09, "advantage_min": -0.5225486978888512, "advantage_std": 0.7335505895316601, "completion_length": 2045.3958854675293, "epoch": 0.4034285714285714, "grad_norm": 0.47462838888168335, "kl": 0.257476806640625, "lambda_div_used": 0.5, "learning_rate": 3.168878457820915e-07, "loss": 0.0158, "reward": 0.1838024971075356, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.1838024971075356, "reward_after_std": 0.7335505895316601, "reward_before_mean": 0.9056749492883682, "reward_before_std": 0.5095305219292641, "reward_change_max": 0.0010958164930343628, "reward_change_mean": -0.7218724116683006, "reward_change_min": -1.0607609003782272, "reward_change_std": 0.40031613036990166, "reward_std": 0.7335506342351437, "rewards/cosine_scaled_reward": 0.015337456949055195, "rewards/format_reward": 0.8750000074505806, "step": 353 }, { "advantage_max": 1.4784216433763504, "advantage_mean": 1.0554989549049765e-08, "advantage_min": -0.6035023629665375, "advantage_std": 0.7698302268981934, "completion_length": 2046.0833892822266, "epoch": 0.4045714285714286, "grad_norm": 0.22698451578617096, "kl": 0.18927001953125, "lambda_div_used": 0.5, "learning_rate": 3.142063423134644e-07, "loss": 0.0088, "reward": 0.19753902312368155, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.19753902312368155, "reward_after_std": 0.7698302119970322, "reward_before_mean": 0.9317657127976418, "reward_before_std": 0.6076288931071758, "reward_change_max": 0.0, "reward_change_mean": -0.734226655215025, "reward_change_min": -1.204187534749508, "reward_change_std": 0.4471469521522522, "reward_std": 0.7698302567005157, "rewards/cosine_scaled_reward": 0.03879951499402523, "rewards/format_reward": 0.8541666716337204, "step": 354 }, { "advantage_max": 2.0405396223068237, "advantage_mean": -6.208817904251873e-10, "advantage_min": -0.8608497157692909, "advantage_std": 1.0653215050697327, "completion_length": 2223.6458587646484, "epoch": 0.4057142857142857, "grad_norm": 0.48912322521209717, "kl": 0.2689208984375, "lambda_div_used": 0.5, "learning_rate": 3.115363310950578e-07, "loss": 0.0411, "reward": 0.24965599924325943, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.24965599924325943, "reward_after_std": 1.0653215199708939, "reward_before_mean": 0.9300570599734783, "reward_before_std": 0.9955869130790234, "reward_change_max": 0.0, "reward_change_mean": -0.6804010719060898, "reward_change_min": -1.240929253399372, "reward_change_std": 0.4811771549284458, "reward_std": 1.0653215497732162, "rewards/cosine_scaled_reward": 0.027528513222932816, "rewards/format_reward": 0.8750000111758709, "step": 355 }, { "advantage_max": 1.5031840428709984, "advantage_mean": -1.428027990302283e-08, "advantage_min": -0.7539886124432087, "advantage_std": 0.7986224293708801, "completion_length": 2421.729248046875, "epoch": 0.40685714285714286, "grad_norm": 0.6188843250274658, "kl": 0.25262451171875, "lambda_div_used": 0.5, "learning_rate": 3.0887794225945143e-07, "loss": 0.0175, "reward": 0.19870756931777578, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.19870756931777578, "reward_after_std": 0.7986224070191383, "reward_before_mean": 0.9287926075048745, "reward_before_std": 0.713905394077301, "reward_change_max": 0.0, "reward_change_mean": -0.7300850711762905, "reward_change_min": -1.2293099090456963, "reward_change_std": 0.4734327495098114, "reward_std": 0.7986224070191383, "rewards/cosine_scaled_reward": 0.04772963561117649, "rewards/format_reward": 0.8333333395421505, "step": 356 }, { "advantage_max": 1.3350956961512566, "advantage_mean": -1.3659398390153399e-08, "advantage_min": -0.633977860212326, "advantage_std": 0.6937114223837852, "completion_length": 2763.125045776367, "epoch": 0.408, "grad_norm": 0.7168110013008118, "kl": 0.2630615234375, "lambda_div_used": 0.5, "learning_rate": 3.062313053727671e-07, "loss": 0.0174, "reward": 0.08040890609845519, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.08040890609845519, "reward_after_std": 0.6937114372849464, "reward_before_mean": 0.7407789751887321, "reward_before_std": 0.5677717514336109, "reward_change_max": 0.0, "reward_change_mean": -0.6603700965642929, "reward_change_min": -0.9959189742803574, "reward_change_std": 0.3936575651168823, "reward_std": 0.6937114521861076, "rewards/cosine_scaled_reward": -0.08794385753571987, "rewards/format_reward": 0.9166666865348816, "step": 357 }, { "advantage_max": 1.592109739780426, "advantage_mean": 4.967053712778835e-09, "advantage_min": -0.716572854667902, "advantage_std": 0.8424624130129814, "completion_length": 1910.2292022705078, "epoch": 0.40914285714285714, "grad_norm": 0.42959484457969666, "kl": 0.201019287109375, "lambda_div_used": 0.5, "learning_rate": 3.0359654942835247e-07, "loss": 0.0346, "reward": 0.24396879551932216, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.24396879551932216, "reward_after_std": 0.8424624279141426, "reward_before_mean": 0.9911262951791286, "reward_before_std": 0.7452986799180508, "reward_change_max": 0.0, "reward_change_mean": -0.7471575364470482, "reward_change_min": -1.3087777346372604, "reward_change_std": 0.48442544788122177, "reward_std": 0.8424624502658844, "rewards/cosine_scaled_reward": 0.11014649923890829, "rewards/format_reward": 0.770833345130086, "step": 358 }, { "advantage_max": 1.335417702794075, "advantage_mean": 6.829698862009792e-09, "advantage_min": -0.6110520102083683, "advantage_std": 0.7061552852392197, "completion_length": 2318.479248046875, "epoch": 0.4102857142857143, "grad_norm": 1.0732530355453491, "kl": 0.234130859375, "lambda_div_used": 0.5, "learning_rate": 3.0097380284049523e-07, "loss": -0.0101, "reward": 0.0727979297953425, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.0727979297953425, "reward_after_std": 0.70615528896451, "reward_before_mean": 0.7263930886983871, "reward_before_std": 0.6077985875308514, "reward_change_max": 0.0003136545419692993, "reward_change_mean": -0.6535951718688011, "reward_change_min": -1.060469426214695, "reward_change_std": 0.41288536973297596, "reward_std": 0.70615528896451, "rewards/cosine_scaled_reward": -0.07430345751345158, "rewards/format_reward": 0.8750000149011612, "step": 359 }, { "advantage_max": 1.9765265434980392, "advantage_mean": -2.1109978876054925e-08, "advantage_min": -0.9272864870727062, "advantage_std": 1.044951420277357, "completion_length": 2600.2291870117188, "epoch": 0.4114285714285714, "grad_norm": 0.5990117192268372, "kl": 0.23577880859375, "lambda_div_used": 0.5, "learning_rate": 2.9836319343816397e-07, "loss": 0.024, "reward": 0.38619683496654034, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.38619683496654034, "reward_after_std": 1.044951420277357, "reward_before_mean": 1.1979414029046893, "reward_before_std": 0.9707056246697903, "reward_change_max": 0.0, "reward_change_mean": -0.8117445930838585, "reward_change_min": -1.418171539902687, "reward_change_std": 0.5478472858667374, "reward_std": 1.044951420277357, "rewards/cosine_scaled_reward": 0.1302206851541996, "rewards/format_reward": 0.9375000149011612, "step": 360 }, { "advantage_max": 1.6750199496746063, "advantage_mean": 6.208820124697922e-10, "advantage_min": -0.7383632361888885, "advantage_std": 0.8688630610704422, "completion_length": 2427.5834350585938, "epoch": 0.4125714285714286, "grad_norm": 0.7121224403381348, "kl": 0.23345947265625, "lambda_div_used": 0.5, "learning_rate": 2.9576484845877793e-07, "loss": 0.0544, "reward": 0.19519370747730136, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.19519370747730136, "reward_after_std": 0.8688630871474743, "reward_before_mean": 0.8941241502761841, "reward_before_std": 0.744682066142559, "reward_change_max": 0.0008524805307388306, "reward_change_mean": -0.698930449783802, "reward_change_min": -1.1726604774594307, "reward_change_std": 0.4409833699464798, "reward_std": 0.8688631132245064, "rewards/cosine_scaled_reward": 0.009562073741108179, "rewards/format_reward": 0.8750000149011612, "step": 361 }, { "advantage_max": 1.3919198587536812, "advantage_mean": -5.587935336670569e-09, "advantage_min": -0.5790497735142708, "advantage_std": 0.7130658477544785, "completion_length": 1741.7708892822266, "epoch": 0.4137142857142857, "grad_norm": 0.450083464384079, "kl": 0.19146728515625, "lambda_div_used": 0.5, "learning_rate": 2.931788945420058e-07, "loss": 0.0206, "reward": 0.21085366362240165, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.21085366362240165, "reward_after_std": 0.7130658328533173, "reward_before_mean": 0.9695420237258077, "reward_before_std": 0.5190842002630234, "reward_change_max": 0.0, "reward_change_mean": -0.7586883679032326, "reward_change_min": -1.1238619238138199, "reward_change_std": 0.4279701504856348, "reward_std": 0.7130658328533173, "rewards/cosine_scaled_reward": 0.0472710095345974, "rewards/format_reward": 0.8750000111758709, "step": 362 }, { "advantage_max": 1.424420714378357, "advantage_mean": -1.1102230246251565e-16, "advantage_min": -0.5779132470488548, "advantage_std": 0.7376838810741901, "completion_length": 1992.645881652832, "epoch": 0.41485714285714287, "grad_norm": 0.5427188873291016, "kl": 0.257598876953125, "lambda_div_used": 0.5, "learning_rate": 2.9060545772359305e-07, "loss": -0.0058, "reward": 0.0010278723202645779, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.0010278723202645779, "reward_after_std": 0.7376838736236095, "reward_before_mean": 0.5802722265943885, "reward_before_std": 0.6190046742558479, "reward_change_max": 0.00023803859949111938, "reward_change_mean": -0.5792443305253983, "reward_change_min": -0.9397610351443291, "reward_change_std": 0.3671391997486353, "reward_std": 0.737683892250061, "rewards/cosine_scaled_reward": -0.04319723695516586, "rewards/format_reward": 0.6666666846722364, "step": 363 }, { "advantage_max": 1.311921313405037, "advantage_mean": -8.071462553882469e-09, "advantage_min": -0.5705104358494282, "advantage_std": 0.6872405484318733, "completion_length": 2710.31258392334, "epoch": 0.416, "grad_norm": 1.0595557689666748, "kl": 0.29632568359375, "lambda_div_used": 0.5, "learning_rate": 2.8804466342921987e-07, "loss": -0.0062, "reward": -0.25245581939816475, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.25245581939816475, "reward_after_std": 0.6872405298054218, "reward_before_mean": 0.1285827998071909, "reward_before_std": 0.658259104937315, "reward_change_max": 0.0, "reward_change_mean": -0.3810386322438717, "reward_change_min": -0.6986820474267006, "reward_change_std": 0.2910507880151272, "reward_std": 0.6872405484318733, "rewards/cosine_scaled_reward": -0.23779193311929703, "rewards/format_reward": 0.6041666772216558, "step": 364 }, { "advantage_max": 1.4988937079906464, "advantage_mean": -7.450581263057643e-09, "advantage_min": -0.6771756447851658, "advantage_std": 0.7947739884257317, "completion_length": 2861.166732788086, "epoch": 0.41714285714285715, "grad_norm": 0.3315470218658447, "kl": 0.22308349609375, "lambda_div_used": 0.5, "learning_rate": 2.854966364683872e-07, "loss": 0.0188, "reward": 0.1647136379033327, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.1647136379033327, "reward_after_std": 0.7947739958763123, "reward_before_mean": 0.8680481066694483, "reward_before_std": 0.7035925425589085, "reward_change_max": 0.00020241737365722656, "reward_change_mean": -0.7033344469964504, "reward_change_min": -1.2183981984853745, "reward_change_std": 0.4677523523569107, "reward_std": 0.7947740480303764, "rewards/cosine_scaled_reward": 0.0069406908005476, "rewards/format_reward": 0.8541666716337204, "step": 365 }, { "advantage_max": 1.6599657125771046, "advantage_mean": -2.4524827946237338e-08, "advantage_min": -0.7904684916138649, "advantage_std": 0.89173923432827, "completion_length": 2031.7500457763672, "epoch": 0.41828571428571426, "grad_norm": 0.7609602212905884, "kl": 0.1444244384765625, "lambda_div_used": 0.5, "learning_rate": 2.829615010283344e-07, "loss": -0.0237, "reward": 0.24849661067128181, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.24849661067128181, "reward_after_std": 0.8917392194271088, "reward_before_mean": 0.9921067655086517, "reward_before_std": 0.8401160351932049, "reward_change_max": 0.0, "reward_change_mean": -0.7436101827770472, "reward_change_min": -1.3646418452262878, "reward_change_std": 0.5249740164726973, "reward_std": 0.8917392492294312, "rewards/cosine_scaled_reward": 0.15230336226522923, "rewards/format_reward": 0.6875000149011612, "step": 366 }, { "advantage_max": 1.6305503770709038, "advantage_mean": 2.4835269396561444e-09, "advantage_min": -0.7210628725588322, "advantage_std": 0.8612675108015537, "completion_length": 2874.041732788086, "epoch": 0.41942857142857143, "grad_norm": 1.068894624710083, "kl": 0.2226409912109375, "lambda_div_used": 0.5, "learning_rate": 2.8043938066798645e-07, "loss": 0.063, "reward": -0.04002854856662452, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.04002854856662452, "reward_after_std": 0.8612675480544567, "reward_before_mean": 0.4689618442207575, "reward_before_std": 0.8422180972993374, "reward_change_max": 0.0019412413239479065, "reward_change_mean": -0.5089903902262449, "reward_change_min": -0.9655529074370861, "reward_change_std": 0.3873988389968872, "reward_std": 0.8612675666809082, "rewards/cosine_scaled_reward": -0.06760243279859424, "rewards/format_reward": 0.6041666809469461, "step": 367 }, { "advantage_max": 1.5631348192691803, "advantage_mean": -2.483527106189598e-09, "advantage_min": -0.6437996104359627, "advantage_std": 0.8016529567539692, "completion_length": 2851.6459045410156, "epoch": 0.4205714285714286, "grad_norm": 0.7713169455528259, "kl": 0.19476318359375, "lambda_div_used": 0.5, "learning_rate": 2.7793039831193133e-07, "loss": 0.0495, "reward": -0.07841504114912823, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.07841504114912823, "reward_after_std": 0.8016529642045498, "reward_before_mean": 0.41246860893443227, "reward_before_std": 0.7121658660471439, "reward_change_max": 0.0004137009382247925, "reward_change_mean": -0.4908836465328932, "reward_change_min": -0.7905861400067806, "reward_change_std": 0.31734895519912243, "reward_std": 0.801652979105711, "rewards/cosine_scaled_reward": -0.09584904834628105, "rewards/format_reward": 0.6041666753590107, "step": 368 }, { "advantage_max": 1.8276142477989197, "advantage_mean": -2.2351742789972207e-08, "advantage_min": -0.8692605495452881, "advantage_std": 0.9705227017402649, "completion_length": 2686.7709350585938, "epoch": 0.4217142857142857, "grad_norm": 0.6053704619407654, "kl": 0.17791748046875, "lambda_div_used": 0.5, "learning_rate": 2.7543467624442956e-07, "loss": 0.0124, "reward": 0.264951853081584, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.264951853081584, "reward_after_std": 0.9705226868391037, "reward_before_mean": 0.9941436583176255, "reward_before_std": 0.9082366935908794, "reward_change_max": 0.0004303380846977234, "reward_change_mean": -0.7291918061673641, "reward_change_min": -1.2517874836921692, "reward_change_std": 0.4964812193065882, "reward_std": 0.9705227166414261, "rewards/cosine_scaled_reward": 0.09082182496786118, "rewards/format_reward": 0.8125000111758709, "step": 369 }, { "advantage_max": 1.2040704488754272, "advantage_mean": 6.8296991950766994e-09, "advantage_min": -0.5248738452792168, "advantage_std": 0.6303714476525784, "completion_length": 2822.75008392334, "epoch": 0.4228571428571429, "grad_norm": 0.30178558826446533, "kl": 0.2030181884765625, "lambda_div_used": 0.5, "learning_rate": 2.729523361034538e-07, "loss": 0.0336, "reward": -0.024024151323828846, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.024024151323828846, "reward_after_std": 0.630371455103159, "reward_before_mean": 0.573287246748805, "reward_before_std": 0.5030009597539902, "reward_change_max": 0.000520557165145874, "reward_change_mean": -0.5973114091902971, "reward_change_min": -0.9545686095952988, "reward_change_std": 0.382950097322464, "reward_std": 0.6303714849054813, "rewards/cosine_scaled_reward": -0.04668972175568342, "rewards/format_reward": 0.6666666809469461, "step": 370 }, { "advantage_max": 1.5700139477849007, "advantage_mean": -1.117587167254186e-08, "advantage_min": -0.7256846129894257, "advantage_std": 0.8189797066152096, "completion_length": 1951.1875228881836, "epoch": 0.424, "grad_norm": 0.39446350932121277, "kl": 0.1736907958984375, "lambda_div_used": 0.5, "learning_rate": 2.7048349887476037e-07, "loss": 0.0311, "reward": 0.2522589797154069, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.2522589797154069, "reward_after_std": 0.8189797289669514, "reward_before_mean": 1.0146636981517076, "reward_before_std": 0.6875501796603203, "reward_change_max": 0.003395289182662964, "reward_change_mean": -0.7624046634882689, "reward_change_min": -1.200857788324356, "reward_change_std": 0.47052861377596855, "reward_std": 0.8189797662198544, "rewards/cosine_scaled_reward": 0.1114985030144453, "rewards/format_reward": 0.7916666846722364, "step": 371 }, { "advantage_max": 1.7297367379069328, "advantage_mean": -7.450580818968433e-09, "advantage_min": -0.7292061150074005, "advantage_std": 0.9062994085252285, "completion_length": 3032.854217529297, "epoch": 0.42514285714285716, "grad_norm": 0.546402633190155, "kl": 0.2222137451171875, "lambda_div_used": 0.5, "learning_rate": 2.6802828488599294e-07, "loss": 0.0465, "reward": 0.04156911559402943, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.04156911559402943, "reward_after_std": 0.9062994159758091, "reward_before_mean": 0.6037184139713645, "reward_before_std": 0.8658935464918613, "reward_change_max": 3.7536025047302246e-05, "reward_change_mean": -0.5621493104845285, "reward_change_min": -1.030746005475521, "reward_change_std": 0.40250076726078987, "reward_std": 0.9062994495034218, "rewards/cosine_scaled_reward": -0.02105746790766716, "rewards/format_reward": 0.6458333414047956, "step": 372 }, { "advantage_max": 1.5753349885344505, "advantage_mean": 8.071462664904772e-09, "advantage_min": -0.707958310842514, "advantage_std": 0.8195471204817295, "completion_length": 1873.8750381469727, "epoch": 0.42628571428571427, "grad_norm": 0.45761746168136597, "kl": 0.1535797119140625, "lambda_div_used": 0.5, "learning_rate": 2.655868138008171e-07, "loss": 0.0228, "reward": 0.09441595152020454, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.09441595152020454, "reward_after_std": 0.8195471279323101, "reward_before_mean": 0.7264794651418924, "reward_before_std": 0.7281909249722958, "reward_change_max": 0.0031985342502593994, "reward_change_mean": -0.6320635080337524, "reward_change_min": -1.0189965330064297, "reward_change_std": 0.4044807106256485, "reward_std": 0.8195471614599228, "rewards/cosine_scaled_reward": -0.04301029210910201, "rewards/format_reward": 0.8125000149011612, "step": 373 }, { "advantage_max": 1.6910846680402756, "advantage_mean": -1.1175871006408045e-08, "advantage_min": -0.8846486583352089, "advantage_std": 0.9025316834449768, "completion_length": 2393.2500610351562, "epoch": 0.42742857142857144, "grad_norm": 0.5066131353378296, "kl": 0.156524658203125, "lambda_div_used": 0.5, "learning_rate": 2.631592046130896e-07, "loss": 0.0508, "reward": 0.2205169820226729, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.2205169820226729, "reward_after_std": 0.9025316908955574, "reward_before_mean": 0.9377603754401207, "reward_before_std": 0.8492592498660088, "reward_change_max": 0.0010761022567749023, "reward_change_mean": -0.7172434497624636, "reward_change_min": -1.2053956873714924, "reward_change_std": 0.497776135802269, "reward_std": 0.9025317057967186, "rewards/cosine_scaled_reward": 0.06263019423931837, "rewards/format_reward": 0.8125000223517418, "step": 374 }, { "advantage_max": 1.5614767000079155, "advantage_mean": -1.2417633588057697e-09, "advantage_min": -0.7359337955713272, "advantage_std": 0.8408215641975403, "completion_length": 2382.5000762939453, "epoch": 0.42857142857142855, "grad_norm": 0.6809465289115906, "kl": 0.1686859130859375, "lambda_div_used": 0.5, "learning_rate": 2.6074557564105724e-07, "loss": 0.0181, "reward": 0.1346543780528009, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.1346543780528009, "reward_after_std": 0.8408215567469597, "reward_before_mean": 0.7986902371048927, "reward_before_std": 0.7959046922624111, "reward_change_max": 0.0, "reward_change_mean": -0.6640358716249466, "reward_change_min": -1.1981551013886929, "reward_change_std": 0.4808881878852844, "reward_std": 0.8408215865492821, "rewards/cosine_scaled_reward": 0.04517845343798399, "rewards/format_reward": 0.7083333414047956, "step": 375 }, { "advantage_max": 1.4033148437738419, "advantage_mean": 3.725290298461914e-09, "advantage_min": -0.508766308426857, "advantage_std": 0.7132799662649632, "completion_length": 2357.812530517578, "epoch": 0.4297142857142857, "grad_norm": 0.3875832259654999, "kl": 0.2390899658203125, "lambda_div_used": 0.5, "learning_rate": 2.583460445215911e-07, "loss": 0.0401, "reward": -0.024911897722631693, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.024911897722631693, "reward_after_std": 0.7132799625396729, "reward_before_mean": 0.5406988188624382, "reward_before_std": 0.57003128901124, "reward_change_max": 0.008424557745456696, "reward_change_mean": -0.5656107012182474, "reward_change_min": -0.927064124494791, "reward_change_std": 0.35486311838030815, "reward_std": 0.7132799699902534, "rewards/cosine_scaled_reward": -0.0942339263856411, "rewards/format_reward": 0.7291666753590107, "step": 376 }, { "advantage_max": 1.6421222761273384, "advantage_mean": -7.450580763457282e-09, "advantage_min": -0.8725541532039642, "advantage_std": 0.8950701281428337, "completion_length": 3138.166732788086, "epoch": 0.4308571428571429, "grad_norm": 1.4470289945602417, "kl": 0.2535400390625, "lambda_div_used": 0.5, "learning_rate": 2.5596072820445254e-07, "loss": 0.0475, "reward": 0.032397462986409664, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.032397462986409664, "reward_after_std": 0.8950701355934143, "reward_before_mean": 0.6025219317525625, "reward_before_std": 0.9364787600934505, "reward_change_max": 0.0008480995893478394, "reward_change_mean": -0.5701244231313467, "reward_change_min": -1.1732884608209133, "reward_change_std": 0.4806421175599098, "reward_std": 0.8950701430439949, "rewards/cosine_scaled_reward": -0.03207239834591746, "rewards/format_reward": 0.6666666902601719, "step": 377 }, { "advantage_max": 1.7775244414806366, "advantage_mean": -9.934107758624577e-09, "advantage_min": -0.7913789357990026, "advantage_std": 0.9315557107329369, "completion_length": 2286.7917251586914, "epoch": 0.432, "grad_norm": 0.6633191108703613, "kl": 0.189361572265625, "lambda_div_used": 0.5, "learning_rate": 2.5358974294659373e-07, "loss": 0.0556, "reward": 0.2488978197798133, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.2488978197798133, "reward_after_std": 0.9315557107329369, "reward_before_mean": 0.9738697209395468, "reward_before_std": 0.8460350800305605, "reward_change_max": 0.00010737031698226929, "reward_change_mean": -0.7249718643724918, "reward_change_min": -1.23944041877985, "reward_change_std": 0.47084952518343925, "reward_std": 0.9315557405352592, "rewards/cosine_scaled_reward": 0.09110149601474404, "rewards/format_reward": 0.7916666772216558, "step": 378 }, { "advantage_max": 1.577236846089363, "advantage_mean": 1.3659398057086491e-08, "advantage_min": -0.7336809299886227, "advantage_std": 0.8440136685967445, "completion_length": 2862.6875610351562, "epoch": 0.43314285714285716, "grad_norm": 0.9866610765457153, "kl": 0.466796875, "lambda_div_used": 0.5, "learning_rate": 2.512332043064913e-07, "loss": 0.0855, "reward": -0.02831041906028986, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.02831041906028986, "reward_after_std": 0.8440136685967445, "reward_before_mean": 0.49924127757549286, "reward_before_std": 0.8402791954576969, "reward_change_max": 0.0014529749751091003, "reward_change_mean": -0.5275516845285892, "reward_change_min": -0.9985620677471161, "reward_change_std": 0.41906842961907387, "reward_std": 0.8440137207508087, "rewards/cosine_scaled_reward": -0.11496270447969437, "rewards/format_reward": 0.729166679084301, "step": 379 }, { "advantage_max": 1.4116209298372269, "advantage_mean": -1.2417632477834672e-09, "advantage_min": -0.6222640015184879, "advantage_std": 0.7395900189876556, "completion_length": 2217.3958892822266, "epoch": 0.4342857142857143, "grad_norm": 0.822210431098938, "kl": 0.193084716796875, "lambda_div_used": 0.5, "learning_rate": 2.488912271385139e-07, "loss": 0.0237, "reward": 0.06631459016352892, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.06631459016352892, "reward_after_std": 0.7395900264382362, "reward_before_mean": 0.7042246758937836, "reward_before_std": 0.6468055509030819, "reward_change_max": 0.0, "reward_change_mean": -0.6379101015627384, "reward_change_min": -1.0936972200870514, "reward_change_std": 0.4125612024217844, "reward_std": 0.7395900562405586, "rewards/cosine_scaled_reward": -0.04372099880129099, "rewards/format_reward": 0.7916666753590107, "step": 380 }, { "advantage_max": 1.4688269421458244, "advantage_mean": -6.829698917520943e-09, "advantage_min": -0.548859566450119, "advantage_std": 0.7594586610794067, "completion_length": 2741.0417709350586, "epoch": 0.43542857142857144, "grad_norm": 0.5852844715118408, "kl": 0.35443115234375, "lambda_div_used": 0.5, "learning_rate": 2.465639255873246e-07, "loss": 0.0173, "reward": -0.06746639730408788, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.06746639730408788, "reward_after_std": 0.7594586685299873, "reward_before_mean": 0.44514533365145326, "reward_before_std": 0.6602870542556047, "reward_change_max": 0.00012650340795516968, "reward_change_mean": -0.5126117654144764, "reward_change_min": -0.8934906348586082, "reward_change_std": 0.34667243622243404, "reward_std": 0.7594587020576, "rewards/cosine_scaled_reward": -0.12117732595652342, "rewards/format_reward": 0.6875000055879354, "step": 381 }, { "advantage_max": 1.1954265832901, "advantage_mean": 1.2417634698280722e-09, "advantage_min": -0.4305326081812382, "advantage_std": 0.6082721762359142, "completion_length": 2293.479232788086, "epoch": 0.43657142857142855, "grad_norm": 0.45443591475486755, "kl": 0.295379638671875, "lambda_div_used": 0.5, "learning_rate": 2.4425141308231765e-07, "loss": 0.0369, "reward": -0.11909189762081951, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.11909189762081951, "reward_after_std": 0.6082722060382366, "reward_before_mean": 0.39205896970815957, "reward_before_std": 0.4652012325823307, "reward_change_max": 0.0, "reward_change_mean": -0.5111508592963219, "reward_change_min": -0.8269638493657112, "reward_change_std": 0.29517384245991707, "reward_std": 0.6082722283899784, "rewards/cosine_scaled_reward": -0.24147052597254515, "rewards/format_reward": 0.8750000074505806, "step": 382 }, { "advantage_max": 1.583274468779564, "advantage_mean": 2.7755575615628914e-16, "advantage_min": -0.7815160192549229, "advantage_std": 0.8540562726557255, "completion_length": 2804.7709350585938, "epoch": 0.4377142857142857, "grad_norm": 0.3750695586204529, "kl": 0.305908203125, "lambda_div_used": 0.5, "learning_rate": 2.4195380233209006e-07, "loss": 0.0362, "reward": 0.13424018677324057, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.13424018677324057, "reward_after_std": 0.8540562726557255, "reward_before_mean": 0.7994630765169859, "reward_before_std": 0.830976240336895, "reward_change_max": 0.0006273016333580017, "reward_change_mean": -0.6652229018509388, "reward_change_min": -1.246145885437727, "reward_change_std": 0.4925944656133652, "reward_std": 0.85405632853508, "rewards/cosine_scaled_reward": 0.024731531739234924, "rewards/format_reward": 0.7500000037252903, "step": 383 }, { "advantage_max": 1.7960882484912872, "advantage_mean": -6.829699084054397e-09, "advantage_min": -0.6961954347789288, "advantage_std": 0.9222677126526833, "completion_length": 1781.5208892822266, "epoch": 0.43885714285714283, "grad_norm": 0.8119909763336182, "kl": 0.182159423828125, "lambda_div_used": 0.5, "learning_rate": 2.3967120531894857e-07, "loss": -0.0102, "reward": 0.4542910009622574, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.4542910009622574, "reward_after_std": 0.9222677052021027, "reward_before_mean": 1.344484157860279, "reward_before_std": 0.7018172033131123, "reward_change_max": 0.0, "reward_change_mean": -0.8901931792497635, "reward_change_min": -1.4151117950677872, "reward_change_std": 0.5127660743892193, "reward_std": 0.9222677275538445, "rewards/cosine_scaled_reward": 0.2764087514951825, "rewards/format_reward": 0.7916666772216558, "step": 384 }, { "advantage_max": 1.6422509998083115, "advantage_mean": -1.4435500406140278e-08, "advantage_min": -0.7044455334544182, "advantage_std": 0.8583366200327873, "completion_length": 2314.541748046875, "epoch": 0.44, "grad_norm": 0.5646221041679382, "kl": 0.2369384765625, "lambda_div_used": 0.5, "learning_rate": 2.374037332934512e-07, "loss": -0.0053, "reward": 0.029954310972243547, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.029954310972243547, "reward_after_std": 0.8583366423845291, "reward_before_mean": 0.597434401512146, "reward_before_std": 0.7965768575668335, "reward_change_max": 0.005019478499889374, "reward_change_mean": -0.5674800910055637, "reward_change_min": -1.0640814229846, "reward_change_std": 0.4066210687160492, "reward_std": 0.8583366572856903, "rewards/cosine_scaled_reward": -0.10753281530924141, "rewards/format_reward": 0.8125000149011612, "step": 385 }, { "advantage_max": 1.875676967203617, "advantage_mean": -3.2285851103708296e-08, "advantage_min": -0.614168468862772, "advantage_std": 0.9387934468686581, "completion_length": 2594.166717529297, "epoch": 0.44114285714285717, "grad_norm": 1.6637388467788696, "kl": 0.306884765625, "lambda_div_used": 0.5, "learning_rate": 2.3515149676898552e-07, "loss": 0.0001, "reward": 0.4086938863620162, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.4086938863620162, "reward_after_std": 0.9387934617698193, "reward_before_mean": 1.2516701593995094, "reward_before_std": 0.633562033995986, "reward_change_max": 0.0, "reward_change_mean": -0.8429762609302998, "reward_change_min": -1.2401651069521904, "reward_change_std": 0.4739220403134823, "reward_std": 0.938793495297432, "rewards/cosine_scaled_reward": 0.1883350731804967, "rewards/format_reward": 0.8750000074505806, "step": 386 }, { "advantage_max": 1.226422742009163, "advantage_mean": -2.483526828633842e-09, "advantage_min": -0.5920934341847897, "advantage_std": 0.6487264335155487, "completion_length": 2671.6875610351562, "epoch": 0.4422857142857143, "grad_norm": 1.0913386344909668, "kl": 0.326934814453125, "lambda_div_used": 0.5, "learning_rate": 2.3291460551638237e-07, "loss": 0.0016, "reward": 0.07431225699838251, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.07431225699838251, "reward_after_std": 0.6487264335155487, "reward_before_mean": 0.7488601338118315, "reward_before_std": 0.5205418951809406, "reward_change_max": 0.0, "reward_change_mean": -0.6745478585362434, "reward_change_min": -1.0816974267363548, "reward_change_std": 0.4158578272908926, "reward_std": 0.6487264707684517, "rewards/cosine_scaled_reward": -0.042236629873514175, "rewards/format_reward": 0.8333333469927311, "step": 387 }, { "advantage_max": 1.7276557385921478, "advantage_mean": -1.1175871339474952e-08, "advantage_min": -0.5782233960926533, "advantage_std": 0.8705500811338425, "completion_length": 2406.3750762939453, "epoch": 0.44342857142857145, "grad_norm": 1.3232017755508423, "kl": 0.28704833984375, "lambda_div_used": 0.5, "learning_rate": 2.306931685585657e-07, "loss": -0.0104, "reward": 0.1922937948256731, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.1922937948256731, "reward_after_std": 0.8705500811338425, "reward_before_mean": 0.8794347532093525, "reward_before_std": 0.6561367399990559, "reward_change_max": 0.0, "reward_change_mean": -0.6871409565210342, "reward_change_min": -1.0342309921979904, "reward_change_std": 0.3947345446795225, "reward_std": 0.870550125837326, "rewards/cosine_scaled_reward": 0.043884020298719406, "rewards/format_reward": 0.7916666828095913, "step": 388 }, { "advantage_max": 1.5205080583691597, "advantage_mean": -6.208817349140361e-09, "advantage_min": -0.6496899351477623, "advantage_std": 0.788145937025547, "completion_length": 2370.562530517578, "epoch": 0.44457142857142856, "grad_norm": 0.3369554281234741, "kl": 0.2927093505859375, "lambda_div_used": 0.5, "learning_rate": 2.2848729416523859e-07, "loss": 0.0241, "reward": -0.006450021639466286, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.006450021639466286, "reward_after_std": 0.7881459295749664, "reward_before_mean": 0.547332945279777, "reward_before_std": 0.6886685937643051, "reward_change_max": 0.0006812885403633118, "reward_change_mean": -0.5537829957902431, "reward_change_min": -0.9398513734340668, "reward_change_std": 0.37046326510608196, "reward_std": 0.788145937025547, "rewards/cosine_scaled_reward": -0.11175020085647702, "rewards/format_reward": 0.7708333414047956, "step": 389 }, { "advantage_max": 1.827544629573822, "advantage_mean": 1.241763458725842e-08, "advantage_min": -0.7890815921127796, "advantage_std": 0.9409858584403992, "completion_length": 2891.9375915527344, "epoch": 0.44571428571428573, "grad_norm": 0.9549670815467834, "kl": 0.42510986328125, "lambda_div_used": 0.5, "learning_rate": 2.2629708984760706e-07, "loss": 0.0339, "reward": 0.024634618312120438, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.024634618312120438, "reward_after_std": 0.9409858658909798, "reward_before_mean": 0.5577679611742496, "reward_before_std": 0.8641040101647377, "reward_change_max": 0.0, "reward_change_mean": -0.5331333354115486, "reward_change_min": -0.9533309638500214, "reward_change_std": 0.37772860564291477, "reward_std": 0.9409858882427216, "rewards/cosine_scaled_reward": -0.09611603221856058, "rewards/format_reward": 0.7500000186264515, "step": 390 }, { "advantage_max": 1.8195180594921112, "advantage_mean": -2.0489097307674342e-08, "advantage_min": -0.8196974471211433, "advantage_std": 0.9608958922326565, "completion_length": 2568.979232788086, "epoch": 0.44685714285714284, "grad_norm": 0.8521519899368286, "kl": 0.34222412109375, "lambda_div_used": 0.5, "learning_rate": 2.2412266235313973e-07, "loss": 0.053, "reward": 0.20352406054735184, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.20352406054735184, "reward_after_std": 0.9608958698809147, "reward_before_mean": 0.8823814336210489, "reward_before_std": 0.9071417227387428, "reward_change_max": 0.0, "reward_change_mean": -0.6788573786616325, "reward_change_min": -1.235791377723217, "reward_change_std": 0.48160158656537533, "reward_std": 0.9608958885073662, "rewards/cosine_scaled_reward": 0.055774035543436185, "rewards/format_reward": 0.7708333432674408, "step": 391 }, { "advantage_max": 1.5429241210222244, "advantage_mean": 5.58793583627093e-09, "advantage_min": -0.752219345420599, "advantage_std": 0.8274786025285721, "completion_length": 2352.1875610351562, "epoch": 0.448, "grad_norm": 0.43700075149536133, "kl": 0.310302734375, "lambda_div_used": 0.5, "learning_rate": 2.2196411766036487e-07, "loss": 0.0189, "reward": 0.17132935347035527, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.17132935347035527, "reward_after_std": 0.8274786174297333, "reward_before_mean": 0.876284271478653, "reward_before_std": 0.7766976878046989, "reward_change_max": 0.0, "reward_change_mean": -0.7049549445509911, "reward_change_min": -1.256986565887928, "reward_change_std": 0.4841836094856262, "reward_std": 0.8274786546826363, "rewards/cosine_scaled_reward": -0.009774532169103622, "rewards/format_reward": 0.8958333507180214, "step": 392 }, { "advantage_max": 2.1338966339826584, "advantage_mean": -2.421438782818086e-08, "advantage_min": -1.005941316485405, "advantage_std": 1.1352594494819641, "completion_length": 2590.8333587646484, "epoch": 0.4491428571428571, "grad_norm": 0.5280225872993469, "kl": 0.28961181640625, "lambda_div_used": 0.5, "learning_rate": 2.1982156097370557e-07, "loss": 0.0103, "reward": 0.3125934284180403, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.3125934284180403, "reward_after_std": 1.1352594494819641, "reward_before_mean": 1.0306870639324188, "reward_before_std": 1.1202639937400818, "reward_change_max": 0.0010864585638046265, "reward_change_mean": -0.7180936299264431, "reward_change_min": -1.4779117703437805, "reward_change_std": 0.5617240853607655, "reward_std": 1.1352594941854477, "rewards/cosine_scaled_reward": 0.09867685753852129, "rewards/format_reward": 0.8333333469927311, "step": 393 }, { "advantage_max": 1.255016416311264, "advantage_mean": 6.829698917520943e-09, "advantage_min": -0.5600562617182732, "advantage_std": 0.656397633254528, "completion_length": 3040.604248046875, "epoch": 0.4502857142857143, "grad_norm": 0.9631423950195312, "kl": 0.3890380859375, "lambda_div_used": 0.5, "learning_rate": 2.1769509671835223e-07, "loss": 0.0085, "reward": -0.19103789888322353, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.19103789888322353, "reward_after_std": 0.656397633254528, "reward_before_mean": 0.25348040368407965, "reward_before_std": 0.5955763068050146, "reward_change_max": 0.0, "reward_change_mean": -0.4445183016359806, "reward_change_min": -0.8603153452277184, "reward_change_std": 0.32232517190277576, "reward_std": 0.6563976444303989, "rewards/cosine_scaled_reward": -0.22742647491395473, "rewards/format_reward": 0.7083333432674408, "step": 394 }, { "advantage_max": 1.6628983914852142, "advantage_mean": -9.313226023710541e-09, "advantage_min": -0.868357315659523, "advantage_std": 0.9002252444624901, "completion_length": 2143.541702270508, "epoch": 0.4514285714285714, "grad_norm": 0.5944837927818298, "kl": 0.2022247314453125, "lambda_div_used": 0.5, "learning_rate": 2.1558482853517253e-07, "loss": -0.0058, "reward": 0.07094830134883523, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.07094830134883523, "reward_after_std": 0.9002252593636513, "reward_before_mean": 0.6628658212721348, "reward_before_std": 0.9293262884020805, "reward_change_max": 0.00043542683124542236, "reward_change_mean": -0.5919175185263157, "reward_change_min": -1.1777884289622307, "reward_change_std": 0.48178502917289734, "reward_std": 0.9002252817153931, "rewards/cosine_scaled_reward": -0.043567102402448654, "rewards/format_reward": 0.7500000149011612, "step": 395 }, { "advantage_max": 1.619390420615673, "advantage_mean": -1.1175870895385742e-08, "advantage_min": -0.6089432537555695, "advantage_std": 0.8248656615614891, "completion_length": 2691.229232788086, "epoch": 0.45257142857142857, "grad_norm": 0.497490257024765, "kl": 0.25360107421875, "lambda_div_used": 0.5, "learning_rate": 2.134908592756607e-07, "loss": -0.0032, "reward": 0.21576578076928854, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.21576578076928854, "reward_after_std": 0.8248656615614891, "reward_before_mean": 0.9347635172307491, "reward_before_std": 0.6298792697489262, "reward_change_max": 0.0, "reward_change_mean": -0.7189977429807186, "reward_change_min": -1.1502107232809067, "reward_change_std": 0.42266157269477844, "reward_std": 0.824865709990263, "rewards/cosine_scaled_reward": 0.009048409294337034, "rewards/format_reward": 0.9166666865348816, "step": 396 }, { "advantage_max": 1.3563490435481071, "advantage_mean": -9.313225801665936e-09, "advantage_min": -0.6180124171078205, "advantage_std": 0.7036689929664135, "completion_length": 2026.4167175292969, "epoch": 0.45371428571428574, "grad_norm": 0.31323692202568054, "kl": 0.11944580078125, "lambda_div_used": 0.5, "learning_rate": 2.1141329099692406e-07, "loss": -0.0147, "reward": 0.08167102443985641, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.08167102443985641, "reward_after_std": 0.7036689929664135, "reward_before_mean": 0.7375619001686573, "reward_before_std": 0.5679947603493929, "reward_change_max": 0.0009193271398544312, "reward_change_mean": -0.6558908764272928, "reward_change_min": -0.989147812128067, "reward_change_std": 0.3919084258377552, "reward_std": 0.7036690339446068, "rewards/cosine_scaled_reward": -0.027052395045757294, "rewards/format_reward": 0.7916666828095913, "step": 397 }, { "advantage_max": 1.7759979516267776, "advantage_mean": 7.45058065243498e-09, "advantage_min": -0.6430985629558563, "advantage_std": 0.9113304987549782, "completion_length": 2401.791717529297, "epoch": 0.45485714285714285, "grad_norm": 1.0113723278045654, "kl": 0.3887786865234375, "lambda_div_used": 0.5, "learning_rate": 2.0935222495670968e-07, "loss": 0.016, "reward": 0.09548487048596144, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.09548487048596144, "reward_after_std": 0.9113305136561394, "reward_before_mean": 0.6916496542398818, "reward_before_std": 0.7774554453790188, "reward_change_max": 0.0, "reward_change_mean": -0.5961647741496563, "reward_change_min": -1.03419828414917, "reward_change_std": 0.38981775380671024, "reward_std": 0.9113305732607841, "rewards/cosine_scaled_reward": -0.018758506514132023, "rewards/format_reward": 0.7291666828095913, "step": 398 }, { "advantage_max": 2.001268118619919, "advantage_mean": -1.924733378233512e-08, "advantage_min": -0.8032764531672001, "advantage_std": 1.0470364093780518, "completion_length": 2351.2500762939453, "epoch": 0.456, "grad_norm": 0.7004992365837097, "kl": 0.22613525390625, "lambda_div_used": 0.5, "learning_rate": 2.0730776160846853e-07, "loss": 0.0115, "reward": 0.30928437784314156, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.30928437784314156, "reward_after_std": 1.047036424279213, "reward_before_mean": 1.0472409576177597, "reward_before_std": 0.9616851769387722, "reward_change_max": 0.0, "reward_change_mean": -0.7379565685987473, "reward_change_min": -1.4024077132344246, "reward_change_std": 0.5100381188094616, "reward_std": 1.0470364391803741, "rewards/cosine_scaled_reward": 0.05487045622430742, "rewards/format_reward": 0.9375000074505806, "step": 399 }, { "advantage_max": 1.948826715350151, "advantage_mean": -1.6142924996742636e-08, "advantage_min": -0.8613158576190472, "advantage_std": 1.0191247761249542, "completion_length": 1702.4584121704102, "epoch": 0.45714285714285713, "grad_norm": 0.3669349253177643, "kl": 0.1160430908203125, "lambda_div_used": 0.5, "learning_rate": 2.0528000059645995e-07, "loss": -0.0115, "reward": 0.41814972274005413, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.41814972274005413, "reward_after_std": 1.0191247910261154, "reward_before_mean": 1.2584092626348138, "reward_before_std": 0.8928911443799734, "reward_change_max": 0.0, "reward_change_mean": -0.8402595855295658, "reward_change_min": -1.408696487545967, "reward_change_std": 0.545951347798109, "reward_std": 1.019124835729599, "rewards/cosine_scaled_reward": 0.1917046275921166, "rewards/format_reward": 0.8750000037252903, "step": 400 }, { "advantage_max": 1.6143862754106522, "advantage_mean": -1.3659397724019584e-08, "advantage_min": -0.7051677592098713, "advantage_std": 0.8424163311719894, "completion_length": 2862.0209350585938, "epoch": 0.4582857142857143, "grad_norm": 0.35205700993537903, "kl": 0.2542724609375, "lambda_div_used": 0.5, "learning_rate": 2.032690407508949e-07, "loss": 0.0116, "reward": 0.2912533753551543, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.2912533753551543, "reward_after_std": 0.8424163162708282, "reward_before_mean": 1.0814423598349094, "reward_before_std": 0.6909527480602264, "reward_change_max": 0.0003457888960838318, "reward_change_mean": -0.7901889868080616, "reward_change_min": -1.2221315279603004, "reward_change_std": 0.4830316975712776, "reward_std": 0.8424163609743118, "rewards/cosine_scaled_reward": 0.10322117432951927, "rewards/format_reward": 0.8750000074505806, "step": 401 }, { "advantage_max": 1.3721887990832329, "advantage_mean": 1.1796752963366686e-08, "advantage_min": -0.639917079359293, "advantage_std": 0.7129794172942638, "completion_length": 2224.3333740234375, "epoch": 0.4594285714285714, "grad_norm": 0.7739508748054504, "kl": 0.1800537109375, "lambda_div_used": 0.5, "learning_rate": 2.0127498008311922e-07, "loss": 0.0708, "reward": 0.03190509416162968, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.03190509416162968, "reward_after_std": 0.7129794359207153, "reward_before_mean": 0.6415856056846678, "reward_before_std": 0.6075125262141228, "reward_change_max": 0.0, "reward_change_mean": -0.609680525958538, "reward_change_min": -0.9582468569278717, "reward_change_std": 0.382707916200161, "reward_std": 0.7129794433712959, "rewards/cosine_scaled_reward": -0.12712387926876545, "rewards/format_reward": 0.8958333432674408, "step": 402 }, { "advantage_max": 1.3863247409462929, "advantage_mean": -1.8626450382086546e-09, "advantage_min": -0.633481714874506, "advantage_std": 0.7379178702831268, "completion_length": 2153.1042251586914, "epoch": 0.4605714285714286, "grad_norm": 0.45951032638549805, "kl": 0.164215087890625, "lambda_div_used": 0.5, "learning_rate": 1.9929791578083655e-07, "loss": -0.0093, "reward": 0.006359277293086052, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.006359277293086052, "reward_after_std": 0.7379178553819656, "reward_before_mean": 0.5869014756754041, "reward_before_std": 0.687812514603138, "reward_change_max": 0.0007327944040298462, "reward_change_mean": -0.5805421750992537, "reward_change_min": -1.0237832926213741, "reward_change_std": 0.4055717270821333, "reward_std": 0.7379178702831268, "rewards/cosine_scaled_reward": -0.07113261707127094, "rewards/format_reward": 0.7291666697710752, "step": 403 }, { "advantage_max": 1.2565838545560837, "advantage_mean": 1.2417634698280722e-09, "advantage_min": -0.6241967566311359, "advantage_std": 0.6679689809679985, "completion_length": 2440.2917404174805, "epoch": 0.4617142857142857, "grad_norm": 0.26222601532936096, "kl": 0.2291107177734375, "lambda_div_used": 0.5, "learning_rate": 1.9733794420337213e-07, "loss": 0.0283, "reward": -0.009857988567091525, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.009857988567091525, "reward_after_std": 0.6679689884185791, "reward_before_mean": 0.5863540228456259, "reward_before_std": 0.5875301137566566, "reward_change_max": 0.002009287476539612, "reward_change_mean": -0.59621203225106, "reward_change_min": -1.0139516070485115, "reward_change_std": 0.3946251608431339, "reward_std": 0.66796899959445, "rewards/cosine_scaled_reward": -0.06098964437842369, "rewards/format_reward": 0.708333345130086, "step": 404 }, { "advantage_max": 1.655549019575119, "advantage_mean": 6.208817460162663e-09, "advantage_min": -0.7989021204411983, "advantage_std": 0.8907373733818531, "completion_length": 2022.0833892822266, "epoch": 0.46285714285714286, "grad_norm": 0.33377718925476074, "kl": 0.201019287109375, "lambda_div_used": 0.5, "learning_rate": 1.9539516087697517e-07, "loss": 0.0165, "reward": 0.1627146191895008, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.1627146191895008, "reward_after_std": 0.8907373733818531, "reward_before_mean": 0.8349231742322445, "reward_before_std": 0.8655808642506599, "reward_change_max": 0.0005633682012557983, "reward_change_mean": -0.6722085531800985, "reward_change_min": -1.2954475656151772, "reward_change_std": 0.4969491269439459, "reward_std": 0.8907373957335949, "rewards/cosine_scaled_reward": 0.0528782494366169, "rewards/format_reward": 0.7291666828095913, "step": 405 }, { "advantage_max": 1.8264697641134262, "advantage_mean": -4.967053546245381e-09, "advantage_min": -0.7284509651362896, "advantage_std": 0.9489434212446213, "completion_length": 2138.2083587646484, "epoch": 0.464, "grad_norm": 0.3909973204135895, "kl": 0.17059326171875, "lambda_div_used": 0.5, "learning_rate": 1.934696604901642e-07, "loss": -0.003, "reward": 0.1820035995915532, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.1820035995915532, "reward_after_std": 0.9489434361457825, "reward_before_mean": 0.844778798520565, "reward_before_std": 0.8595172390341759, "reward_change_max": 0.0005043521523475647, "reward_change_mean": -0.6627751663327217, "reward_change_min": -1.2455066293478012, "reward_change_std": 0.4535139240324497, "reward_std": 0.9489434435963631, "rewards/cosine_scaled_reward": 0.0057227155193686485, "rewards/format_reward": 0.8333333395421505, "step": 406 }, { "advantage_max": 1.3399841859936714, "advantage_mean": -1.6763806731656672e-08, "advantage_min": -0.43382854759693146, "advantage_std": 0.6706324480473995, "completion_length": 2403.0208892822266, "epoch": 0.46514285714285714, "grad_norm": 0.22056791186332703, "kl": 0.1808319091796875, "lambda_div_used": 0.5, "learning_rate": 1.915615368891117e-07, "loss": 0.0168, "reward": 0.16278862720355392, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.16278862720355392, "reward_after_std": 0.670632466673851, "reward_before_mean": 0.8940027691423893, "reward_before_std": 0.399973401799798, "reward_change_max": 0.0, "reward_change_mean": -0.7312141358852386, "reward_change_min": -1.0532505437731743, "reward_change_std": 0.3912728149443865, "reward_std": 0.6706324815750122, "rewards/cosine_scaled_reward": 0.040751357562839985, "rewards/format_reward": 0.8125000055879354, "step": 407 }, { "advantage_max": 1.7584019675850868, "advantage_mean": -2.85605594174676e-08, "advantage_min": -0.6405363604426384, "advantage_std": 0.9045485965907574, "completion_length": 2371.291748046875, "epoch": 0.4662857142857143, "grad_norm": 0.550900936126709, "kl": 0.2185821533203125, "lambda_div_used": 0.5, "learning_rate": 1.8967088307307e-07, "loss": 0.0123, "reward": 0.32463838160037994, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.32463838160037994, "reward_after_std": 0.9045485965907574, "reward_before_mean": 1.120386364404112, "reward_before_std": 0.6943012624979019, "reward_change_max": 0.0008317306637763977, "reward_change_mean": -0.7957480065524578, "reward_change_min": -1.2995992079377174, "reward_change_std": 0.5055977776646614, "reward_std": 0.9045485965907574, "rewards/cosine_scaled_reward": 0.15394318150356412, "rewards/format_reward": 0.8125000186264515, "step": 408 }, { "advantage_max": 1.32723917812109, "advantage_mean": 1.4901162082026076e-08, "advantage_min": -0.6812616810202599, "advantage_std": 0.705879982560873, "completion_length": 3048.041717529297, "epoch": 0.4674285714285714, "grad_norm": 0.5131399631500244, "kl": 0.278533935546875, "lambda_div_used": 0.5, "learning_rate": 1.8779779118983867e-07, "loss": 0.0211, "reward": -0.08907385356724262, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.08907385356724262, "reward_after_std": 0.705879982560873, "reward_before_mean": 0.43432591343298554, "reward_before_std": 0.6671581901609898, "reward_change_max": 0.0005308240652084351, "reward_change_mean": -0.5233997739851475, "reward_change_min": -0.9856052212417126, "reward_change_std": 0.38404569029808044, "reward_std": 0.7058799862861633, "rewards/cosine_scaled_reward": -0.09533703187480569, "rewards/format_reward": 0.6250000186264515, "step": 409 }, { "advantage_max": 1.9289898574352264, "advantage_mean": -2.483527050678447e-09, "advantage_min": -0.6834102421998978, "advantage_std": 0.9876251555979252, "completion_length": 2450.3750381469727, "epoch": 0.4685714285714286, "grad_norm": 1.2392657995224, "kl": 0.25274658203125, "lambda_div_used": 0.5, "learning_rate": 1.8594235253127372e-07, "loss": 0.0502, "reward": 0.04491107352077961, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.04491107352077961, "reward_after_std": 0.987625103443861, "reward_before_mean": 0.5722492169588804, "reward_before_std": 0.8849713280797005, "reward_change_max": 0.0, "reward_change_mean": -0.5273381508886814, "reward_change_min": -1.0481892675161362, "reward_change_std": 0.380878571420908, "reward_std": 0.9876251295208931, "rewards/cosine_scaled_reward": -0.06804206012748182, "rewards/format_reward": 0.7083333358168602, "step": 410 }, { "advantage_max": 1.8131102174520493, "advantage_mean": -4.346172532976311e-09, "advantage_min": -0.8035260625183582, "advantage_std": 0.9476320967078209, "completion_length": 2871.729278564453, "epoch": 0.4697142857142857, "grad_norm": 0.8493767976760864, "kl": 0.27581787109375, "lambda_div_used": 0.5, "learning_rate": 1.8410465752883758e-07, "loss": 0.0695, "reward": 0.2603566190227866, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.2603566190227866, "reward_after_std": 0.9476321116089821, "reward_before_mean": 0.9874628521502018, "reward_before_std": 0.8463159576058388, "reward_change_max": 0.0006547495722770691, "reward_change_mean": -0.7271062415093184, "reward_change_min": -1.2260434813797474, "reward_change_std": 0.4771372377872467, "reward_std": 0.9476321414113045, "rewards/cosine_scaled_reward": 0.08748140814714134, "rewards/format_reward": 0.8125000111758709, "step": 411 }, { "advantage_max": 1.9703435078263283, "advantage_mean": -3.7252904094842165e-09, "advantage_min": -0.9085872285068035, "advantage_std": 1.0412596613168716, "completion_length": 2834.0208892822266, "epoch": 0.47085714285714286, "grad_norm": 1.3024187088012695, "kl": 0.22625732421875, "lambda_div_used": 0.5, "learning_rate": 1.822847957491922e-07, "loss": 0.0245, "reward": 0.18393388949334621, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.18393388949334621, "reward_after_std": 1.041259691119194, "reward_before_mean": 0.8306164983659983, "reward_before_std": 1.0156190879642963, "reward_change_max": 0.0005686357617378235, "reward_change_mean": -0.6466826163232327, "reward_change_min": -1.296832486987114, "reward_change_std": 0.5055333897471428, "reward_std": 1.0412597358226776, "rewards/cosine_scaled_reward": 0.04030823614448309, "rewards/format_reward": 0.7500000074505806, "step": 412 }, { "advantage_max": 1.5172990262508392, "advantage_mean": -3.104408619059029e-09, "advantage_min": -0.7004434801638126, "advantage_std": 0.803725078701973, "completion_length": 2530.0834197998047, "epoch": 0.472, "grad_norm": 0.7498294115066528, "kl": 0.19140625, "lambda_div_used": 0.5, "learning_rate": 1.804828558898332e-07, "loss": 0.0558, "reward": 0.10242907330393791, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.10242907330393791, "reward_after_std": 0.803725078701973, "reward_before_mean": 0.7471863450482488, "reward_before_std": 0.7387400958687067, "reward_change_max": 0.0, "reward_change_mean": -0.6447572745382786, "reward_change_min": -1.173839956521988, "reward_change_std": 0.4428400434553623, "reward_std": 0.8037250824272633, "rewards/cosine_scaled_reward": -0.011823497712612152, "rewards/format_reward": 0.7708333544433117, "step": 413 }, { "advantage_max": 1.4408729001879692, "advantage_mean": 1.8626451825376478e-08, "advantage_min": -0.597054660320282, "advantage_std": 0.7476470805704594, "completion_length": 3157.0625610351562, "epoch": 0.47314285714285714, "grad_norm": 0.9261155128479004, "kl": 0.346923828125, "lambda_div_used": 0.5, "learning_rate": 1.7869892577476722e-07, "loss": 0.0193, "reward": -0.11320638004690409, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.11320638004690409, "reward_after_std": 0.74764708802104, "reward_before_mean": 0.3687801326159388, "reward_before_std": 0.6687690652906895, "reward_change_max": 0.0027580782771110535, "reward_change_mean": -0.48198647797107697, "reward_change_min": -0.8558552041649818, "reward_change_std": 0.324984148144722, "reward_std": 0.747647114098072, "rewards/cosine_scaled_reward": -0.21144328452646732, "rewards/format_reward": 0.791666679084301, "step": 414 }, { "advantage_max": 1.704764910042286, "advantage_mean": -7.450580929990736e-09, "advantage_min": -0.9032006934285164, "advantage_std": 0.9267212748527527, "completion_length": 3225.979278564453, "epoch": 0.4742857142857143, "grad_norm": 0.5711144208908081, "kl": 0.39208984375, "lambda_div_used": 0.5, "learning_rate": 1.7693309235023127e-07, "loss": 0.0613, "reward": 0.13933263439685106, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.13933263439685106, "reward_after_std": 0.9267212599515915, "reward_before_mean": 0.7884473074227571, "reward_before_std": 0.9520576298236847, "reward_change_max": 0.0004018843173980713, "reward_change_mean": -0.6491146758198738, "reward_change_min": -1.266348458826542, "reward_change_std": 0.5081395395100117, "reward_std": 0.9267212674021721, "rewards/cosine_scaled_reward": 0.019223633222281933, "rewards/format_reward": 0.7500000298023224, "step": 415 }, { "advantage_max": 1.9375486299395561, "advantage_mean": -3.1044085080367267e-09, "advantage_min": -0.8114347271621227, "advantage_std": 1.0063435733318329, "completion_length": 2236.8958740234375, "epoch": 0.4754285714285714, "grad_norm": 0.6455732583999634, "kl": 0.1776275634765625, "lambda_div_used": 0.5, "learning_rate": 1.7518544168045524e-07, "loss": 0.0696, "reward": 0.17256421316415071, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.17256421316415071, "reward_after_std": 1.0063435807824135, "reward_before_mean": 0.8131153769791126, "reward_before_std": 0.9146289005875587, "reward_change_max": 0.0, "reward_change_mean": -0.6405511423945427, "reward_change_min": -1.2199120596051216, "reward_change_std": 0.4531344957649708, "reward_std": 1.0063436180353165, "rewards/cosine_scaled_reward": 0.0003076721914112568, "rewards/format_reward": 0.8125000055879354, "step": 416 }, { "advantage_max": 1.4362558871507645, "advantage_mean": 1.6142925107764938e-08, "advantage_min": -0.7358811981976032, "advantage_std": 0.7816518843173981, "completion_length": 3103.854248046875, "epoch": 0.4765714285714286, "grad_norm": 0.5906895399093628, "kl": 0.3399658203125, "lambda_div_used": 0.5, "learning_rate": 1.7345605894346726e-07, "loss": 0.0153, "reward": -0.08510342193767428, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.08510342193767428, "reward_after_std": 0.7816518843173981, "reward_before_mean": 0.4210092220455408, "reward_before_std": 0.8183082491159439, "reward_change_max": 0.0, "reward_change_mean": -0.5061126090586185, "reward_change_min": -1.0609179846942425, "reward_change_std": 0.4223988838493824, "reward_std": 0.7816519141197205, "rewards/cosine_scaled_reward": -0.11241207923740149, "rewards/format_reward": 0.6458333600312471, "step": 417 }, { "advantage_max": 1.923146240413189, "advantage_mean": -1.8316011041186187e-08, "advantage_min": -0.8123641051352024, "advantage_std": 0.9936596788465977, "completion_length": 2348.250072479248, "epoch": 0.4777142857142857, "grad_norm": 0.48226863145828247, "kl": 0.217620849609375, "lambda_div_used": 0.5, "learning_rate": 1.7174502842694212e-07, "loss": 0.0215, "reward": 0.39290976664051414, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.39290976664051414, "reward_after_std": 0.9936597011983395, "reward_before_mean": 1.216740008443594, "reward_before_std": 0.8273234628140926, "reward_change_max": 0.0, "reward_change_mean": -0.8238302320241928, "reward_change_min": -1.355735719203949, "reward_change_std": 0.5216735042631626, "reward_std": 0.9936597235500813, "rewards/cosine_scaled_reward": 0.17086999164894223, "rewards/format_reward": 0.8750000074505806, "step": 418 }, { "advantage_max": 1.910450629889965, "advantage_mean": -1.738468857759301e-08, "advantage_min": -0.8536994196474552, "advantage_std": 1.012555181980133, "completion_length": 2614.666748046875, "epoch": 0.47885714285714287, "grad_norm": 0.873965859413147, "kl": 0.208770751953125, "lambda_div_used": 0.5, "learning_rate": 1.7005243352409333e-07, "loss": 0.0198, "reward": 0.08720649965107441, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.08720649965107441, "reward_after_std": 1.012555181980133, "reward_before_mean": 0.6550634186714888, "reward_before_std": 1.015907321125269, "reward_change_max": 0.0020074471831321716, "reward_change_mean": -0.5678569041192532, "reward_change_min": -1.0999210849404335, "reward_change_std": 0.4567646738141775, "reward_std": 1.012555219233036, "rewards/cosine_scaled_reward": -0.037051646038889885, "rewards/format_reward": 0.7291666828095913, "step": 419 }, { "advantage_max": 1.4565354362130165, "advantage_mean": 4.967053879312289e-09, "advantage_min": -0.7246251739561558, "advantage_std": 0.7659836560487747, "completion_length": 2339.833381652832, "epoch": 0.48, "grad_norm": 0.440544456243515, "kl": 0.2689208984375, "lambda_div_used": 0.5, "learning_rate": 1.6837835672960831e-07, "loss": 0.0333, "reward": 0.05484509962843731, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.05484509962843731, "reward_after_std": 0.7659836485981941, "reward_before_mean": 0.6696557952091098, "reward_before_std": 0.6934031620621681, "reward_change_max": 0.0011351853609085083, "reward_change_mean": -0.6148106604814529, "reward_change_min": -1.0451572611927986, "reward_change_std": 0.4126305319368839, "reward_std": 0.7659836895763874, "rewards/cosine_scaled_reward": -0.0818388033658266, "rewards/format_reward": 0.833333358168602, "step": 420 }, { "advantage_max": 1.380041942000389, "advantage_mean": 4.346172144398253e-09, "advantage_min": -0.6508830934762955, "advantage_std": 0.7286509647965431, "completion_length": 3022.604248046875, "epoch": 0.48114285714285715, "grad_norm": 0.36839476227760315, "kl": 0.2734375, "lambda_div_used": 0.5, "learning_rate": 1.6672287963562852e-07, "loss": 0.0201, "reward": -0.10270002530887723, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.10270002530887723, "reward_after_std": 0.7286509685218334, "reward_before_mean": 0.40078302239999175, "reward_before_std": 0.6932071186602116, "reward_change_max": 0.0006034299731254578, "reward_change_mean": -0.5034830346703529, "reward_change_min": -0.9662024602293968, "reward_change_std": 0.36624561436474323, "reward_std": 0.728650975972414, "rewards/cosine_scaled_reward": -0.18502516951411963, "rewards/format_reward": 0.7708333544433117, "step": 421 }, { "advantage_max": 1.4664915353059769, "advantage_mean": -1.2417635808503746e-09, "advantage_min": -0.6892965100705624, "advantage_std": 0.7829396314918995, "completion_length": 2686.6042098999023, "epoch": 0.48228571428571426, "grad_norm": 0.6950385570526123, "kl": 0.24615478515625, "lambda_div_used": 0.5, "learning_rate": 1.6508608292777203e-07, "loss": -0.0204, "reward": -0.0029990673065185547, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.0029990673065185547, "reward_after_std": 0.782939612865448, "reward_before_mean": 0.5685033015906811, "reward_before_std": 0.7512042485177517, "reward_change_max": 0.0011105537414550781, "reward_change_mean": -0.5715023390948772, "reward_change_min": -1.053415346890688, "reward_change_std": 0.41840689815580845, "reward_std": 0.7829396203160286, "rewards/cosine_scaled_reward": -0.1115816955716582, "rewards/format_reward": 0.7916666902601719, "step": 422 }, { "advantage_max": 1.5644587278366089, "advantage_mean": 6.829699028543246e-09, "advantage_min": -0.6998874768614769, "advantage_std": 0.8275195769965649, "completion_length": 2256.145881652832, "epoch": 0.48342857142857143, "grad_norm": 0.4108107388019562, "kl": 0.2389373779296875, "lambda_div_used": 0.5, "learning_rate": 1.6346804638120098e-07, "loss": -0.0092, "reward": -0.020866421051323414, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.020866421051323414, "reward_after_std": 0.8275195844471455, "reward_before_mean": 0.5108129326254129, "reward_before_std": 0.8039413914084435, "reward_change_max": 0.0, "reward_change_mean": -0.5316793769598007, "reward_change_min": -1.0674263015389442, "reward_change_std": 0.40754328295588493, "reward_std": 0.8275196105241776, "rewards/cosine_scaled_reward": -0.0987602099776268, "rewards/format_reward": 0.7083333469927311, "step": 423 }, { "advantage_max": 1.4423941150307655, "advantage_mean": 8.071462442860167e-09, "advantage_min": -0.562180645763874, "advantage_std": 0.7390119582414627, "completion_length": 2725.2084045410156, "epoch": 0.4845714285714286, "grad_norm": 0.6571977734565735, "kl": 0.2581787109375, "lambda_div_used": 0.5, "learning_rate": 1.6186884885673413e-07, "loss": 0.0731, "reward": -0.05772208608686924, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.05772208608686924, "reward_after_std": 0.7390119433403015, "reward_before_mean": 0.4672268598806113, "reward_before_std": 0.6280698869377375, "reward_change_max": 0.0, "reward_change_mean": -0.5249489285051823, "reward_change_min": -0.8993471413850784, "reward_change_std": 0.3353810776025057, "reward_std": 0.7390119507908821, "rewards/cosine_scaled_reward": -0.13096991274505854, "rewards/format_reward": 0.7291666809469461, "step": 424 }, { "advantage_max": 1.8298065513372421, "advantage_mean": 4.967054101356894e-09, "advantage_min": -0.79290621727705, "advantage_std": 0.9439056888222694, "completion_length": 2086.1250534057617, "epoch": 0.4857142857142857, "grad_norm": 0.7611392140388489, "kl": 0.239532470703125, "lambda_div_used": 0.5, "learning_rate": 1.6028856829700258e-07, "loss": 0.0603, "reward": 0.516562958873692, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.516562958873692, "reward_after_std": 0.9439056888222694, "reward_before_mean": 1.4521526768803596, "reward_before_std": 0.7164515964686871, "reward_change_max": 0.00011243671178817749, "reward_change_mean": -0.935589674860239, "reward_change_min": -1.415509656071663, "reward_change_std": 0.5498905442655087, "reward_std": 0.9439057037234306, "rewards/cosine_scaled_reward": 0.33024298259988427, "rewards/format_reward": 0.7916666828095913, "step": 425 }, { "advantage_max": 1.8577021807432175, "advantage_mean": 9.313225912688239e-09, "advantage_min": -0.7165437117218971, "advantage_std": 0.9620238281786442, "completion_length": 1966.6458892822266, "epoch": 0.4868571428571429, "grad_norm": 0.23527927696704865, "kl": 0.141265869140625, "lambda_div_used": 0.5, "learning_rate": 1.5872728172265146e-07, "loss": 0.004, "reward": 0.2146987458691001, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.2146987458691001, "reward_after_std": 0.9620238430798054, "reward_before_mean": 0.8918719813227654, "reward_before_std": 0.8488549254834652, "reward_change_max": 0.0008971467614173889, "reward_change_mean": -0.6771732289344072, "reward_change_min": -1.1791602671146393, "reward_change_std": 0.4466224256902933, "reward_std": 0.962023850530386, "rewards/cosine_scaled_reward": 0.018852660432457924, "rewards/format_reward": 0.8541666679084301, "step": 426 }, { "advantage_max": 1.620961919426918, "advantage_mean": -5.58793539218172e-09, "advantage_min": -0.7730313017964363, "advantage_std": 0.8482746072113514, "completion_length": 2883.041748046875, "epoch": 0.488, "grad_norm": 0.36235061287879944, "kl": 0.2977294921875, "lambda_div_used": 0.5, "learning_rate": 1.5718506522858572e-07, "loss": 0.0417, "reward": 0.17976178135722876, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.17976178135722876, "reward_after_std": 0.8482745960354805, "reward_before_mean": 0.8755260724574327, "reward_before_std": 0.7476286813616753, "reward_change_max": 0.0008819624781608582, "reward_change_mean": -0.6957642920315266, "reward_change_min": -1.1062694638967514, "reward_change_std": 0.4417474362999201, "reward_std": 0.8482746034860611, "rewards/cosine_scaled_reward": 0.031513024296145886, "rewards/format_reward": 0.812500013038516, "step": 427 }, { "advantage_max": 1.90454863011837, "advantage_mean": -9.313225690643634e-09, "advantage_min": -0.8171779625117779, "advantage_std": 1.0046098306775093, "completion_length": 2616.3125610351562, "epoch": 0.48914285714285716, "grad_norm": 0.5639275312423706, "kl": 0.276397705078125, "lambda_div_used": 0.5, "learning_rate": 1.5566199398026147e-07, "loss": 0.0197, "reward": 0.23964783176779747, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.23964783176779747, "reward_after_std": 1.0046098455786705, "reward_before_mean": 0.9390165340155363, "reward_before_std": 0.9415052011609077, "reward_change_max": 0.0, "reward_change_mean": -0.6993687264621258, "reward_change_min": -1.348164975643158, "reward_change_std": 0.4951324574649334, "reward_std": 1.0046098679304123, "rewards/cosine_scaled_reward": 0.04242492467164993, "rewards/format_reward": 0.8541666939854622, "step": 428 }, { "advantage_max": 1.7529038935899734, "advantage_mean": 1.1175871561519557e-08, "advantage_min": -0.6102720461785793, "advantage_std": 0.8972237780690193, "completion_length": 2280.5000762939453, "epoch": 0.49028571428571427, "grad_norm": 0.29874134063720703, "kl": 0.1969451904296875, "lambda_div_used": 0.5, "learning_rate": 1.5415814221002265e-07, "loss": 0.0362, "reward": 0.0367011446505785, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.0367011446505785, "reward_after_std": 0.8972237929701805, "reward_before_mean": 0.5863402839750051, "reward_before_std": 0.7750564068555832, "reward_change_max": 0.0, "reward_change_mean": -0.549639143049717, "reward_change_min": -1.0111135095357895, "reward_change_std": 0.3604156728833914, "reward_std": 0.8972238451242447, "rewards/cosine_scaled_reward": -0.09224652778357267, "rewards/format_reward": 0.770833345130086, "step": 429 }, { "advantage_max": 1.4815399199724197, "advantage_mean": -8.692344288796505e-09, "advantage_min": -0.5698779486119747, "advantage_std": 0.7527001649141312, "completion_length": 2175.6459197998047, "epoch": 0.49142857142857144, "grad_norm": 0.29431992769241333, "kl": 0.1795654296875, "lambda_div_used": 0.5, "learning_rate": 1.5267358321348285e-07, "loss": 0.0208, "reward": 0.08262787573039532, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.08262787573039532, "reward_after_std": 0.7527001649141312, "reward_before_mean": 0.7200266793370247, "reward_before_std": 0.5634596981108189, "reward_change_max": 0.0, "reward_change_mean": -0.6373988222330809, "reward_change_min": -0.9826225861907005, "reward_change_std": 0.38093653693795204, "reward_std": 0.7527001947164536, "rewards/cosine_scaled_reward": 0.016263334080576897, "rewards/format_reward": 0.6875000018626451, "step": 430 }, { "advantage_max": 1.316926158964634, "advantage_mean": 3.1044085080367267e-09, "advantage_min": -0.6289765909314156, "advantage_std": 0.7088558524847031, "completion_length": 2656.604202270508, "epoch": 0.49257142857142855, "grad_norm": 0.37576258182525635, "kl": 0.30914306640625, "lambda_div_used": 0.5, "learning_rate": 1.5120838934595337e-07, "loss": 0.0404, "reward": -0.039196502417325974, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.039196502417325974, "reward_after_std": 0.7088558524847031, "reward_before_mean": 0.5267140232026577, "reward_before_std": 0.6843018792569637, "reward_change_max": 0.0008359923958778381, "reward_change_mean": -0.5659105181694031, "reward_change_min": -1.0442478582262993, "reward_change_std": 0.4136344399303198, "reward_std": 0.7088558599352837, "rewards/cosine_scaled_reward": -0.12205966003239155, "rewards/format_reward": 0.7708333469927311, "step": 431 }, { "advantage_max": 1.4318501353263855, "advantage_mean": -1.179675312990014e-08, "advantage_min": -0.808821115642786, "advantage_std": 0.7790350094437599, "completion_length": 2498.979217529297, "epoch": 0.4937142857142857, "grad_norm": 0.3581421375274658, "kl": 0.3209228515625, "lambda_div_used": 0.5, "learning_rate": 1.4976263201891613e-07, "loss": 0.0232, "reward": -0.008112411946058273, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.008112411946058273, "reward_after_std": 0.7790350392460823, "reward_before_mean": 0.5635525565594435, "reward_before_std": 0.7991535924375057, "reward_change_max": 0.0005855560302734375, "reward_change_mean": -0.5716649480164051, "reward_change_min": -1.0400940477848053, "reward_change_std": 0.4370091240853071, "reward_std": 0.7790350429713726, "rewards/cosine_scaled_reward": -0.05155707709491253, "rewards/format_reward": 0.6666666939854622, "step": 432 }, { "advantage_max": 1.39846720546484, "advantage_mean": 1.2417631367611648e-09, "advantage_min": -0.6483557894825935, "advantage_std": 0.7378144934773445, "completion_length": 2703.9167098999023, "epoch": 0.4948571428571429, "grad_norm": 0.4320548176765442, "kl": 0.274993896484375, "lambda_div_used": 0.5, "learning_rate": 1.483363816965435e-07, "loss": 0.004, "reward": 0.14599608164280653, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.14599608164280653, "reward_after_std": 0.7378145009279251, "reward_before_mean": 0.8464012066833675, "reward_before_std": 0.6108515709638596, "reward_change_max": 0.0, "reward_change_mean": -0.70040512830019, "reward_change_min": -1.126664161682129, "reward_change_std": 0.4516938291490078, "reward_std": 0.7378145381808281, "rewards/cosine_scaled_reward": 0.03778391517698765, "rewards/format_reward": 0.7708333507180214, "step": 433 }, { "advantage_max": 1.0967141687870026, "advantage_mean": 1.3348957383918503e-08, "advantage_min": -0.5869733244180679, "advantage_std": 0.5921955332159996, "completion_length": 2893.7500610351562, "epoch": 0.496, "grad_norm": 0.4491467773914337, "kl": 0.31512451171875, "lambda_div_used": 0.5, "learning_rate": 1.469297078922642e-07, "loss": 0.023, "reward": -0.17803913820534945, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.17803913820534945, "reward_after_std": 0.5921955332159996, "reward_before_mean": 0.3083352339453995, "reward_before_std": 0.5735643208026886, "reward_change_max": 0.0010740086436271667, "reward_change_mean": -0.4863743484020233, "reward_change_min": -0.881128154695034, "reward_change_std": 0.34586321376264095, "reward_std": 0.5921955406665802, "rewards/cosine_scaled_reward": -0.23124905675649643, "rewards/format_reward": 0.770833358168602, "step": 434 }, { "advantage_max": 1.555536113679409, "advantage_mean": -9.934107758624577e-09, "advantage_min": -0.6525837108492851, "advantage_std": 0.8070865571498871, "completion_length": 2240.395866394043, "epoch": 0.49714285714285716, "grad_norm": 0.6314480900764465, "kl": 0.2317047119140625, "lambda_div_used": 0.5, "learning_rate": 1.4554267916537495e-07, "loss": 0.0486, "reward": 0.04277882166206837, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.04277882166206837, "reward_after_std": 0.8070865720510483, "reward_before_mean": 0.6320224069058895, "reward_before_std": 0.7188326716423035, "reward_change_max": 0.002196013927459717, "reward_change_mean": -0.5892436020076275, "reward_change_min": -1.0437349304556847, "reward_change_std": 0.39640600606799126, "reward_std": 0.8070865944027901, "rewards/cosine_scaled_reward": -0.09023878816515207, "rewards/format_reward": 0.8125000111758709, "step": 435 }, { "advantage_max": 1.803834691643715, "advantage_mean": -8.692344399818808e-09, "advantage_min": -0.9135415107011795, "advantage_std": 0.9677546098828316, "completion_length": 2222.687568664551, "epoch": 0.4982857142857143, "grad_norm": 0.7510201334953308, "kl": 0.2289886474609375, "lambda_div_used": 0.5, "learning_rate": 1.4417536311769885e-07, "loss": -0.0192, "reward": 0.2787493225187063, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.2787493225187063, "reward_after_std": 0.967754602432251, "reward_before_mean": 1.0235793087631464, "reward_before_std": 0.9413213469088078, "reward_change_max": 0.001415349543094635, "reward_change_mean": -0.7448300048708916, "reward_change_min": -1.3619326502084732, "reward_change_std": 0.5367412976920605, "reward_std": 0.9677546247839928, "rewards/cosine_scaled_reward": 0.10553962551057339, "rewards/format_reward": 0.8125000111758709, "step": 436 }, { "advantage_max": 1.3153732120990753, "advantage_mean": -1.241763464276957e-08, "advantage_min": -0.5812518484890461, "advantage_std": 0.6810687147080898, "completion_length": 2788.8126220703125, "epoch": 0.49942857142857144, "grad_norm": 0.6309353709220886, "kl": 0.2413330078125, "lambda_div_used": 0.5, "learning_rate": 1.4282782639029128e-07, "loss": -0.0033, "reward": -0.02365095540881157, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.02365095540881157, "reward_after_std": 0.681068692356348, "reward_before_mean": 0.555051636591088, "reward_before_std": 0.5716553665697575, "reward_change_max": 0.0, "reward_change_mean": -0.5787026062607765, "reward_change_min": -0.9294457882642746, "reward_change_std": 0.3546513319015503, "reward_std": 0.6810687109827995, "rewards/cosine_scaled_reward": -0.14955753087997437, "rewards/format_reward": 0.854166679084301, "step": 437 }, { "advantage_max": 1.6681238859891891, "advantage_mean": 0.0, "advantage_min": -0.6987838484346867, "advantage_std": 0.8795321509242058, "completion_length": 2744.104248046875, "epoch": 0.5005714285714286, "grad_norm": 0.7771314382553101, "kl": 0.3111572265625, "lambda_div_used": 0.5, "learning_rate": 1.4150013466019114e-07, "loss": 0.035, "reward": 0.0005856994539499283, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.0005856994539499283, "reward_after_std": 0.8795321434736252, "reward_before_mean": 0.5383487353101373, "reward_before_std": 0.8421776629984379, "reward_change_max": 0.0006900280714035034, "reward_change_mean": -0.5377630740404129, "reward_change_min": -1.0730514042079449, "reward_change_std": 0.4223514683544636, "reward_std": 0.87953220307827, "rewards/cosine_scaled_reward": -0.0745756197720766, "rewards/format_reward": 0.6875000093132257, "step": 438 }, { "advantage_max": 1.3797827512025833, "advantage_mean": -2.483526884144993e-08, "advantage_min": -0.6489520594477654, "advantage_std": 0.7218637317419052, "completion_length": 2213.416717529297, "epoch": 0.5017142857142857, "grad_norm": 0.3220473527908325, "kl": 0.212005615234375, "lambda_div_used": 0.5, "learning_rate": 1.4019235263722034e-07, "loss": 0.0013, "reward": -0.04800033336505294, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.04800033336505294, "reward_after_std": 0.7218637317419052, "reward_before_mean": 0.49656740692444146, "reward_before_std": 0.6511244103312492, "reward_change_max": 0.0005797073245048523, "reward_change_mean": -0.5445677675306797, "reward_change_min": -0.9406459517776966, "reward_change_std": 0.3685037661343813, "reward_std": 0.7218637466430664, "rewards/cosine_scaled_reward": -0.12671629822580144, "rewards/format_reward": 0.750000013038516, "step": 439 }, { "advantage_max": 1.1950260624289513, "advantage_mean": 1.1796752796833232e-08, "advantage_min": -0.5321464128792286, "advantage_std": 0.6297563761472702, "completion_length": 2962.5833892822266, "epoch": 0.5028571428571429, "grad_norm": 0.6554329991340637, "kl": 0.315338134765625, "lambda_div_used": 0.5, "learning_rate": 1.3890454406082956e-07, "loss": 0.0261, "reward": -0.10440824367105961, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.10440824367105961, "reward_after_std": 0.6297563761472702, "reward_before_mean": 0.42778288945555687, "reward_before_std": 0.5683407075703144, "reward_change_max": 0.0014156848192214966, "reward_change_mean": -0.5321911424398422, "reward_change_min": -0.9416075497865677, "reward_change_std": 0.35448674857616425, "reward_std": 0.6297563910484314, "rewards/cosine_scaled_reward": -0.18194188922643661, "rewards/format_reward": 0.7916666753590107, "step": 440 }, { "advantage_max": 1.675834745168686, "advantage_mean": -9.934107758624577e-09, "advantage_min": -0.8089264556765556, "advantage_std": 0.8974254056811333, "completion_length": 2361.8750915527344, "epoch": 0.504, "grad_norm": 0.7889370918273926, "kl": 0.27734375, "lambda_div_used": 0.5, "learning_rate": 1.3763677169699217e-07, "loss": 0.0561, "reward": 0.04729715920984745, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": 0.04729715920984745, "reward_after_std": 0.8974253907799721, "reward_before_mean": 0.6199708860367537, "reward_before_std": 0.8931392878293991, "reward_change_max": 0.0019818097352981567, "reward_change_mean": -0.5726737715303898, "reward_change_min": -1.117475550621748, "reward_change_std": 0.44693936966359615, "reward_std": 0.8974254131317139, "rewards/cosine_scaled_reward": -0.0650145672261715, "rewards/format_reward": 0.7500000223517418, "step": 441 }, { "advantage_max": 1.7124353647232056, "advantage_mean": -1.2417634753791873e-08, "advantage_min": -0.7255078367888927, "advantage_std": 0.90172154083848, "completion_length": 2513.104232788086, "epoch": 0.5051428571428571, "grad_norm": 0.49326351284980774, "kl": 0.217529296875, "lambda_div_used": 0.5, "learning_rate": 1.3638909733514452e-07, "loss": 0.0213, "reward": 0.16501678587519564, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.16501678587519564, "reward_after_std": 0.9017215259373188, "reward_before_mean": 0.8267796207219362, "reward_before_std": 0.8075233958661556, "reward_change_max": 0.0017507299780845642, "reward_change_mean": -0.6617628708481789, "reward_change_min": -1.2036407738924026, "reward_change_std": 0.4718140196055174, "reward_std": 0.9017215520143509, "rewards/cosine_scaled_reward": 0.03838980197906494, "rewards/format_reward": 0.7500000111758709, "step": 442 }, { "advantage_max": 1.2792986631393433, "advantage_mean": 1.4280279847511679e-08, "advantage_min": -0.6353005319833755, "advantage_std": 0.6750058308243752, "completion_length": 2782.416763305664, "epoch": 0.5062857142857143, "grad_norm": 0.4678073823451996, "kl": 0.3636474609375, "lambda_div_used": 0.5, "learning_rate": 1.351615817851748e-07, "loss": 0.0335, "reward": -0.15951000433415174, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.15951000433415174, "reward_after_std": 0.6750058382749557, "reward_before_mean": 0.30732602812349796, "reward_before_std": 0.6360847689211369, "reward_change_max": 0.001194782555103302, "reward_change_mean": -0.46683603525161743, "reward_change_min": -0.7657738365232944, "reward_change_std": 0.3201506529003382, "reward_std": 0.6750058494508266, "rewards/cosine_scaled_reward": -0.17967032874003053, "rewards/format_reward": 0.6666666883975267, "step": 443 }, { "advantage_max": 1.3639702945947647, "advantage_mean": -3.1044091186593903e-09, "advantage_min": -0.7017063722014427, "advantage_std": 0.7363520376384258, "completion_length": 2507.6459045410156, "epoch": 0.5074285714285715, "grad_norm": 0.3171975314617157, "kl": 0.2984619140625, "lambda_div_used": 0.5, "learning_rate": 1.3395428487445914e-07, "loss": 0.0351, "reward": -0.00795650389045477, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.00795650389045477, "reward_after_std": 0.7363520339131355, "reward_before_mean": 0.574550624784024, "reward_before_std": 0.7271179482340813, "reward_change_max": 0.0, "reward_change_mean": -0.5825071297585964, "reward_change_min": -1.123153805732727, "reward_change_std": 0.4219965375959873, "reward_std": 0.7363520637154579, "rewards/cosine_scaled_reward": -0.09814136102795601, "rewards/format_reward": 0.7708333544433117, "step": 444 }, { "advantage_max": 1.7807418704032898, "advantage_mean": -1.2417634698280722e-08, "advantage_min": -0.8873736076056957, "advantage_std": 0.9604065492749214, "completion_length": 2646.0209045410156, "epoch": 0.5085714285714286, "grad_norm": 0.555528461933136, "kl": 0.234771728515625, "lambda_div_used": 0.5, "learning_rate": 1.3276726544494571e-07, "loss": 0.006, "reward": 0.1469159945845604, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.1469159945845604, "reward_after_std": 0.9604065716266632, "reward_before_mean": 0.7886058418080211, "reward_before_std": 0.974107701331377, "reward_change_max": 0.0008414238691329956, "reward_change_mean": -0.641689844429493, "reward_change_min": -1.2327545881271362, "reward_change_std": 0.49856342375278473, "reward_std": 0.9604065902531147, "rewards/cosine_scaled_reward": -0.0223637567833066, "rewards/format_reward": 0.8333333507180214, "step": 445 }, { "advantage_max": 1.600982904434204, "advantage_mean": 2.483526884144993e-09, "advantage_min": -0.7412622272968292, "advantage_std": 0.8439879864454269, "completion_length": 2478.3750915527344, "epoch": 0.5097142857142857, "grad_norm": 1.7437587976455688, "kl": 0.28363037109375, "lambda_div_used": 0.5, "learning_rate": 1.316005813502869e-07, "loss": 0.0567, "reward": 0.06024339620489627, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.06024339620489627, "reward_after_std": 0.8439880236983299, "reward_before_mean": 0.6513098394498229, "reward_before_std": 0.7912632301449776, "reward_change_max": 0.0, "reward_change_mean": -0.5910664387047291, "reward_change_min": -1.0746823698282242, "reward_change_std": 0.4154123105108738, "reward_std": 0.8439880311489105, "rewards/cosine_scaled_reward": -0.04934508353471756, "rewards/format_reward": 0.7500000111758709, "step": 446 }, { "advantage_max": 1.665705218911171, "advantage_mean": 4.967053546245381e-09, "advantage_min": -0.7389098927378654, "advantage_std": 0.887108825147152, "completion_length": 2225.3334045410156, "epoch": 0.5108571428571429, "grad_norm": 0.531279444694519, "kl": 0.19537353515625, "lambda_div_used": 0.5, "learning_rate": 1.3045428945301953e-07, "loss": 0.0211, "reward": 0.08284095581620932, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.08284095581620932, "reward_after_std": 0.8871088400483131, "reward_before_mean": 0.6888705305755138, "reward_before_std": 0.8519846573472023, "reward_change_max": 0.0006781443953514099, "reward_change_mean": -0.6060296073555946, "reward_change_min": -1.178616851568222, "reward_change_std": 0.4650719538331032, "reward_std": 0.8871088586747646, "rewards/cosine_scaled_reward": -0.040981391444802284, "rewards/format_reward": 0.7708333432674408, "step": 447 }, { "advantage_max": 1.5247330516576767, "advantage_mean": -1.2728075537982164e-08, "advantage_min": -0.6495399959385395, "advantage_std": 0.8035099133849144, "completion_length": 2038.2083740234375, "epoch": 0.512, "grad_norm": 0.3488284945487976, "kl": 0.244049072265625, "lambda_div_used": 0.5, "learning_rate": 1.2932844562179352e-07, "loss": 0.0037, "reward": 0.11860301904380322, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.11860301904380322, "reward_after_std": 0.803509958088398, "reward_before_mean": 0.7717763539403677, "reward_before_std": 0.7233231142163277, "reward_change_max": 0.0, "reward_change_mean": -0.6531733274459839, "reward_change_min": -1.1682813242077827, "reward_change_std": 0.4387435019016266, "reward_std": 0.8035099804401398, "rewards/cosine_scaled_reward": -0.02036183699965477, "rewards/format_reward": 0.8125000074505806, "step": 448 }, { "advantage_max": 1.645022451877594, "advantage_mean": -6.208817182606907e-09, "advantage_min": -0.6213822662830353, "advantage_std": 0.8313011080026627, "completion_length": 2265.312530517578, "epoch": 0.5131428571428571, "grad_norm": 0.40414363145828247, "kl": 0.171142578125, "lambda_div_used": 0.5, "learning_rate": 1.2822310472864885e-07, "loss": 0.0142, "reward": 0.1660279119387269, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.1660279119387269, "reward_after_std": 0.8313010893762112, "reward_before_mean": 0.8451403537765145, "reward_before_std": 0.6286548934876919, "reward_change_max": 0.0, "reward_change_mean": -0.6791124641895294, "reward_change_min": -1.0571254119277, "reward_change_std": 0.3865590952336788, "reward_std": 0.8313011229038239, "rewards/cosine_scaled_reward": -0.01492983102798462, "rewards/format_reward": 0.8750000055879354, "step": 449 }, { "advantage_max": 1.3215680569410324, "advantage_mean": -1.7384688799637615e-08, "advantage_min": -0.6460866183042526, "advantage_std": 0.6979735009372234, "completion_length": 2327.875045776367, "epoch": 0.5142857142857142, "grad_norm": 0.342936247587204, "kl": 0.1763763427734375, "lambda_div_used": 0.5, "learning_rate": 1.2713832064634125e-07, "loss": 0.0263, "reward": 0.08694206736981869, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.08694206736981869, "reward_after_std": 0.6979734934866428, "reward_before_mean": 0.7526519640232436, "reward_before_std": 0.5924910455942154, "reward_change_max": 0.00038858503103256226, "reward_change_mean": -0.665709912776947, "reward_change_min": -1.07553119212389, "reward_change_std": 0.42237728647887707, "reward_std": 0.6979735009372234, "rewards/cosine_scaled_reward": -0.029924023896455765, "rewards/format_reward": 0.8125000223517418, "step": 450 }, { "advantage_max": 1.5565338879823685, "advantage_mean": 8.381903282561609e-09, "advantage_min": -0.6266820058226585, "advantage_std": 0.8044048026204109, "completion_length": 2260.375030517578, "epoch": 0.5154285714285715, "grad_norm": 0.5136589407920837, "kl": 0.171142578125, "lambda_div_used": 0.5, "learning_rate": 1.260741462457165e-07, "loss": -0.0038, "reward": 0.22477071173489094, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.22477071173489094, "reward_after_std": 0.8044047877192497, "reward_before_mean": 0.9543191902339458, "reward_before_std": 0.6328833512961864, "reward_change_max": 0.0, "reward_change_mean": -0.729548454284668, "reward_change_min": -1.1411267966032028, "reward_change_std": 0.44372114166617393, "reward_std": 0.8044048100709915, "rewards/cosine_scaled_reward": 0.09174292162060738, "rewards/format_reward": 0.7708333414047956, "step": 451 }, { "advantage_max": 1.6044269427657127, "advantage_mean": -1.2417629147165599e-09, "advantage_min": -0.6126443706452847, "advantage_std": 0.8330548368394375, "completion_length": 2954.604217529297, "epoch": 0.5165714285714286, "grad_norm": 0.5223681330680847, "kl": 0.2507781982421875, "lambda_div_used": 0.5, "learning_rate": 1.2503063339313356e-07, "loss": 0.0356, "reward": -0.0031112791039049625, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.0031112791039049625, "reward_after_std": 0.8330548293888569, "reward_before_mean": 0.5448109768331051, "reward_before_std": 0.7421528398990631, "reward_change_max": 0.000568874180316925, "reward_change_mean": -0.547922252677381, "reward_change_min": -0.9984243176877499, "reward_change_std": 0.38183396589010954, "reward_std": 0.8330548629164696, "rewards/cosine_scaled_reward": 0.0015721451491117477, "rewards/format_reward": 0.5416666753590107, "step": 452 }, { "advantage_max": 1.6805711835622787, "advantage_mean": -7.450580818968433e-09, "advantage_min": -0.7923614680767059, "advantage_std": 0.8882839158177376, "completion_length": 2572.4792098999023, "epoch": 0.5177142857142857, "grad_norm": 0.5597829222679138, "kl": 0.3200225830078125, "lambda_div_used": 0.5, "learning_rate": 1.2400783294793668e-07, "loss": 0.0241, "reward": 0.1551264775916934, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.1551264775916934, "reward_after_std": 0.8882838860154152, "reward_before_mean": 0.8060325691476464, "reward_before_std": 0.8305098079144955, "reward_change_max": 0.0, "reward_change_mean": -0.6509060971438885, "reward_change_min": -1.1471052765846252, "reward_change_std": 0.44953753612935543, "reward_std": 0.8882839009165764, "rewards/cosine_scaled_reward": 0.007182922679930925, "rewards/format_reward": 0.7916666828095913, "step": 453 }, { "advantage_max": 1.5762149766087532, "advantage_mean": -6.208817238118058e-09, "advantage_min": -0.7216840162873268, "advantage_std": 0.8445982038974762, "completion_length": 2720.687530517578, "epoch": 0.5188571428571429, "grad_norm": 0.554466187953949, "kl": 0.1890716552734375, "lambda_div_used": 0.5, "learning_rate": 1.2300579475997657e-07, "loss": 0.0008, "reward": -0.006341944914311171, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.006341944914311171, "reward_after_std": 0.8445982038974762, "reward_before_mean": 0.5473174899816513, "reward_before_std": 0.8355911895632744, "reward_change_max": 0.0, "reward_change_mean": -0.5536594353616238, "reward_change_min": -1.1242877095937729, "reward_change_std": 0.42637988552451134, "reward_std": 0.8445982374250889, "rewards/cosine_scaled_reward": -0.07009126897901297, "rewards/format_reward": 0.687500013038516, "step": 454 }, { "advantage_max": 1.4252081513404846, "advantage_mean": 1.7384688355548406e-08, "advantage_min": -0.6050524637103081, "advantage_std": 0.7403261326253414, "completion_length": 2962.2709045410156, "epoch": 0.52, "grad_norm": 0.8583621382713318, "kl": 0.329345703125, "lambda_div_used": 0.5, "learning_rate": 1.220245676671809e-07, "loss": 0.0206, "reward": -0.07628545328043401, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.07628545328043401, "reward_after_std": 0.7403261326253414, "reward_before_mean": 0.44156032614409924, "reward_before_std": 0.652463223785162, "reward_change_max": 0.0005906671285629272, "reward_change_mean": -0.517845768481493, "reward_change_min": -0.9065490663051605, "reward_change_std": 0.357125923037529, "reward_std": 0.7403261587023735, "rewards/cosine_scaled_reward": -0.15421984996646643, "rewards/format_reward": 0.7500000149011612, "step": 455 }, { "advantage_max": 1.9315543174743652, "advantage_mean": 6.829698862009792e-09, "advantage_min": -0.6773689016699791, "advantage_std": 0.979364488273859, "completion_length": 2906.0416984558105, "epoch": 0.5211428571428571, "grad_norm": 1.3658193349838257, "kl": 0.2467193603515625, "lambda_div_used": 0.5, "learning_rate": 1.2106419949317388e-07, "loss": 0.0767, "reward": 0.042131487280130386, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.042131487280130386, "reward_after_std": 0.9793644994497299, "reward_before_mean": 0.5710029774345458, "reward_before_std": 0.8479274027049541, "reward_change_max": 0.0, "reward_change_mean": -0.5288714915513992, "reward_change_min": -0.9232162311673164, "reward_change_std": 0.35106181912124157, "reward_std": 0.9793645069003105, "rewards/cosine_scaled_reward": -0.08949852362275124, "rewards/format_reward": 0.7500000111758709, "step": 456 }, { "advantage_max": 1.3629868924617767, "advantage_mean": 1.3038516655239363e-08, "advantage_min": -0.5921227112412453, "advantage_std": 0.7106032706797123, "completion_length": 2766.6458740234375, "epoch": 0.5222857142857142, "grad_norm": 0.4502284526824951, "kl": 0.3090972900390625, "lambda_div_used": 0.5, "learning_rate": 1.2012473704494537e-07, "loss": 0.0296, "reward": 0.027612894773483276, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.027612894773483276, "reward_after_std": 0.7106032818555832, "reward_before_mean": 0.6371276685968041, "reward_before_std": 0.605956356972456, "reward_change_max": 0.0, "reward_change_mean": -0.6095147393643856, "reward_change_min": -1.0139854177832603, "reward_change_std": 0.39748547598719597, "reward_std": 0.7106032893061638, "rewards/cosine_scaled_reward": 0.016480496153235435, "rewards/format_reward": 0.6041666753590107, "step": 457 }, { "advantage_max": 1.346614234149456, "advantage_mean": 1.428027990302283e-08, "advantage_min": -0.5400624051690102, "advantage_std": 0.7049011290073395, "completion_length": 2300.4791946411133, "epoch": 0.5234285714285715, "grad_norm": 0.29569682478904724, "kl": 0.207855224609375, "lambda_div_used": 0.5, "learning_rate": 1.1920622611056974e-07, "loss": 0.02, "reward": -0.03890596283599734, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.03890596283599734, "reward_after_std": 0.7049011215567589, "reward_before_mean": 0.517438106238842, "reward_before_std": 0.6150476150214672, "reward_change_max": 0.0, "reward_change_mean": -0.5563440751284361, "reward_change_min": -0.9435974843800068, "reward_change_std": 0.3535274900496006, "reward_std": 0.7049011215567589, "rewards/cosine_scaled_reward": -0.13711428828537464, "rewards/format_reward": 0.7916666679084301, "step": 458 }, { "advantage_max": 1.4628484919667244, "advantage_mean": -4.346171977864799e-09, "advantage_min": -0.65413873270154, "advantage_std": 0.7680129557847977, "completion_length": 2132.062545776367, "epoch": 0.5245714285714286, "grad_norm": 0.40495961904525757, "kl": 0.19858551025390625, "lambda_div_used": 0.5, "learning_rate": 1.1830871145697412e-07, "loss": 0.011, "reward": 0.19387144222855568, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.19387144222855568, "reward_after_std": 0.7680129408836365, "reward_before_mean": 0.9214918408542871, "reward_before_std": 0.6441125757992268, "reward_change_max": 0.0, "reward_change_mean": -0.7276204153895378, "reward_change_min": -1.1330279782414436, "reward_change_std": 0.4475998468697071, "reward_std": 0.7680129408836365, "rewards/cosine_scaled_reward": 0.03366258554160595, "rewards/format_reward": 0.8541666828095913, "step": 459 }, { "advantage_max": 1.3919820860028267, "advantage_mean": 4.6566129563441194e-09, "advantage_min": -0.6502477303147316, "advantage_std": 0.7406989708542824, "completion_length": 3168.9375610351562, "epoch": 0.5257142857142857, "grad_norm": 0.6914077401161194, "kl": 0.31787109375, "lambda_div_used": 0.5, "learning_rate": 1.1743223682775649e-07, "loss": 0.047, "reward": -0.1056380441877991, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.1056380441877991, "reward_after_std": 0.7406989708542824, "reward_before_mean": 0.3909294670447707, "reward_before_std": 0.7229253053665161, "reward_change_max": 0.0012809410691261292, "reward_change_mean": -0.49656750820577145, "reward_change_min": -1.0154965370893478, "reward_change_std": 0.3839583992958069, "reward_std": 0.7406989932060242, "rewards/cosine_scaled_reward": -0.18995194137096405, "rewards/format_reward": 0.7708333544433117, "step": 460 }, { "advantage_max": 1.7813145220279694, "advantage_mean": -8.071462387349015e-09, "advantage_min": -0.9457485042512417, "advantage_std": 0.96094960719347, "completion_length": 2815.604263305664, "epoch": 0.5268571428571428, "grad_norm": 0.6786468625068665, "kl": 0.2430419921875, "lambda_div_used": 0.5, "learning_rate": 1.1657684494105386e-07, "loss": 0.0144, "reward": 0.20053629763424397, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.20053629763424397, "reward_after_std": 0.9609495922923088, "reward_before_mean": 0.8884876975789666, "reward_before_std": 0.9701001457870007, "reward_change_max": 0.0006676092743873596, "reward_change_mean": -0.6879513971507549, "reward_change_min": -1.2457620240747929, "reward_change_std": 0.516149502247572, "reward_std": 0.9609496183693409, "rewards/cosine_scaled_reward": 0.1004938306286931, "rewards/format_reward": 0.6875000111758709, "step": 461 }, { "advantage_max": 1.4213928058743477, "advantage_mean": 6.4222452911266714e-09, "advantage_min": -0.5725329555571079, "advantage_std": 0.7395026087760925, "completion_length": 2614.8125610351562, "epoch": 0.528, "grad_norm": 0.7102219462394714, "kl": 0.2266845703125, "lambda_div_used": 0.5, "learning_rate": 1.1574257748745986e-07, "loss": 0.0127, "reward": -0.033994670637184754, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.033994670637184754, "reward_after_std": 0.7395026162266731, "reward_before_mean": 0.5079695815220475, "reward_before_std": 0.6631190590560436, "reward_change_max": 0.0, "reward_change_mean": -0.5419642440974712, "reward_change_min": -0.986947949975729, "reward_change_std": 0.36053442023694515, "reward_std": 0.7395026385784149, "rewards/cosine_scaled_reward": -0.11059854784980416, "rewards/format_reward": 0.7291666734963655, "step": 462 }, { "advantage_max": 2.1029365062713623, "advantage_mean": 2.4835267176115394e-09, "advantage_min": -0.8385254740715027, "advantage_std": 1.095969557762146, "completion_length": 2805.3333740234375, "epoch": 0.5291428571428571, "grad_norm": 1.0408904552459717, "kl": 0.3365631103515625, "lambda_div_used": 0.5, "learning_rate": 1.1492947512799328e-07, "loss": 0.0304, "reward": 0.1863800287246704, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.1863800287246704, "reward_after_std": 1.0959695428609848, "reward_before_mean": 0.7977765635587275, "reward_before_std": 1.050209902226925, "reward_change_max": 0.0, "reward_change_mean": -0.6113965678960085, "reward_change_min": -1.2300319075584412, "reward_change_std": 0.4674637410789728, "reward_std": 1.0959695726633072, "rewards/cosine_scaled_reward": 0.055138289637397975, "rewards/format_reward": 0.6875000018626451, "step": 463 }, { "advantage_max": 1.4044927433133125, "advantage_mean": -8.692344233285354e-09, "advantage_min": -0.5210793204605579, "advantage_std": 0.7161996513605118, "completion_length": 1916.3125381469727, "epoch": 0.5302857142857142, "grad_norm": 0.8636742830276489, "kl": 0.178009033203125, "lambda_div_used": 0.5, "learning_rate": 1.1413757749211602e-07, "loss": 0.0443, "reward": 0.20765853859484196, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.20765853859484196, "reward_after_std": 0.7161996364593506, "reward_before_mean": 0.9627609495073557, "reward_before_std": 0.48609965573996305, "reward_change_max": 0.0006634891033172607, "reward_change_mean": -0.7551024369895458, "reward_change_min": -1.1348499171435833, "reward_change_std": 0.43385453149676323, "reward_std": 0.7161996439099312, "rewards/cosine_scaled_reward": 0.054297154769301414, "rewards/format_reward": 0.8541666772216558, "step": 464 }, { "advantage_max": 1.9507509917020798, "advantage_mean": -4.967053768289986e-09, "advantage_min": -0.887705996632576, "advantage_std": 1.0343635454773903, "completion_length": 2737.395881652832, "epoch": 0.5314285714285715, "grad_norm": 0.669562578201294, "kl": 0.25555419921875, "lambda_div_used": 0.5, "learning_rate": 1.1336692317580158e-07, "loss": 0.0124, "reward": 0.14993727079126984, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.14993727079126984, "reward_after_std": 1.0343635752797127, "reward_before_mean": 0.765050558373332, "reward_before_std": 1.0290489830076694, "reward_change_max": 0.0, "reward_change_mean": -0.6151133142411709, "reward_change_min": -1.1888877488672733, "reward_change_std": 0.47263885475695133, "reward_std": 1.0343635752797127, "rewards/cosine_scaled_reward": 0.007525268010795116, "rewards/format_reward": 0.7500000279396772, "step": 465 }, { "advantage_max": 1.6696715205907822, "advantage_mean": -2.2351742234860694e-08, "advantage_min": -0.6946392580866814, "advantage_std": 0.8632129430770874, "completion_length": 3091.541748046875, "epoch": 0.5325714285714286, "grad_norm": 1.2827321290969849, "kl": 0.28610992431640625, "lambda_div_used": 0.5, "learning_rate": 1.1261754973965422e-07, "loss": 0.0023, "reward": 0.3170170905068517, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.3170170905068517, "reward_after_std": 0.8632129468023777, "reward_before_mean": 1.1178719718009233, "reward_before_std": 0.6960075739771128, "reward_change_max": 0.0, "reward_change_mean": -0.80085489153862, "reward_change_min": -1.2640929967164993, "reward_change_std": 0.4748579952865839, "reward_std": 0.8632129579782486, "rewards/cosine_scaled_reward": 0.1631026342511177, "rewards/format_reward": 0.7916666716337204, "step": 466 }, { "advantage_max": 1.4761824756860733, "advantage_mean": 1.3038516433194758e-08, "advantage_min": -0.6751852855086327, "advantage_std": 0.7853565439581871, "completion_length": 2753.9375762939453, "epoch": 0.5337142857142857, "grad_norm": 0.511631965637207, "kl": 0.2637786865234375, "lambda_div_used": 0.5, "learning_rate": 1.1188949370707787e-07, "loss": 0.0275, "reward": -0.0035535165516193956, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.0035535165516193956, "reward_after_std": 0.7853565439581871, "reward_before_mean": 0.5619046837091446, "reward_before_std": 0.744366992264986, "reward_change_max": 0.000514104962348938, "reward_change_mean": -0.5654581896960735, "reward_change_min": -1.037429817020893, "reward_change_std": 0.39547770842909813, "reward_std": 0.7853565737605095, "rewards/cosine_scaled_reward": -0.1565476767718792, "rewards/format_reward": 0.8750000223517418, "step": 467 }, { "advantage_max": 1.8538229018449783, "advantage_mean": -1.2417635808503746e-09, "advantage_min": -0.7242231853306293, "advantage_std": 0.9473971761763096, "completion_length": 2780.291732788086, "epoch": 0.5348571428571428, "grad_norm": 0.6883957386016846, "kl": 0.2642822265625, "lambda_div_used": 0.5, "learning_rate": 1.1118279056249653e-07, "loss": 0.0303, "reward": 0.10545292682945728, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.10545292682945728, "reward_after_std": 0.9473971761763096, "reward_before_mean": 0.6927674971520901, "reward_before_std": 0.8236076533794403, "reward_change_max": 0.0, "reward_change_mean": -0.5873145572841167, "reward_change_min": -1.057724367827177, "reward_change_std": 0.38849309273064137, "reward_std": 0.9473971910774708, "rewards/cosine_scaled_reward": -0.04944960871944204, "rewards/format_reward": 0.7916666865348816, "step": 468 }, { "advantage_max": 1.807606466114521, "advantage_mean": 0.0, "advantage_min": -0.7852521277964115, "advantage_std": 0.9635585993528366, "completion_length": 2637.2083740234375, "epoch": 0.536, "grad_norm": 0.83745276927948, "kl": 0.30572509765625, "lambda_div_used": 0.5, "learning_rate": 1.1049747474962444e-07, "loss": 0.0347, "reward": 0.08417265303432941, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.08417265303432941, "reward_after_std": 0.9635586142539978, "reward_before_mean": 0.6696641687303782, "reward_before_std": 0.9555434696376324, "reward_change_max": 0.00029350072145462036, "reward_change_mean": -0.5854915156960487, "reward_change_min": -1.1475455537438393, "reward_change_std": 0.4715314581990242, "reward_std": 0.9635586366057396, "rewards/cosine_scaled_reward": -0.04016792553011328, "rewards/format_reward": 0.7500000186264515, "step": 469 }, { "advantage_max": 1.3150828257203102, "advantage_mean": 1.2107193830823704e-08, "advantage_min": -0.686145231127739, "advantage_std": 0.7068404294550419, "completion_length": 3181.7084045410156, "epoch": 0.5371428571428571, "grad_norm": 0.4413515627384186, "kl": 0.29815673828125, "lambda_div_used": 0.5, "learning_rate": 1.0983357966978745e-07, "loss": 0.0485, "reward": -0.16447675041854382, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": -0.16447675041854382, "reward_after_std": 0.7068404443562031, "reward_before_mean": 0.2920903190970421, "reward_before_std": 0.7121127918362617, "reward_change_max": 0.00026867538690567017, "reward_change_mean": -0.45656704902648926, "reward_change_min": -0.8190466426312923, "reward_change_std": 0.34242652729153633, "reward_std": 0.7068404592573643, "rewards/cosine_scaled_reward": -0.17687150929123163, "rewards/format_reward": 0.6458333544433117, "step": 470 }, { "advantage_max": 1.77982447296381, "advantage_mean": 6.208816794028849e-10, "advantage_min": -0.6413669362664223, "advantage_std": 0.9082127101719379, "completion_length": 2911.2084045410156, "epoch": 0.5382857142857143, "grad_norm": 0.8663266897201538, "kl": 0.2624359130859375, "lambda_div_used": 0.5, "learning_rate": 1.0919113768029517e-07, "loss": 0.0105, "reward": 0.22175636049360037, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.22175636049360037, "reward_after_std": 0.9082127138972282, "reward_before_mean": 0.9290354289114475, "reward_before_std": 0.7207024209201336, "reward_change_max": 0.0, "reward_change_mean": -0.7072790637612343, "reward_change_min": -1.1375292390584946, "reward_change_std": 0.42616303265094757, "reward_std": 0.9082127138972282, "rewards/cosine_scaled_reward": 0.04785105166956782, "rewards/format_reward": 0.8333333469927311, "step": 471 }, { "advantage_max": 1.4702309519052505, "advantage_mean": -8.692344288796505e-09, "advantage_min": -0.6833783108741045, "advantage_std": 0.7934335358440876, "completion_length": 2542.3958892822266, "epoch": 0.5394285714285715, "grad_norm": 0.3770515024662018, "kl": 0.21100616455078125, "lambda_div_used": 0.5, "learning_rate": 1.0857018009286381e-07, "loss": -0.0088, "reward": -0.08202749770134687, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.08202749770134687, "reward_after_std": 0.7934335470199585, "reward_before_mean": 0.42360311560332775, "reward_before_std": 0.8034615498036146, "reward_change_max": 0.0008336007595062256, "reward_change_mean": -0.5056306086480618, "reward_change_min": -0.9713359847664833, "reward_change_std": 0.4131198935210705, "reward_std": 0.7934335544705391, "rewards/cosine_scaled_reward": -0.15278178313747048, "rewards/format_reward": 0.7291666716337204, "step": 472 }, { "advantage_max": 1.4210163056850433, "advantage_mean": -8.692344621863413e-09, "advantage_min": -0.5511983409523964, "advantage_std": 0.7278570458292961, "completion_length": 2934.0000915527344, "epoch": 0.5405714285714286, "grad_norm": 0.3252032399177551, "kl": 0.234100341796875, "lambda_div_used": 0.5, "learning_rate": 1.0797073717209013e-07, "loss": 0.0438, "reward": -0.03442589659243822, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.03442589659243822, "reward_after_std": 0.7278570234775543, "reward_before_mean": 0.515663924627006, "reward_before_std": 0.5890849232673645, "reward_change_max": 0.0013192519545555115, "reward_change_mean": -0.5500898249447346, "reward_change_min": -0.8929934203624725, "reward_change_std": 0.34633047319948673, "reward_std": 0.7278570309281349, "rewards/cosine_scaled_reward": -0.08591805072501302, "rewards/format_reward": 0.6875000093132257, "step": 473 }, { "advantage_max": 1.601642407476902, "advantage_mean": -1.9868215517249155e-08, "advantage_min": -0.6131134703755379, "advantage_std": 0.8238486312329769, "completion_length": 2461.9375610351562, "epoch": 0.5417142857142857, "grad_norm": 0.7367488741874695, "kl": 0.2513580322265625, "lambda_div_used": 0.5, "learning_rate": 1.0739283813397639e-07, "loss": 0.0143, "reward": 0.23803382087498903, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.23803382087498903, "reward_after_std": 0.8238486312329769, "reward_before_mean": 0.9828812517225742, "reward_before_std": 0.628759897314012, "reward_change_max": 0.0, "reward_change_mean": -0.7448474448174238, "reward_change_min": -1.1187105029821396, "reward_change_std": 0.44198982790112495, "reward_std": 0.8238486833870411, "rewards/cosine_scaled_reward": 0.08519062399864197, "rewards/format_reward": 0.812500013038516, "step": 474 }, { "advantage_max": 2.2786176800727844, "advantage_mean": -8.6923440667519e-09, "advantage_min": -1.0382463559508324, "advantage_std": 1.20528145134449, "completion_length": 2290.6041870117188, "epoch": 0.5428571428571428, "grad_norm": 1.510539174079895, "kl": 0.19629669189453125, "lambda_div_used": 0.5, "learning_rate": 1.068365111445064e-07, "loss": 0.0649, "reward": 0.4062812924385071, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.4062812924385071, "reward_after_std": 1.2052814662456512, "reward_before_mean": 1.1780878230929375, "reward_before_std": 1.165225274860859, "reward_change_max": 0.0019550248980522156, "reward_change_mean": -0.7718064785003662, "reward_change_min": -1.4727793186903, "reward_change_std": 0.5788527056574821, "reward_std": 1.2052814960479736, "rewards/cosine_scaled_reward": 0.18279388174414635, "rewards/format_reward": 0.8125000111758709, "step": 475 }, { "advantage_max": 2.106508746743202, "advantage_mean": -3.0112764060064023e-08, "advantage_min": -0.8679858073592186, "advantage_std": 1.1112488955259323, "completion_length": 2717.4375915527344, "epoch": 0.544, "grad_norm": 0.8656560182571411, "kl": 0.22894287109375, "lambda_div_used": 0.5, "learning_rate": 1.063017833182728e-07, "loss": 0.0287, "reward": 0.3367365933954716, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.3367365933954716, "reward_after_std": 1.1112489104270935, "reward_before_mean": 1.082529116421938, "reward_before_std": 1.0571173802018166, "reward_change_max": 0.0, "reward_change_mean": -0.7457925379276276, "reward_change_min": -1.5623594596982002, "reward_change_std": 0.5568713434040546, "reward_std": 1.111248940229416, "rewards/cosine_scaled_reward": 0.07251455070218071, "rewards/format_reward": 0.9375000149011612, "step": 476 }, { "advantage_max": 1.8266699612140656, "advantage_mean": 1.862645149230957e-09, "advantage_min": -0.6633548140525818, "advantage_std": 0.9389680698513985, "completion_length": 1962.6667251586914, "epoch": 0.5451428571428572, "grad_norm": 0.4617604613304138, "kl": 0.106781005859375, "lambda_div_used": 0.5, "learning_rate": 1.0578868071715544e-07, "loss": 0.0078, "reward": 0.46729777520522475, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.46729777520522475, "reward_after_std": 0.9389680847525597, "reward_before_mean": 1.3609278202056885, "reward_before_std": 0.6903122533112764, "reward_change_max": 0.0, "reward_change_mean": -0.8936300873756409, "reward_change_min": -1.3898151367902756, "reward_change_std": 0.518205638974905, "reward_std": 0.9389681071043015, "rewards/cosine_scaled_reward": 0.18046390381641686, "rewards/format_reward": 1.0, "step": 477 }, { "advantage_max": 1.443758599460125, "advantage_mean": 2.0489097030118586e-08, "advantage_min": -0.5808941088616848, "advantage_std": 0.7490430325269699, "completion_length": 2686.3125610351562, "epoch": 0.5462857142857143, "grad_norm": 0.5362579822540283, "kl": 0.19677734375, "lambda_div_used": 0.5, "learning_rate": 1.0529722834905125e-07, "loss": 0.0186, "reward": -0.016255690716207027, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.016255690716207027, "reward_after_std": 0.7490430511534214, "reward_before_mean": 0.5467281192541122, "reward_before_std": 0.6304172240197659, "reward_change_max": 0.0013319402933120728, "reward_change_mean": -0.5629837699234486, "reward_change_min": -0.9032267481088638, "reward_change_std": 0.3670722022652626, "reward_std": 0.749043058604002, "rewards/cosine_scaled_reward": -0.03913595899939537, "rewards/format_reward": 0.6250000055879354, "step": 478 }, { "advantage_max": 1.3233840316534042, "advantage_mean": 6.5192581055750765e-09, "advantage_min": -0.6653562523424625, "advantage_std": 0.7084184885025024, "completion_length": 2855.8750915527344, "epoch": 0.5474285714285714, "grad_norm": 0.9758108258247375, "kl": 0.2758941650390625, "lambda_div_used": 0.5, "learning_rate": 1.0482745016665526e-07, "loss": 0.0006, "reward": -0.041172572411596775, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.041172572411596775, "reward_after_std": 0.7084184885025024, "reward_before_mean": 0.5113696761545725, "reward_before_std": 0.6773869059979916, "reward_change_max": 0.0, "reward_change_mean": -0.5525422282516956, "reward_change_min": -1.0090350657701492, "reward_change_std": 0.3896159194409847, "reward_std": 0.708418533205986, "rewards/cosine_scaled_reward": -0.12973184324800968, "rewards/format_reward": 0.7708333469927311, "step": 479 }, { "advantage_max": 1.533621370792389, "advantage_mean": 6.208817571184966e-09, "advantage_min": -0.6178735308349133, "advantage_std": 0.8067186251282692, "completion_length": 2509.1250228881836, "epoch": 0.5485714285714286, "grad_norm": 0.46223902702331543, "kl": 0.26495361328125, "lambda_div_used": 0.5, "learning_rate": 1.0437936906629334e-07, "loss": 0.0208, "reward": -0.08323651552200317, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.08323651552200317, "reward_after_std": 0.8067186456173658, "reward_before_mean": 0.4114815816283226, "reward_before_std": 0.7638159282505512, "reward_change_max": 0.0004747062921524048, "reward_change_mean": -0.4947180775925517, "reward_change_min": -1.0002856254577637, "reward_change_std": 0.3739140098914504, "reward_std": 0.8067186698317528, "rewards/cosine_scaled_reward": -0.16925923340022564, "rewards/format_reward": 0.7500000074505806, "step": 480 }, { "advantage_max": 1.533097319304943, "advantage_mean": -4.967054434423801e-09, "advantage_min": -0.7590233869850636, "advantage_std": 0.8262660577893257, "completion_length": 3085.5625610351562, "epoch": 0.5497142857142857, "grad_norm": 1.1266975402832031, "kl": 0.211395263671875, "lambda_div_used": 0.5, "learning_rate": 1.0395300688680625e-07, "loss": 0.0325, "reward": 0.06830573407933116, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.06830573407933116, "reward_after_std": 0.8262660764157772, "reward_before_mean": 0.6889901962131262, "reward_before_std": 0.806677432730794, "reward_change_max": 0.0006970912218093872, "reward_change_mean": -0.6206844747066498, "reward_change_min": -1.1201928928494453, "reward_change_std": 0.46400916762650013, "reward_std": 0.8262661173939705, "rewards/cosine_scaled_reward": -0.030504904687404633, "rewards/format_reward": 0.7500000111758709, "step": 481 }, { "advantage_max": 1.8319706320762634, "advantage_mean": -1.676380750881279e-08, "advantage_min": -0.6075238063931465, "advantage_std": 0.9355734586715698, "completion_length": 2757.5625762939453, "epoch": 0.5508571428571428, "grad_norm": 0.35477179288864136, "kl": 0.259674072265625, "lambda_div_used": 0.5, "learning_rate": 1.0354838440848501e-07, "loss": 0.0204, "reward": 0.4476965293288231, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.4476965293288231, "reward_after_std": 0.9355734437704086, "reward_before_mean": 1.3339410591870546, "reward_before_std": 0.6797395423054695, "reward_change_max": 0.00023727118968963623, "reward_change_mean": -0.8862445838749409, "reward_change_min": -1.44765355437994, "reward_change_std": 0.5364833064377308, "reward_std": 0.9355734586715698, "rewards/cosine_scaled_reward": 0.3023871900513768, "rewards/format_reward": 0.7291666753590107, "step": 482 }, { "advantage_max": 1.4121412485837936, "advantage_mean": 4.967053879312289e-09, "advantage_min": -0.6135230548679829, "advantage_std": 0.7367845512926579, "completion_length": 2745.666778564453, "epoch": 0.552, "grad_norm": 1.3282009363174438, "kl": 0.23053741455078125, "lambda_div_used": 0.5, "learning_rate": 1.0316552135205837e-07, "loss": -0.018, "reward": 0.031401316984556615, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.031401316984556615, "reward_after_std": 0.7367845550179482, "reward_before_mean": 0.6348350569605827, "reward_before_std": 0.6336484104394913, "reward_change_max": 0.0004684925079345703, "reward_change_mean": -0.6034337263554335, "reward_change_min": -1.0589254647493362, "reward_change_std": 0.3971208855509758, "reward_std": 0.7367845699191093, "rewards/cosine_scaled_reward": -0.12008249387145042, "rewards/format_reward": 0.8750000111758709, "step": 483 }, { "advantage_max": 1.4934806898236275, "advantage_mean": -9.313226356777449e-09, "advantage_min": -0.7406910136342049, "advantage_std": 0.7948021329939365, "completion_length": 2448.104202270508, "epoch": 0.5531428571428572, "grad_norm": 0.9353572726249695, "kl": 0.18560791015625, "lambda_div_used": 0.5, "learning_rate": 1.0280443637773163e-07, "loss": 0.0357, "reward": 0.12793250009417534, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.12793250009417534, "reward_after_std": 0.7948021404445171, "reward_before_mean": 0.8039913112297654, "reward_before_std": 0.7426050752401352, "reward_change_max": 0.001218445599079132, "reward_change_mean": -0.6760587878525257, "reward_change_min": -1.1402862071990967, "reward_change_std": 0.46075804345309734, "reward_std": 0.7948021776974201, "rewards/cosine_scaled_reward": 0.058245645835995674, "rewards/format_reward": 0.687500013038516, "step": 484 }, { "advantage_max": 1.35654865950346, "advantage_mean": 6.829698917520943e-09, "advantage_min": -0.48635658249258995, "advantage_std": 0.6858345717191696, "completion_length": 2498.666763305664, "epoch": 0.5542857142857143, "grad_norm": 0.23835696280002594, "kl": 0.240142822265625, "lambda_div_used": 0.5, "learning_rate": 1.0246514708427701e-07, "loss": 0.0352, "reward": 0.0007673171348869801, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.0007673171348869801, "reward_after_std": 0.6858346164226532, "reward_before_mean": 0.5846241675317287, "reward_before_std": 0.48853123001754284, "reward_change_max": 0.0003979206085205078, "reward_change_mean": -0.5838568340986967, "reward_change_min": -0.8816848546266556, "reward_change_std": 0.33762601763010025, "reward_std": 0.6858346238732338, "rewards/cosine_scaled_reward": -0.13477126159705222, "rewards/format_reward": 0.8541666828095913, "step": 485 }, { "advantage_max": 1.3995477855205536, "advantage_mean": -3.104408841103634e-09, "advantage_min": -0.6553352884948254, "advantage_std": 0.7347025461494923, "completion_length": 2372.375045776367, "epoch": 0.5554285714285714, "grad_norm": 0.4456922709941864, "kl": 0.206756591796875, "lambda_div_used": 0.5, "learning_rate": 1.0214767000817596e-07, "loss": 0.0034, "reward": 0.10065419168677181, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.10065419168677181, "reward_after_std": 0.7347025536000729, "reward_before_mean": 0.767020778497681, "reward_before_std": 0.6293009147047997, "reward_change_max": 0.0004982724785804749, "reward_change_mean": -0.6663665995001793, "reward_change_min": -1.0780046060681343, "reward_change_std": 0.4334941878914833, "reward_std": 0.7347025647759438, "rewards/cosine_scaled_reward": -0.04357295297086239, "rewards/format_reward": 0.854166679084301, "step": 486 }, { "advantage_max": 1.5991999804973602, "advantage_mean": -9.31322596819939e-09, "advantage_min": -0.6759998686611652, "advantage_std": 0.839877612888813, "completion_length": 2050.2083892822266, "epoch": 0.5565714285714286, "grad_norm": 0.4651009440422058, "kl": 0.16881561279296875, "lambda_div_used": 0.5, "learning_rate": 1.0185202062281336e-07, "loss": 0.0254, "reward": 0.2241103844717145, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.2241103844717145, "reward_after_std": 0.8398776091635227, "reward_before_mean": 0.9572561159729958, "reward_before_std": 0.7173593416810036, "reward_change_max": 0.0006580352783203125, "reward_change_mean": -0.7331457510590553, "reward_change_min": -1.2133165672421455, "reward_change_std": 0.47992801666259766, "reward_std": 0.8398776240646839, "rewards/cosine_scaled_reward": 0.051544721238315105, "rewards/format_reward": 0.8541666716337204, "step": 487 }, { "advantage_max": 1.4238049387931824, "advantage_mean": -3.1044086745701804e-09, "advantage_min": -0.6612549461424351, "advantage_std": 0.7586982101202011, "completion_length": 2104.1042404174805, "epoch": 0.5577142857142857, "grad_norm": 0.2961161434650421, "kl": 0.1575164794921875, "lambda_div_used": 0.5, "learning_rate": 1.0157821333772304e-07, "loss": 0.0166, "reward": -0.05783984065055847, "reward_advantage_correlation": 0.9999999999999998, "reward_after_mean": -0.05783984065055847, "reward_after_std": 0.7586982138454914, "reward_before_mean": 0.46660646225791425, "reward_before_std": 0.7264276705682278, "reward_change_max": 0.0007160604000091553, "reward_change_mean": -0.5244462713599205, "reward_change_min": -0.9499593526124954, "reward_change_std": 0.38819571398198605, "reward_std": 0.7586982510983944, "rewards/cosine_scaled_reward": -0.14169678711914457, "rewards/format_reward": 0.7500000074505806, "step": 488 }, { "advantage_max": 1.3053287342190742, "advantage_mean": 1.3659398168108794e-08, "advantage_min": -0.6531180515885353, "advantage_std": 0.701459277421236, "completion_length": 3237.187530517578, "epoch": 0.5588571428571428, "grad_norm": 0.876398503780365, "kl": 0.3499755859375, "lambda_div_used": 0.5, "learning_rate": 1.013262614978859e-07, "loss": 0.0806, "reward": -0.1897954777814448, "reward_advantage_correlation": 1.0, "reward_after_mean": -0.1897954777814448, "reward_after_std": 0.7014592699706554, "reward_before_mean": 0.24803152214735746, "reward_before_std": 0.7108192816376686, "reward_change_max": 0.0005951672792434692, "reward_change_mean": -0.437827005982399, "reward_change_min": -0.8422755375504494, "reward_change_std": 0.35589798726141453, "reward_std": 0.7014592848718166, "rewards/cosine_scaled_reward": -0.13640091847628355, "rewards/format_reward": 0.5208333414047956, "step": 489 }, { "advantage_max": 2.04465813934803, "advantage_mean": 2.4835269396561444e-09, "advantage_min": -0.7276268340647221, "advantage_std": 1.0391230322420597, "completion_length": 2317.2500915527344, "epoch": 0.56, "grad_norm": 0.8721774220466614, "kl": 0.17469024658203125, "lambda_div_used": 0.5, "learning_rate": 1.0109617738307911e-07, "loss": 0.0357, "reward": 0.2616567127406597, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.2616567127406597, "reward_after_std": 1.0391229949891567, "reward_before_mean": 0.9548911787569523, "reward_before_std": 0.8656030613929033, "reward_change_max": 0.0004154816269874573, "reward_change_mean": -0.693234434351325, "reward_change_min": -1.169628955423832, "reward_change_std": 0.4441659040749073, "reward_std": 1.039123009890318, "rewards/cosine_scaled_reward": 0.039945571683347225, "rewards/format_reward": 0.8750000037252903, "step": 490 }, { "advantage_max": 1.7204342857003212, "advantage_mean": -2.483526828633842e-09, "advantage_min": -0.8566758520901203, "advantage_std": 0.9151424802839756, "completion_length": 2481.4375762939453, "epoch": 0.5611428571428572, "grad_norm": 1.3411248922348022, "kl": 0.231109619140625, "lambda_div_used": 0.5, "learning_rate": 1.0088797220727779e-07, "loss": 0.0341, "reward": 0.4715647688135505, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.4715647688135505, "reward_after_std": 0.9151424951851368, "reward_before_mean": 1.3851352967321873, "reward_before_std": 0.795163257047534, "reward_change_max": 0.0, "reward_change_mean": -0.9135705418884754, "reward_change_min": -1.4265716075897217, "reward_change_std": 0.580524630844593, "reward_std": 0.9151425138115883, "rewards/cosine_scaled_reward": 0.3071509785950184, "rewards/format_reward": 0.7708333507180214, "step": 491 }, { "advantage_max": 1.320159673690796, "advantage_mean": -8.071462720415923e-09, "advantage_min": -0.6564004346728325, "advantage_std": 0.710913211107254, "completion_length": 2465.729232788086, "epoch": 0.5622857142857143, "grad_norm": 0.4086810350418091, "kl": 0.2469635009765625, "lambda_div_used": 0.5, "learning_rate": 1.0070165611810855e-07, "loss": 0.0104, "reward": 0.11124568968079984, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.11124568968079984, "reward_after_std": 0.7109132148325443, "reward_before_mean": 0.7901245001703501, "reward_before_std": 0.6355638056993484, "reward_change_max": 0.0, "reward_change_mean": -0.6788788326084614, "reward_change_min": -1.1075879484415054, "reward_change_std": 0.44137519784271717, "reward_std": 0.7109132558107376, "rewards/cosine_scaled_reward": -0.0007710885256528854, "rewards/format_reward": 0.791666679084301, "step": 492 }, { "advantage_max": 2.035825029015541, "advantage_mean": -3.725290298461914e-09, "advantage_min": -0.8803831040859222, "advantage_std": 1.061902403831482, "completion_length": 2251.6459045410156, "epoch": 0.5634285714285714, "grad_norm": 1.2623525857925415, "kl": 0.1896820068359375, "lambda_div_used": 0.5, "learning_rate": 1.005372381963547e-07, "loss": -0.0228, "reward": 0.39003518410027027, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.39003518410027027, "reward_after_std": 1.061902403831482, "reward_before_mean": 1.1865560039877892, "reward_before_std": 0.9536248818039894, "reward_change_max": 0.0007629022002220154, "reward_change_mean": -0.7965207956731319, "reward_change_min": -1.3428436070680618, "reward_change_std": 0.5259374044835567, "reward_std": 1.0619024187326431, "rewards/cosine_scaled_reward": 0.11411132011562586, "rewards/format_reward": 0.9583333432674408, "step": 493 }, { "advantage_max": 1.9328140318393707, "advantage_mean": -4.96705393482344e-09, "advantage_min": -0.8767299056053162, "advantage_std": 1.0182598046958447, "completion_length": 1785.6250610351562, "epoch": 0.5645714285714286, "grad_norm": 1.4499907493591309, "kl": 0.122161865234375, "lambda_div_used": 0.5, "learning_rate": 1.0039472645551372e-07, "loss": 0.0295, "reward": 0.3418647423386574, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.3418647423386574, "reward_after_std": 1.018259834498167, "reward_before_mean": 1.1130484715104103, "reward_before_std": 0.9386914111673832, "reward_change_max": 0.0, "reward_change_mean": -0.7711836881935596, "reward_change_min": -1.3634328842163086, "reward_change_std": 0.525868522003293, "reward_std": 1.0182598493993282, "rewards/cosine_scaled_reward": 0.09819087269715965, "rewards/format_reward": 0.916666679084301, "step": 494 }, { "advantage_max": 1.592846192419529, "advantage_mean": -7.450581041013038e-09, "advantage_min": -0.6558383330702782, "advantage_std": 0.831727247685194, "completion_length": 2725.3333587646484, "epoch": 0.5657142857142857, "grad_norm": 0.35111257433891296, "kl": 0.252655029296875, "lambda_div_used": 0.5, "learning_rate": 1.002741278414069e-07, "loss": 0.0314, "reward": 0.0912869069725275, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.0912869069725275, "reward_after_std": 0.8317272700369358, "reward_before_mean": 0.720516414847225, "reward_before_std": 0.7226469293236732, "reward_change_max": 0.0018553584814071655, "reward_change_mean": -0.6292295139282942, "reward_change_min": -1.1326270997524261, "reward_change_std": 0.4294526055455208, "reward_std": 0.8317273035645485, "rewards/cosine_scaled_reward": 0.01650819112546742, "rewards/format_reward": 0.6875000055879354, "step": 495 }, { "advantage_max": 1.5743544548749924, "advantage_mean": -9.313226467799751e-09, "advantage_min": -0.7295497246086597, "advantage_std": 0.8451073318719864, "completion_length": 1919.1875228881836, "epoch": 0.5668571428571428, "grad_norm": 0.410314679145813, "kl": 0.2297515869140625, "lambda_div_used": 0.5, "learning_rate": 1.0017544823184055e-07, "loss": 0.0128, "reward": 0.12698657670989633, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.12698657670989633, "reward_after_std": 0.8451073467731476, "reward_before_mean": 0.7774940053932369, "reward_before_std": 0.7901693303138018, "reward_change_max": 0.0, "reward_change_mean": -0.6505074352025986, "reward_change_min": -1.152572087943554, "reward_change_std": 0.47371556237339973, "reward_std": 0.8451073616743088, "rewards/cosine_scaled_reward": 0.003330339677631855, "rewards/format_reward": 0.7708333469927311, "step": 496 }, { "advantage_max": 1.628432959318161, "advantage_mean": -2.4835269396561444e-09, "advantage_min": -0.7225891537964344, "advantage_std": 0.8677199482917786, "completion_length": 2582.0625762939453, "epoch": 0.568, "grad_norm": 0.6326491236686707, "kl": 0.25273895263671875, "lambda_div_used": 0.5, "learning_rate": 1.0009869243631952e-07, "loss": 0.0302, "reward": 0.22483675926923752, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.22483675926923752, "reward_after_std": 0.8677199482917786, "reward_before_mean": 0.964297803118825, "reward_before_std": 0.7687317673116922, "reward_change_max": 0.0022812560200691223, "reward_change_mean": -0.7394610624760389, "reward_change_min": -1.3346537351608276, "reward_change_std": 0.5281522907316685, "reward_std": 0.8677199706435204, "rewards/cosine_scaled_reward": 0.12798223458230495, "rewards/format_reward": 0.7083333395421505, "step": 497 }, { "advantage_max": 1.3664054870605469, "advantage_mean": 2.4835269396561444e-09, "advantage_min": -0.7810567021369934, "advantage_std": 0.7454097159206867, "completion_length": 2571.291732788086, "epoch": 0.5691428571428572, "grad_norm": 0.7533896565437317, "kl": 0.250091552734375, "lambda_div_used": 0.5, "learning_rate": 1.000438641958131e-07, "loss": 0.026, "reward": 0.13466812949627638, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.13466812949627638, "reward_after_std": 0.7454097084701061, "reward_before_mean": 0.8310234602540731, "reward_before_std": 0.7226423937827349, "reward_change_max": 0.0, "reward_change_mean": -0.6963553391396999, "reward_change_min": -1.16697296500206, "reward_change_std": 0.4730456341058016, "reward_std": 0.7454097121953964, "rewards/cosine_scaled_reward": -0.0428216177970171, "rewards/format_reward": 0.916666679084301, "step": 498 }, { "advantage_max": 2.1150874942541122, "advantage_mean": -1.7384688355548406e-08, "advantage_min": -0.8328440636396408, "advantage_std": 1.0995317697525024, "completion_length": 2640.3750762939453, "epoch": 0.5702857142857143, "grad_norm": 0.7087387442588806, "kl": 0.32720947265625, "lambda_div_used": 0.5, "learning_rate": 1.0001096618257236e-07, "loss": 0.0227, "reward": 0.1912521708291024, "reward_advantage_correlation": 1.0, "reward_after_mean": 0.1912521708291024, "reward_after_std": 1.0995317697525024, "reward_before_mean": 0.8074039425700903, "reward_before_std": 1.0485225953161716, "reward_change_max": 0.00030355900526046753, "reward_change_mean": -0.6161517985165119, "reward_change_min": -1.195886768400669, "reward_change_std": 0.46653415262699127, "reward_std": 1.099531814455986, "rewards/cosine_scaled_reward": 0.007868630811572075, "rewards/format_reward": 0.791666679084301, "step": 499 }, { "advantage_max": 1.8856936693191528, "advantage_mean": 2.4835269396561444e-09, "advantage_min": -0.7711785212159157, "advantage_std": 0.9831021875143051, "completion_length": 2979.229248046875, "epoch": 0.5714285714285714, "grad_norm": 0.687682569026947, "kl": 0.3313140869140625, "lambda_div_used": 0.5, "learning_rate": 1e-07, "loss": 0.0507, "reward": 0.009953925851732492, "reward_advantage_correlation": 0.9999999999999999, "reward_after_mean": 0.009953925851732492, "reward_after_std": 0.9831021577119827, "reward_before_mean": 0.5228100651875138, "reward_before_std": 0.9547398835420609, "reward_change_max": 0.0015029683709144592, "reward_change_mean": -0.5128561519086361, "reward_change_min": -1.1185838133096695, "reward_change_std": 0.4156857579946518, "reward_std": 0.9831021949648857, "rewards/cosine_scaled_reward": -0.07192830881103873, "rewards/format_reward": 0.6666666846722364, "step": 500 }, { "epoch": 0.5714285714285714, "step": 500, "total_flos": 0.0, "train_loss": 0.0024257735135033726, "train_runtime": 9466.1521, "train_samples_per_second": 2.535, "train_steps_per_second": 0.053 } ], "logging_steps": 1, "max_steps": 500, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 6, "trial_name": null, "trial_params": null }